From 3c5b46d3ca41ee6df5092845de647e1b32cb6fb6 Mon Sep 17 00:00:00 2001 From: Vishwanath Narasimhan Date: Wed, 1 Aug 2018 16:54:33 -0700 Subject: [PATCH 001/160] Updatng release history --- README.md | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 18e50ebe3..a822f6f97 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,35 @@ -# Docker Monitoring Agent for OMI Server +# AKS Container Health monitoring -### Code of Conduct +## Code of Conduct This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. + +## Release History + +### 7/31/2018 - Version microsoft/oms:ciprod07312018 +- Changes for node lost scenario (roll-up pod & container statuses as Unknown) +- Discover unscheduled pods +- KubeNodeInventory - delimit multiple true node conditions for node status +- UTF Encoding support for container logs +- Container environment variable truncated to 200K +- Handle json parsing errors for OMI provider for docker +- Test mode enablement for ACS-engine testing +- Latest OMS agent (1.6.0-163) +- Latest OMI (1.4.2.5) + + +### 6/7/2018 - Version microsoft/oms:ciprod06072018 +- Remove node-0 dependency +- Remove passing WSID & Key as environment variables and pass them as kubernetes secret (for non-AKS; we already pass them as secret for AKS) +- Please note that if you are manually deploying thru yaml you need to - +- Provide workspaceid & key as base64 encoded strings with in double quotes (.yaml has comments to do so as well) +- Provide cluster name twice (for each container – daemonset & replicaset) + +### 5/8/2018 - Version microsoft/oms:ciprod05082018 +- Kubernetes RBAC enablement +- Latest released omsagent (1.6.0-42) +- Bug 
fix so that we do not collect kube-system namespace container logs when kube api calls fail occasionally (Bug #215107) +- .yaml changes (for RBAC) From d31f5889ec2f9ff6981efc72f2166b0430bffae9 Mon Sep 17 00:00:00 2001 From: rashmy Date: Wed, 1 Aug 2018 16:52:40 -0700 Subject: [PATCH 002/160] fixing the plugin logs for emit stream --- source/code/plugin/in_cadvisor_perf.rb | 4 ++-- source/code/plugin/in_kube_nodes.rb | 7 ++++--- source/code/plugin/in_kube_podinventory.rb | 3 ++- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/source/code/plugin/in_cadvisor_perf.rb b/source/code/plugin/in_cadvisor_perf.rb index 01f2fa9f4..2e28650f6 100644 --- a/source/code/plugin/in_cadvisor_perf.rb +++ b/source/code/plugin/in_cadvisor_perf.rb @@ -55,10 +55,10 @@ def enumerate() end router.emit_stream(@tag, eventStream) if eventStream - if (ENV['ISTEST'] == true && eventStream.count > 0) + @@istestvar = ENV['ISTEST'] + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp('true') == 0 && eventStream.count > 0) $log.info("in_cadvisor_perf::emit-stream : Success @ #{Time.now.utc.iso8601}") end - rescue => errorStr $log.warn "Failed to retrieve cadvisor metric data: #{errorStr}" $log.debug_backtrace(errorStr.backtrace) diff --git a/source/code/plugin/in_kube_nodes.rb b/source/code/plugin/in_kube_nodes.rb index 473978cbc..6cbad0897 100644 --- a/source/code/plugin/in_kube_nodes.rb +++ b/source/code/plugin/in_kube_nodes.rb @@ -99,9 +99,10 @@ def enumerate eventStream.add(emitTime, wrapper) if wrapper end router.emit_stream(@tag, eventStream) if eventStream - if (ENV['ISTEST'] == true && eventStream.count > 0) - $log.info("in_kube_nodeinventory::emit-stream : Success @ #{Time.now.utc.iso8601}") - end + @@istestvar = ENV['ISTEST'] + if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp('true') == 0 && eventStream.count > 0) + $log.info("in_kube_nodeinventory::emit-stream : Success @ #{Time.now.utc.iso8601}") + end end rescue => errorStr $log.warn "Failed to retrieve node inventory: #{errorStr}" diff --git a/source/code/plugin/in_kube_podinventory.rb b/source/code/plugin/in_kube_podinventory.rb index a96a0b207..656d1aa48 100644 --- a/source/code/plugin/in_kube_podinventory.rb +++ b/source/code/plugin/in_kube_podinventory.rb @@ -190,7 +190,8 @@ def parse_and_emit_records(podInventory, serviceList) end end #podInventory block end router.emit_stream(@tag, eventStream) if eventStream - if (ENV['ISTEST'] == true && eventStream.count > 0) + @@istestvar = ENV['ISTEST'] + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp('true') == 0 && eventStream.count > 0) $log.info("in_kube_podinventory::emit-stream : Success @ #{Time.now.utc.iso8601}") end rescue => errorStr From 11fd5f6d4e3dd0b4fe57c8f4a551d1da4e8fa41f Mon Sep 17 00:00:00 2001 From: rashmy Date: Sun, 5 Aug 2018 00:37:52 -0700 Subject: [PATCH 003/160] updating log message --- source/code/plugin/in_cadvisor_perf.rb | 2 +- source/code/plugin/in_kube_nodes.rb | 2 +- source/code/plugin/in_kube_podinventory.rb | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/source/code/plugin/in_cadvisor_perf.rb b/source/code/plugin/in_cadvisor_perf.rb index 2e28650f6..5b551f74e 100644 --- a/source/code/plugin/in_cadvisor_perf.rb +++ b/source/code/plugin/in_cadvisor_perf.rb @@ -57,7 +57,7 @@ def enumerate() router.emit_stream(@tag, eventStream) if eventStream @@istestvar = ENV['ISTEST'] if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp('true') == 0 && eventStream.count > 0) - $log.info("in_cadvisor_perf::emit-stream : Success @ #{Time.now.utc.iso8601}") + $log.info("cAdvisorPerfEmitStreamSuccess @ #{Time.now.utc.iso8601}") end rescue => errorStr $log.warn "Failed to retrieve cadvisor metric data: #{errorStr}" diff --git a/source/code/plugin/in_kube_nodes.rb b/source/code/plugin/in_kube_nodes.rb index 6cbad0897..edbbdd37f 100644 --- a/source/code/plugin/in_kube_nodes.rb +++ b/source/code/plugin/in_kube_nodes.rb @@ -101,7 +101,7 @@ def enumerate router.emit_stream(@tag, eventStream) if eventStream @@istestvar = ENV['ISTEST'] if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp('true') == 0 && eventStream.count > 0) - $log.info("in_kube_nodeinventory::emit-stream : Success @ #{Time.now.utc.iso8601}") + $log.info("kubeNodeInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") end end rescue => errorStr diff --git a/source/code/plugin/in_kube_podinventory.rb b/source/code/plugin/in_kube_podinventory.rb index 656d1aa48..f478705f6 100644 --- a/source/code/plugin/in_kube_podinventory.rb +++ b/source/code/plugin/in_kube_podinventory.rb @@ -192,7 +192,7 @@ def parse_and_emit_records(podInventory, serviceList) router.emit_stream(@tag, eventStream) if eventStream @@istestvar = ENV['ISTEST'] if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp('true') == 0 && eventStream.count > 0) - $log.info("in_kube_podinventory::emit-stream : Success @ #{Time.now.utc.iso8601}") + $log.info("kubePodInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") end rescue => errorStr $log.warn "Failed in parse_and_emit_record pod inventory: #{errorStr}" From 87a9cf8ddb77f789a805b433ca4ff92556f7d8a0 Mon Sep 17 00:00:00 2001 From: r-dilip Date: Thu, 16 Aug 2018 11:58:10 -0700 Subject: [PATCH 004/160] Remove Log Processing from fluentd configuration --- installer/conf/container.conf | 32 -- .../code/plugin/containerlogtailfilereader.rb | 396 ------------------ source/code/plugin/filter_container_log.rb | 42 -- 3 files changed, 470 deletions(-) delete mode 100644 source/code/plugin/containerlogtailfilereader.rb delete mode 100644 source/code/plugin/filter_container_log.rb diff --git a/installer/conf/container.conf b/installer/conf/container.conf index a20fdbe5a..9eaed9b47 100755 --- a/installer/conf/container.conf +++ b/installer/conf/container.conf @@ -50,18 +50,6 @@ ] -# Container log -# Example line which matches the format: -# {"log"=>"Test 9th January\n", "stream"=>"stdout", "time"=>"2018-01-09T23:14:39.273429353Z", "ContainerID"=>"ee1ec26aa974af81b21fff24cef8ec78bf7ac1558b5de6f1eb1a5b28ecd6d559", "Image"=>"ubuntu", "Name"=>"determined_wilson", "SourceSystem"=>"Containers"} -# NOTE: The LogEntryTimeStamp is just being appended in the begining of the LogEntry field. 
This is the actual time the log was generated and the TimeGenerated field in Kusto is different - - type containerlog_sudo_tail - pos_file /var/opt/microsoft/docker-cimprov/state/ContainerLogFile.pos.log - tag oms.container.log - format /\"log\"=>\"(?.*)", \"stream\"=>\"(?.*)", \"time\"=>\"(?.*)", \"ContainerID\"=>\"(?.*)", \"Image\"=>\"(?.*)", \"Name\"=>\"(?.*)", \"SourceSystem\"=>\"(?.*)"}/ - run_interval 60s - - # Container host inventory type omi @@ -95,11 +83,6 @@ type filter_container -# Seperate filter for container log - - type filter_container_log - - type out_oms_api log_level debug @@ -152,21 +135,6 @@ max_retry_wait 9m - - type out_oms - log_level debug - num_threads 5 - buffer_chunk_limit 20m - buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_log*.buffer - buffer_queue_limit 20 - buffer_queue_full_action drop_oldest_chunk - flush_interval 20s - retry_limit 10 - retry_wait 15s - max_retry_wait 9m - - type out_oms log_level info diff --git a/source/code/plugin/containerlogtailfilereader.rb b/source/code/plugin/containerlogtailfilereader.rb deleted file mode 100644 index 2d55b1d73..000000000 --- a/source/code/plugin/containerlogtailfilereader.rb +++ /dev/null @@ -1,396 +0,0 @@ - -require 'optparse' -require 'json' -require 'logger' -require_relative 'omslog' -require 'fluent/filter' - -module ContainerLogTailscript - - class ContainerLogNewTail - def initialize(paths) - @paths = paths - @tails = {} - @pos_file = $options[:pos_file] - @read_from_head = $options[:read_from_head] - @pf = nil - @pf_file = nil - - @log = Logger.new(STDERR) - @log.formatter = proc do |severity, time, progname, msg| - "#{severity} #{msg}\n" - end - end - - attr_reader :paths - - def start - start_watchers(@paths) unless @paths.empty? 
- end - - def shutdown - @pf_file.close if @pf_file - end - - def setup_watcher(path, pe) - tw = TailWatcher.new(path, pe, @read_from_head, @log, &method(:receive_lines)) - tw.on_notify - tw - end - - def start_watchers(paths) - if @pos_file - @pf_file = File.open(@pos_file, File::RDWR|File::CREAT) - @pf_file.sync = true - @pf = PositionFile.parse(@pf_file) - end - paths.each { |path| - pe = nil - if @pf - pe = @pf[path] #pe is FilePositionEntry instance - if pe.read_inode.zero? - begin - pe.update(File::Stat.new(path).ino, 0) - rescue Errno::ENOENT - @log.warn "#{path} not found. Continuing without tailing it." - end - end - end - - @tails[path] = setup_watcher(path, pe) - } - end - - def receive_lines(lines, tail_watcher) - unless lines.empty? - puts lines - end - return true - end - - class TailWatcher - def initialize(path, pe, read_from_head, log, &receive_lines) - @path = path - @pe = pe || MemoryPositionEntry.new - @read_from_head = read_from_head - @log = log - @receive_lines = receive_lines - @rotate_handler = RotateHandler.new(path, log, &method(:on_rotate)) - @io_handler = nil - @containerIDFilePath = "/var/opt/microsoft/docker-cimprov/state/ContainerInventory/" - end - - attr_reader :path - - def wrap_receive_lines(lines) - newLines = [] - containerID = @path.split('/').last.chomp('-json.log') - containerInspectInformation = @containerIDFilePath + containerID - tempContainerInfo = {} - begin - File.open(containerInspectInformation) { |f| tempContainerInfo = JSON.parse(f.readline)} - lines.each { |line| - unless line.empty? 
- newLine = {} - newLine = JSON.parse(line) - newLine["ContainerID"] = containerID - newLine["Image"] = tempContainerInfo["Image"] - newLine["Name"] = tempContainerInfo["ElementName"] - newLine["SourceSystem"] = "Containers" - newLines.push(newLine) - end - } - rescue Exception => e - #File doesn't exist or error in reading the data - @log.error "Caught exception when opening file -> #{e}" - end - @receive_lines.call(newLines, self) - end - - def on_notify - @rotate_handler.on_notify if @rotate_handler - return unless @io_handler - @io_handler.on_notify - end - - def on_rotate(io) - if io - # first time - stat = io.stat - fsize = stat.size - inode = stat.ino - - last_inode = @pe.read_inode - if @read_from_head - pos = 0 - @pe.update(inode, pos) - elsif inode == last_inode - # rotated file has the same inode number as the pos_file. - # seek to the saved position - pos = @pe.read_pos - elsif last_inode != 0 - # read data from the head of the rotated file. - pos = 0 - @pe.update(inode, pos) - else - # this is the first MemoryPositionEntry for the first time fluentd started. - # seeks to the end of the file to know where to start tailing - pos = fsize - @pe.update(inode, pos) - end - io.seek(pos) - @io_handler = IOHandler.new(io, @pe, @log, &method(:wrap_receive_lines)) - else - @io_handler = NullIOHandler.new - end - end - - class IOHandler - def initialize(io, pe, log, &receive_lines) - @log = log - @io = io - @pe = pe - @log = log - @read_lines_limit = 100 - @receive_lines = receive_lines - @buffer = ''.force_encoding('ASCII-8BIT') - @iobuf = ''.force_encoding('ASCII-8BIT') - @lines = [] - end - - attr_reader :io - - def on_notify - begin - read_more = false - if @lines.empty? - begin - while true - if @buffer.empty? 
- @io.readpartial(512, @buffer) - else - @buffer << @io.readpartial(512, @iobuf) - end - while line = @buffer.slice!(/.*?\n/m) - @lines << line - end - if @lines.size >= @read_lines_limit - # not to use too much memory in case the file is very large - read_more = true - break - end - end - rescue EOFError - end - end - - unless @lines.empty? - if @receive_lines.call(@lines) - @pe.update_pos(@io.pos - @buffer.bytesize) - @lines.clear - else - read_more = false - end - end - end while read_more - - rescue - @log.error "#{$!.to_s}" - close - end - - def close - @io.close unless @io.closed? - end - end - - class NullIOHandler - def initialize - end - - def io - end - - def on_notify - end - - def close - end - end - - class RotateHandler - def initialize(path, log, &on_rotate) - @path = path - @inode = nil - @fsize = -1 # first - @on_rotate = on_rotate - @log = log - end - - def on_notify - begin - stat = File.stat(@path) #returns a File::Stat object for the file named @path - inode = stat.ino - fsize = stat.size - rescue Errno::ENOENT - # moved or deleted - inode = nil - fsize = 0 - end - - begin - if @inode != inode || fsize < @fsize - # rotated or truncated - begin - io = File.open(@path) - rescue Errno::ENOENT - end - @on_rotate.call(io) - end - @inode = inode - @fsize = fsize - end - - rescue - @log.error "#{$!.to_s}" - end - end - end - - - class PositionFile - UNWATCHED_POSITION = 0xffffffffffffffff - - def initialize(file, map, last_pos) - @file = file - @map = map - @last_pos = last_pos - end - - def [](path) - if m = @map[path] - return m - end - - @file.pos = @last_pos - @file.write path - @file.write "\t" - seek = @file.pos - @file.write "0000000000000000\t0000000000000000\n" - @last_pos = @file.pos - - @map[path] = FilePositionEntry.new(@file, seek) - end - - def self.parse(file) - compact(file) - - map = {} - file.pos = 0 - file.each_line {|line| - m = /^([^\t]+)\t([0-9a-fA-F]+)\t([0-9a-fA-F]+)/.match(line) - next unless m - path = m[1] - seek = file.pos 
- line.bytesize + path.bytesize + 1 - map[path] = FilePositionEntry.new(file, seek) - } - new(file, map, file.pos) - end - - # Clean up unwatched file entries - def self.compact(file) - file.pos = 0 - existent_entries = file.each_line.map { |line| - m = /^([^\t]+)\t([0-9a-fA-F]+)\t([0-9a-fA-F]+)/.match(line) - next unless m - path = m[1] - pos = m[2].to_i(16) - ino = m[3].to_i(16) - # 32bit inode converted to 64bit at this phase - pos == UNWATCHED_POSITION ? nil : ("%s\t%016x\t%016x\n" % [path, pos, ino]) - }.compact - - file.pos = 0 - file.truncate(0) - file.write(existent_entries.join) - end - end - - # pos inode - # ffffffffffffffff\tffffffffffffffff\n - class FilePositionEntry - POS_SIZE = 16 - INO_OFFSET = 17 - INO_SIZE = 16 - LN_OFFSET = 33 - SIZE = 34 - - def initialize(file, seek) - @file = file - @seek = seek - end - - def update(ino, pos) - @file.pos = @seek - @file.write "%016x\t%016x" % [pos, ino] - end - - def update_pos(pos) - @file.pos = @seek - @file.write "%016x" % pos - end - - def read_inode - @file.pos = @seek + INO_OFFSET - raw = @file.read(INO_SIZE) - raw ? raw.to_i(16) : 0 - end - - def read_pos - @file.pos = @seek - raw = @file.read(POS_SIZE) - raw ? raw.to_i(16) : 0 - end - end - - class MemoryPositionEntry - def initialize - @pos = 0 - @inode = 0 - end - - def update(ino, pos) - @inode = ino - @pos = pos - end - - def update_pos(pos) - @pos = pos - end - - def read_pos - @pos - end - - def read_inode - @inode - end - end - end -end - -if __FILE__ == $0 - $options = {:read_from_head => false} - OptionParser.new do |opts| - opts.on("-p", "--posfile [POSFILE]") do |p| - $options[:pos_file] = p - end - opts.on("-h", "--[no-]readfromhead") do |h| - $options[:read_from_head] = h - end - end.parse! 
- a = ContainerLogTailscript::ContainerLogNewTail.new(ARGV) - a.start - a.shutdown -end - diff --git a/source/code/plugin/filter_container_log.rb b/source/code/plugin/filter_container_log.rb deleted file mode 100644 index 21e146a35..000000000 --- a/source/code/plugin/filter_container_log.rb +++ /dev/null @@ -1,42 +0,0 @@ -# frozen_string_literal: true - -require 'fluent/filter' - -module Fluent - require 'logger' - class PassThruFilter < Filter - Fluent::Plugin.register_filter('filter_container_log', self) - - def configure(conf) - super - end - - def start - super - @hostname = OMS::Common.get_hostname or "Unknown host" - end - - def shutdown - super - end - - def filter(tag, time, record) - begin - #Try to force utf-8 encoding on the string so that all characters can flow through to - #$log.info "before : #{record['LogEntry']}" - record['LogEntry'].force_encoding('UTF-8') - rescue - $log.error "Failed to convert record['LogEntry'] : '#{record['LogEntry']}' to UTF-8 using force_encoding." 
- $log.error "Current string encoding for record['LogEntry'] is #{record['LogEntry'].encoding}" - end - - record['Computer'] = @hostname - wrapper = { - "DataType"=>"CONTAINER_LOG_BLOB", - "IPName"=>"Containers", - "DataItems"=>[record.each{|k,v| record[k]=v}] - } - wrapper - end - end -end From 308be41fe87202ee6e289cc9c952a24910eed133 Mon Sep 17 00:00:00 2001 From: r-dilip Date: Thu, 16 Aug 2018 12:01:14 -0700 Subject: [PATCH 005/160] Remove plugin references from base_container.data --- installer/datafiles/base_container.data | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/installer/datafiles/base_container.data b/installer/datafiles/base_container.data index c49a8d1d0..ec0728c01 100644 --- a/installer/datafiles/base_container.data +++ b/installer/datafiles/base_container.data @@ -23,14 +23,11 @@ MAINTAINER: 'Microsoft Corporation' /opt/microsoft/omsagent/plugin/filter_docker_log.rb; source/code/plugin/filter_docker_log.rb; 644; root; root /opt/microsoft/omsagent/plugin/filter_container.rb; source/code/plugin/filter_container.rb; 644; root; root -/opt/microsoft/omsagent/plugin/filter_container_log.rb; source/code/plugin/filter_container_log.rb; 644; root; root /opt/microsoft/omsagent/plugin/in_kube_podinventory.rb; source/code/plugin/in_kube_podinventory.rb; 644; root; root /opt/microsoft/omsagent/plugin/in_kube_events.rb; source/code/plugin/in_kube_events.rb; 644; root; root /opt/microsoft/omsagent/plugin/in_kube_logs.rb; source/code/plugin/in_kube_logs.rb; 644; root; root /opt/microsoft/omsagent/plugin/KubernetesApiClient.rb; source/code/plugin/KubernetesApiClient.rb; 644; root; root -/opt/microsoft/omsagent/plugin/in_containerlog_sudo_tail.rb; source/code/plugin/in_containerlog_sudo_tail.rb; 644; root; root -/opt/microsoft/omsagent/plugin/containerlogtailfilereader.rb; source/code/plugin/containerlogtailfilereader.rb; 744; root; root /etc/opt/microsoft/docker-cimprov/container.conf; installer/conf/container.conf; 644; root; root @@ -88,15 +85,6 
@@ WriteInstallInfo() { } WriteInstallInfo -#Setup sudo permission for containerlogtailfilereader -if [ -z $(cat /etc/sudoers.d/omsagent | grep /containerlogtailfilereader.rb) ] -then - chmod +w /etc/sudoers.d/omsagent - echo "#run containerlogtailfilereader.rb for docker-provider" >> /etc/sudoers.d/omsagent - echo "omsagent ALL=(ALL) NOPASSWD: /opt/microsoft/omsagent/ruby/bin/ruby /opt/microsoft/omsagent/plugin/containerlogtailfilereader.rb *" >> /etc/sudoers.d/omsagent - chmod 440 /etc/sudoers.d/omsagent -fi - # Get the state file in place with proper permissions touch /var/opt/microsoft/docker-cimprov/state/LastEventQueryTime.txt chmod 644 /var/opt/microsoft/docker-cimprov/state/LastEventQueryTime.txt From bcd1a3ff040eb25218cfffd5028394f7594075c7 Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Fri, 14 Sep 2018 10:46:55 -0700 Subject: [PATCH 006/160] Dilipr/fluent bit log processing (#126) * Build out_oms.so and include in docker-cimprov package * Adding fluent-bit-config file to base container * PR Feedback * Adding out_oms.conf to base_container.data * PR Feedback * Making the critical section as small as possible * PR Feedback * Fixing the newline bug for Computer, and changing containerId to Id --- build/Makefile | 829 ++++++++++++------------ installer/conf/out_oms.conf | 6 + installer/conf/td-agent-bit.conf | 35 + installer/datafiles/base_container.data | 7 +- source/code/go/src/plugins/Makefile | 20 + source/code/go/src/plugins/glide.lock | 209 ++++++ source/code/go/src/plugins/glide.yaml | 15 + source/code/go/src/plugins/oms.go | 359 ++++++++++ source/code/go/src/plugins/out_oms.go | 57 ++ source/code/go/src/plugins/utils.go | 67 ++ 10 files changed, 1194 insertions(+), 410 deletions(-) create mode 100644 installer/conf/out_oms.conf create mode 100644 installer/conf/td-agent-bit.conf create mode 100644 source/code/go/src/plugins/Makefile create mode 100644 source/code/go/src/plugins/glide.lock create mode 100644 
source/code/go/src/plugins/glide.yaml create mode 100644 source/code/go/src/plugins/oms.go create mode 100644 source/code/go/src/plugins/out_oms.go create mode 100644 source/code/go/src/plugins/utils.go diff --git a/build/Makefile b/build/Makefile index 9586c3b23..b5312cfe3 100644 --- a/build/Makefile +++ b/build/Makefile @@ -1,409 +1,420 @@ -# -*- mode: Makefile; -*- -# Copyright (c) Microsoft Corporation - -BASE_DIR := $(subst /build,,$(PWD)) -OMI_ROOT := $(shell cd ../../omi/Unix; pwd -P) -SCXPAL_DIR := $(shell cd ../../pal; pwd -P) - -PF_POSIX := 1 -include $(SCXPAL_DIR)/build/config.mak -include $(BASE_DIR)/build/config.mak -include $(SCXPAL_DIR)/build/Makefile.pal - -ifndef ENABLE_DEBUG -$(error "ENABLE_DEBUG is not set. Please re-run configure") -endif - -# Include the version file -include ../../docker.version - -ifndef CONTAINER_BUILDVERSION_STATUS -$(error "Is docker.version missing? Please re-run configure") -endif - -SOURCE_DIR := $(BASE_DIR)/source/code -TEST_DIR := $(BASE_DIR)/test/code - -PROVIDER_DIR := $(SOURCE_DIR)/providers -PROVIDER_TEST_DIR := $(TEST_DIR)/providers -PAL_INCLUDE_DIR := $(SCXPAL_DIR)/source/code/include -PAL_TESTUTILS_DIR := $(SCXPAL_DIR)/test/code/testutils - -INTERMEDIATE_DIR := $(BASE_DIR)/intermediate/$(BUILD_CONFIGURATION) -INTERMEDIATE_TESTFILES := $(INTERMEDIATE_DIR)/testfiles -TARGET_DIR := $(BASE_DIR)/target/$(BUILD_CONFIGURATION) -PROVIDER_LIBRARY := $(INTERMEDIATE_DIR)/libcontainer.so - -INSTALLER_TMPDIR := $(INTERMEDIATE_DIR)/installer_tmp - -# Include files - -INCLUDE_DEFINES := $(INTERMEDIATE_DIR)/defines.h - -# Compiler flags - -OMI_INCLUDE_FLAGS := -I$(OMI_ROOT)/output/include -PROVIDER_INCLUDE_FLAGS := -I$(PAL_INCLUDE_DIR) -I$(INTERMEDIATE_DIR) - -PROVIDER_TEST_INCLUDE_FLAGS := -Wmissing-include-dirs -Wno-non-virtual-dtor -I$(SCXPAL_DIR)/source/code/include -I$(INTERMEDIATE_DIR) -I$(SCXPAL_DIR)/test/ext/include -I$(OMI_ROOT)/output/include -I$(OMI_ROOT) -I$(OMI_ROOT)/common -I$(SCXPAL_DIR)/test/code/include 
$(PROVIDER_INCLUDE_FLAGS) -I$(PROVIDER_DIR) - -ifeq ($(ENABLE_DEBUG),1) -PROV_DEBUG_FLAGS := -g -endif - -COMPILE_FLAGS := $(PROV_DEBUG_FLAGS) -D_REENTRANT -fstack-protector-all -Wall -fno-nonansi-builtins -Woverloaded-virtual -Wformat -Wformat-security -Wcast-align -Wswitch-enum -Wshadow -Wwrite-strings -Wredundant-decls -Wcast-qual -fPIC -PROVIDER_COMPILE_FLAGS := $(COMPILE_FLAGS) - -LINK_LIBRARIES := -Wl,-rpath=/opt/omi/lib -L$(OMI_ROOT)/output/lib -lmicxx -L$(SCXPAL_TARGET_DIR) -lscxcore -lUtil -lscxassertabort -lrt -luuid -PROVIDER_TEST_LINK_LIBRARIES := -lbase -lpal -L$(SCXPAL_TARGET_DIR) -lscxcore $(SCXPAL_DIR)/test/ext/lib/linux/$(ARCH)/cppunit/libcppunit.a -lpthread -lrt -luuid - -SHARED_FLAGS := -shared - -# Support for installbuilder - -STAGING_DIR := $(INTERMEDIATE_DIR)/staging - -ifeq ($(ULINUX),1) - # For consistency, the architecture should be i686 (for x86) and x86_64 (for x64) - DOCKER_ARCH := $(shell echo $(PF_ARCH) | sed -e 's/x86$$/i686/' -e 's/x64$$/x86_64/') - OUTPUT_PACKAGE_PREFIX=docker-cimprov-$(CONTAINER_BUILDVERSION_MAJOR).$(CONTAINER_BUILDVERSION_MINOR).$(CONTAINER_BUILDVERSION_PATCH)-$(CONTAINER_BUILDVERSION_BUILDNR).universal.$(DOCKER_ARCH) -else - PF_DISTRO_LC := $(shell echo $(PF_DISTRO) | tr A-Z a-z) - OUTPUT_PACKAGE_PREFIX=docker-cimprov-$(CONTAINER_BUILDVERSION_MAJOR).$(CONTAINER_BUILDVERSION_MINOR).$(CONTAINER_BUILDVERSION_PATCH)-$(CONTAINER_BUILDVERSION_BUILDNR).$(PF_DISTRO_LC).$(PF_MAJOR).$(PF_ARCH) -endif - -ifeq ("$(wildcard /usr/bin/dpkg-deb)","") - DPKG_LOCATION="--DPKG_LOCATION=$(SCXPAL_DIR)/installer/InstallBuilder/tools/bin/dpkg-deb-$(PF_ARCH)" -else - DPKG_LOCATION= -endif - -# Support for src_to_obj handling - -INCLUDES = $(OMI_INCLUDE_FLAGS) $(PROVIDER_INCLUDE_FLAGS) -CFLAGS = $(COMPILE_FLAGS) -CXXFLAGS = $(COMPILE_FLAGS) - -#-------------------------------------------------------------------------------- -# Build targets - -ifeq ($(ULINUX),1) -all : $(OMI_ROOT)/output $(SCXPAL_INTERMEDIATE_DIR) PROVIDER_STATUS 
$(PROVIDER_LIBRARY) KIT_STATUS kit -else -all : $(OMI_ROOT)/output $(SCXPAL_INTERMEDIATE_DIR) PROVIDER_STATUS $(PROVIDER_LIBRARY) -endif - -clean : - $(RMDIR) $(BASE_DIR)/build/cppunit_result.* $(BASE_DIR)/build/scxtestrunner.log $(BASE_DIR)/installer/intermediate $(BASE_DIR)/intermediate $(BASE_DIR)/target $(PROVIDER_TEST_DIR)/providertestutils.cpp - -find $(BASE_DIR) -name \*~ -exec rm {} \; - -$(RM) $(TEST_DIR)/providers/TestScriptPath.h - -distclean : clean - $(RM) $(BASE_DIR)/build/config.mak - -make -C $(OMI_ROOT) distclean - -make -C $(SCXPAL_DIR)/build distclean - -$(RMDIR) $(OMI_ROOT)/output* - -$(RM) $(SCXPAL_DIR)/build/config.mak - -$(RM) $(SCXPAL_DIR)/build/Makefile.config_cache - -PROVIDER_STATUS: - @echo "========================= Performing Building provider" - -KIT_STATUS: - @echo "========================= Performing Building provider tests" - -#-------------------------------------------------------------------------------- -# OMI build -# -# Build the OMI distribution -# -# Technically, we should go to build OMI all the time. But I'd rather not spend -# the time doing it here EVERY TIME, when we never normally change OMI. This is -# a good tradeoff (build if not built, otherwise assume all is well). -# -# Doing a 'make clean' in OMI directory will force us to rebuild. - -$(OMI_ROOT)/output : $(OMI_ROOT)/output/lib/libmicxx.so - -$(OMI_ROOT)/output/lib/libmicxx.so : - @echo "========================= Performing Building OMI" - make -C $(OMI_ROOT) -ifeq ($(PERFORM_OMI_MAKEINSTALL),1) - make -C $(OMI_ROOT) install -endif - -#-------------------------------------------------------------------------------- -# PAL build -# -# Build the PAL (Platform Abstraction Layer) -# -# Doing a 'make clean' in PAL directory will force us to rebuild. 
- -$(SCXPAL_INTERMEDIATE_DIR) : - @echo "========================= Performing Building PAL" - make -C $(SCXPAL_DIR)/build - -#================================================================================ -# File depends.h (compiler dependencies) -#================================================================================ - -$(INCLUDE_DEFINES) : $(BASE_DIR)/build/config.mak - -$(MKPATH) $(@D) - @$(ECHO) "Creating $@" - @$(call pf_fwrite,"/*-------------------------------------------------------------------------------", $@) - @$(call pf_fappend," Copyright (C) 2007-2015 Microsoft Corp. ", $@) - @$(call pf_fappend," ", $@) - @$(call pf_fappend,"*/ ", $@) - @$(call pf_fappend,"/** ", $@) - @$(call pf_fappend," \file ", $@) - @$(call pf_fappend," ", $@) - @$(call pf_fappend," \brief Auto generated file containing build definitions ", $@) - @$(call pf_fappend," ", $@) - @$(call pf_fappend," \author Automated Build System ", $@) - @$(call pf_fappend," ", $@) - @$(call pf_fappend," DO NOT EDIT THIS FILE! ", $@) - @$(call pf_fappend," DO NOT CHECK IN THIS FILE! 
", $@) - @$(call pf_fappend,"*/ ", $@) - @$(call pf_fappend,"/*----------------------------------------------------------------------------*/", $@) - @$(call pf_fappend,"#ifndef DEFINES_H ", $@) - @$(call pf_fappend,"#define DEFINES_H ", $@) - @$(call pf_fappend," ", $@) -ifneq ($(PF_DISTRO),) - @$(call pf_fappend," ", $@) - @$(call pf_fappend,"#ifndef PF_DISTRO_$(PF_DISTRO) ", $@) - @$(call pf_fappend,"#define PF_DISTRO_$(PF_DISTRO) ", $@) - @$(call pf_fappend,"#endif ", $@) -endif -ifneq ($(PF_MAJOR),) - @$(call pf_fappend," ", $@) - @$(call pf_fappend,"#ifndef PF_MAJOR ", $@) - @$(call pf_fappend,"#define PF_MAJOR $(PF_MAJOR) ", $@) - @$(call pf_fappend,"#endif ", $@) -endif -ifneq ($(PF_MINOR),) - @$(call pf_fappend," ", $@) - @$(call pf_fappend,"#ifndef PF_MINOR ", $@) - @$(call pf_fappend,"#define PF_MINOR $(PF_MINOR) ", $@) - @$(call pf_fappend,"#endif ", $@) -endif -ifneq ($(ARCH),) - @$(call pf_fappend," ", $@) - @$(call pf_fappend,"#ifndef $(ARCH) ", $@) - @$(call pf_fappend,"#define $(ARCH) ", $@) - @$(call pf_fappend,"#endif ", $@) -endif -ifeq ($(BUILD_TYPE),Debug) - @$(call pf_fappend," ", $@) - @$(call pf_fappend,"#ifndef _DEBUG ", $@) - @$(call pf_fappend,"#define _DEBUG ", $@) - @$(call pf_fappend,"#endif ", $@) -else - @$(call pf_fappend," ", $@) - @$(call pf_fappend,"#ifndef NDEBUG ", $@) - @$(call pf_fappend,"#define NDEBUG ", $@) - @$(call pf_fappend,"#endif ", $@) -endif - @$(call pf_fappend," ", $@) - @$(call pf_fappend,"#endif /* DEFINES_H */ ", $@) - @$(call pf_fappend,"/*----------------------------E-N-D---O-F---F-I-L-E---------------------------*/", $@) - -#================================================================================ -# Internal functions -#================================================================================ - -# Convert a list of src files with absolute paths under BASE_DIR to corresponding -# object files under intermediate directory -# src_to_obj(list_of_cppfiles) -src_to_obj = $(patsubst $(BASE_DIR)%, 
$(INTERMEDIATE_DIR)%, $(patsubst %.c, %.o, $(patsubst %.cpp, %.o, $(1)))) - -# No default rules, please -.SUFFIX: - -# Rule for compiling cpp files in source tree, ouptut in mirrored intermediate dir -$(INTERMEDIATE_DIR)/%.o : $(BASE_DIR)/%.cpp $(INCLUDE_DEFINES) - $(MKPATH) $(@D) - $(CXX) -c $(CXXFLAGS) $(INCLUDES) -I$( $(TEST_DIR)/providers/TestScriptPath.h - -test : TEST_STATUS $(SCXPAL_INTERMEDIATE_DIR) $(INTERMEDIATE_DIR)/testrunner - @echo "========================= Performing container testrun execution" - $(MKPATH) $(INTERMEDIATE_TESTFILES) - $(COPY) $(TEST_DIR)/scripts/createEnv.sh $(TEST_DIR)/scripts/testrun_wrapper $(INTERMEDIATE_TESTFILES) - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$(OMI_ROOT)/output/lib; cd $(INTERMEDIATE_TESTFILES); ./createEnv.sh - cd $(INTERMEDIATE_TESTFILES); ./testrun_wrapper $(INTERMEDIATE_DIR) - -#-------------------------------------------------------------------------------- -# Build the distribution kit -# -# Build the packages via installbuilder -# -# While the "formal build" only builds ULINUX, we may build something else for DEV purposes. -# Assume we ALWAYS build DPKG, but only build RPM if --enable-ulinux is speified in configure. 
- -kit : CONTAINERLIB_FILENAME = libcontainer.so -kit : $(OMI_ROOT)/output $(PROVIDER_LIBRARY) - -ifeq ($(ULINUX),1) - - @echo "========================= Performing Building RPM and DPKG packages" - $(MKPATH) $(INSTALLER_TMPDIR) - sudo $(RMDIR) $(STAGING_DIR) - $(MKPATH) $(INTERMEDIATE_DIR) - python $(SCXPAL_DIR)/installer/InstallBuilder/installbuilder.py \ - --BASE_DIR=$(BASE_DIR) \ - --TARGET_DIR=$(INTERMEDIATE_DIR) \ - --INTERMEDIATE_DIR=$(INSTALLER_TMPDIR) \ - --STAGING_DIR=$(STAGING_DIR) \ - --BUILD_TYPE=$(BUILD_TYPE) \ - --BUILD_CONFIGURATION=$(BUILD_CONFIGURATION) \ - --PFARCH=$(PF_ARCH) \ - --PFDISTRO=$(PF_DISTRO) \ - --PFMAJOR=$(PF_MAJOR) \ - --PFMINOR=$(PF_MINOR) \ - --VERSION=$(CONTAINER_BUILDVERSION_MAJOR).$(CONTAINER_BUILDVERSION_MINOR).$(CONTAINER_BUILDVERSION_PATCH) \ - --RELEASE=$(CONTAINER_BUILDVERSION_BUILDNR) \ - --CONTAINER_BUILD_LIBRARY=$(CONTAINERLIB_FILENAME) \ - --OUTPUTFILE=$(OUTPUT_PACKAGE_PREFIX) \ - --DATAFILE_PATH=$(BASE_DIR)/installer/datafiles \ - base_container.data linux.data linux_rpm.data - - sudo $(RMDIR) $(STAGING_DIR) - $(MKPATH) $(INTERMEDIATE_DIR) - python $(SCXPAL_DIR)/installer/InstallBuilder/installbuilder.py \ - --BASE_DIR=$(BASE_DIR) \ - --TARGET_DIR=$(INTERMEDIATE_DIR) \ - --INTERMEDIATE_DIR=$(INSTALLER_TMPDIR) \ - --STAGING_DIR=$(STAGING_DIR) \ - --BUILD_TYPE=$(BUILD_TYPE) \ - --BUILD_CONFIGURATION=$(BUILD_CONFIGURATION) \ - --PFARCH=$(PF_ARCH) \ - --PFDISTRO=$(PF_DISTRO) \ - --PFMAJOR=$(PF_MAJOR) \ - --PFMINOR=$(PF_MINOR) \ - --VERSION=$(CONTAINER_BUILDVERSION_MAJOR).$(CONTAINER_BUILDVERSION_MINOR).$(CONTAINER_BUILDVERSION_PATCH) \ - --RELEASE=$(CONTAINER_BUILDVERSION_BUILDNR) \ - --CONTAINER_BUILD_LIBRARY=$(CONTAINERLIB_FILENAME) \ - $(DPKG_LOCATION) \ - --OUTPUTFILE=$(OUTPUT_PACKAGE_PREFIX) \ - --DATAFILE_PATH=$(BASE_DIR)/installer/datafiles \ - base_container.data linux.data linux_dpkg.data - - # Strip the package extension from the package filename - sed -re 's/.rpm$$|.deb$$//' $(INTERMEDIATE_DIR)/package_filename 
> $(INTERMEDIATE_DIR)/package_file.tmp; mv $(INTERMEDIATE_DIR)/package_file.tmp $(INTERMEDIATE_DIR)/package_filename - - # Build the tar file containing both .rpm and .deb packages - cd $(INTERMEDIATE_DIR); tar cvf $(OUTPUT_PACKAGE_PREFIX).tar $(OUTPUT_PACKAGE_PREFIX).rpm $(OUTPUT_PACKAGE_PREFIX).deb - - ../installer/bundle/create_bundle.sh $(PF)_$(PF_DISTRO) $(INTERMEDIATE_DIR) $(OUTPUT_PACKAGE_PREFIX) - # Copy the shell bundle to the target directory - $(MKPATH) $(TARGET_DIR) - cd $(INTERMEDIATE_DIR); $(COPY) `cat $(INTERMEDIATE_DIR)/package_filename`.sh $(TARGET_DIR) - -else - - @echo "========================= Performing Building RPM and DPKG packages" - sudo $(RMDIR) $(STAGING_DIR) - $(MKPATH) $(INTERMEDIATE_DIR) - python $(SCXPAL_DIR)/installer/InstallBuilder/installbuilder.py \ - --BASE_DIR=$(BASE_DIR) \ - --TARGET_DIR=$(INTERMEDIATE_DIR) \ - --INTERMEDIATE_DIR=$(INSTALLER_TMPDIR) \ - --STAGING_DIR=$(STAGING_DIR) \ - --BUILD_TYPE=$(BUILD_TYPE) \ - --BUILD_CONFIGURATION=$(BUILD_CONFIGURATION) \ - --PFARCH=$(PF_ARCH) \ - --PFDISTRO=$(PF_DISTRO) \ - --PFMAJOR=$(PF_MAJOR) \ - --PFMINOR=$(PF_MINOR) \ - --VERSION=$(CONTAINER_BUILDVERSION_MAJOR).$(CONTAINER_BUILDVERSION_MINOR).$(CONTAINER_BUILDVERSION_PATCH) \ - --RELEASE=$(CONTAINER_BUILDVERSION_BUILDNR) \ - --CONTAINER_BUILD_LIBRARY=$(CONTAINERLIB_FILENAME) \ - $(DPKG_LOCATION) \ - --OUTPUTFILE=$(OUTPUT_PACKAGE_PREFIX) \ - --DATAFILE_PATH=$(BASE_DIR)/installer/datafiles \ - base_container.data linux.data linux_dpkg.data - -endif +# -*- mode: Makefile; -*- +# Copyright (c) Microsoft Corporation + +BASE_DIR := $(subst /build,,$(PWD)) +OMI_ROOT := $(shell cd ../../omi/Unix; pwd -P) +SCXPAL_DIR := $(shell cd ../../pal; pwd -P) + +PF_POSIX := 1 +include $(SCXPAL_DIR)/build/config.mak +include $(BASE_DIR)/build/config.mak +include $(SCXPAL_DIR)/build/Makefile.pal + +ifndef ENABLE_DEBUG +$(error "ENABLE_DEBUG is not set. 
Please re-run configure") +endif + +# Include the version file +include ../../docker.version + +ifndef CONTAINER_BUILDVERSION_STATUS +$(error "Is docker.version missing? Please re-run configure") +endif + +SOURCE_DIR := $(BASE_DIR)/source/code +TEST_DIR := $(BASE_DIR)/test/code + +PROVIDER_DIR := $(SOURCE_DIR)/providers +PROVIDER_TEST_DIR := $(TEST_DIR)/providers +PAL_INCLUDE_DIR := $(SCXPAL_DIR)/source/code/include +PAL_TESTUTILS_DIR := $(SCXPAL_DIR)/test/code/testutils + +INTERMEDIATE_DIR := $(BASE_DIR)/intermediate/$(BUILD_CONFIGURATION) +INTERMEDIATE_TESTFILES := $(INTERMEDIATE_DIR)/testfiles +TARGET_DIR := $(BASE_DIR)/target/$(BUILD_CONFIGURATION) +PROVIDER_LIBRARY := $(INTERMEDIATE_DIR)/libcontainer.so + +INSTALLER_TMPDIR := $(INTERMEDIATE_DIR)/installer_tmp + +# GO Source dir for custom fluent bit plugin +GO_SOURCE_DIR := $(SOURCE_DIR)/go/src/plugins + +# Include files + +INCLUDE_DEFINES := $(INTERMEDIATE_DIR)/defines.h + +# Compiler flags + +OMI_INCLUDE_FLAGS := -I$(OMI_ROOT)/output/include +PROVIDER_INCLUDE_FLAGS := -I$(PAL_INCLUDE_DIR) -I$(INTERMEDIATE_DIR) + +PROVIDER_TEST_INCLUDE_FLAGS := -Wmissing-include-dirs -Wno-non-virtual-dtor -I$(SCXPAL_DIR)/source/code/include -I$(INTERMEDIATE_DIR) -I$(SCXPAL_DIR)/test/ext/include -I$(OMI_ROOT)/output/include -I$(OMI_ROOT) -I$(OMI_ROOT)/common -I$(SCXPAL_DIR)/test/code/include $(PROVIDER_INCLUDE_FLAGS) -I$(PROVIDER_DIR) + +ifeq ($(ENABLE_DEBUG),1) +PROV_DEBUG_FLAGS := -g +endif + +COMPILE_FLAGS := $(PROV_DEBUG_FLAGS) -D_REENTRANT -fstack-protector-all -Wall -fno-nonansi-builtins -Woverloaded-virtual -Wformat -Wformat-security -Wcast-align -Wswitch-enum -Wshadow -Wwrite-strings -Wredundant-decls -Wcast-qual -fPIC +PROVIDER_COMPILE_FLAGS := $(COMPILE_FLAGS) + +LINK_LIBRARIES := -Wl,-rpath=/opt/omi/lib -L$(OMI_ROOT)/output/lib -lmicxx -L$(SCXPAL_TARGET_DIR) -lscxcore -lUtil -lscxassertabort -lrt -luuid +PROVIDER_TEST_LINK_LIBRARIES := -lbase -lpal -L$(SCXPAL_TARGET_DIR) -lscxcore 
$(SCXPAL_DIR)/test/ext/lib/linux/$(ARCH)/cppunit/libcppunit.a -lpthread -lrt -luuid + +SHARED_FLAGS := -shared + +# Support for installbuilder + +STAGING_DIR := $(INTERMEDIATE_DIR)/staging + +ifeq ($(ULINUX),1) + # For consistency, the architecture should be i686 (for x86) and x86_64 (for x64) + DOCKER_ARCH := $(shell echo $(PF_ARCH) | sed -e 's/x86$$/i686/' -e 's/x64$$/x86_64/') + OUTPUT_PACKAGE_PREFIX=docker-cimprov-$(CONTAINER_BUILDVERSION_MAJOR).$(CONTAINER_BUILDVERSION_MINOR).$(CONTAINER_BUILDVERSION_PATCH)-$(CONTAINER_BUILDVERSION_BUILDNR).universal.$(DOCKER_ARCH) +else + PF_DISTRO_LC := $(shell echo $(PF_DISTRO) | tr A-Z a-z) + OUTPUT_PACKAGE_PREFIX=docker-cimprov-$(CONTAINER_BUILDVERSION_MAJOR).$(CONTAINER_BUILDVERSION_MINOR).$(CONTAINER_BUILDVERSION_PATCH)-$(CONTAINER_BUILDVERSION_BUILDNR).$(PF_DISTRO_LC).$(PF_MAJOR).$(PF_ARCH) +endif + +ifeq ("$(wildcard /usr/bin/dpkg-deb)","") + DPKG_LOCATION="--DPKG_LOCATION=$(SCXPAL_DIR)/installer/InstallBuilder/tools/bin/dpkg-deb-$(PF_ARCH)" +else + DPKG_LOCATION= +endif + +# Support for src_to_obj handling + +INCLUDES = $(OMI_INCLUDE_FLAGS) $(PROVIDER_INCLUDE_FLAGS) +CFLAGS = $(COMPILE_FLAGS) +CXXFLAGS = $(COMPILE_FLAGS) + +#-------------------------------------------------------------------------------- +# Build targets + +ifeq ($(ULINUX),1) +all : $(OMI_ROOT)/output $(SCXPAL_INTERMEDIATE_DIR) PROVIDER_STATUS $(PROVIDER_LIBRARY) KIT_STATUS kit fluentbitplugin +else +all : $(OMI_ROOT)/output $(SCXPAL_INTERMEDIATE_DIR) PROVIDER_STATUS $(PROVIDER_LIBRARY) fluentbitplugin +endif + +clean : + $(RMDIR) $(BASE_DIR)/build/cppunit_result.* $(BASE_DIR)/build/scxtestrunner.log $(BASE_DIR)/installer/intermediate $(BASE_DIR)/intermediate $(BASE_DIR)/target $(PROVIDER_TEST_DIR)/providertestutils.cpp + -find $(BASE_DIR) -name \*~ -exec rm {} \; + -$(RM) $(TEST_DIR)/providers/TestScriptPath.h + +distclean : clean + $(RM) $(BASE_DIR)/build/config.mak + -make -C $(OMI_ROOT) distclean + -make -C $(SCXPAL_DIR)/build distclean + 
-$(RMDIR) $(OMI_ROOT)/output* + -$(RM) $(SCXPAL_DIR)/build/config.mak + -$(RM) $(SCXPAL_DIR)/build/Makefile.config_cache + +PROVIDER_STATUS: + @echo "========================= Performing Building provider" + +KIT_STATUS: + @echo "========================= Performing Building provider tests" + +#-------------------------------------------------------------------------------- +# OMI build +# +# Build the OMI distribution +# +# Technically, we should go to build OMI all the time. But I'd rather not spend +# the time doing it here EVERY TIME, when we never normally change OMI. This is +# a good tradeoff (build if not built, otherwise assume all is well). +# +# Doing a 'make clean' in OMI directory will force us to rebuild. + +$(OMI_ROOT)/output : $(OMI_ROOT)/output/lib/libmicxx.so + +$(OMI_ROOT)/output/lib/libmicxx.so : + @echo "========================= Performing Building OMI" + make -C $(OMI_ROOT) +ifeq ($(PERFORM_OMI_MAKEINSTALL),1) + make -C $(OMI_ROOT) install +endif + +#--------------------------------------------------------------------------------- +# fluentbit go plugin build. This is required to send container logs to ODS endpoint +# +fluentbitplugin : + @echo "========================= Building fluentbit out_oms go plugin for logs" + make -C $(GO_SOURCE_DIR) fbplugin + $(COPY) $(GO_SOURCE_DIR)/out_oms.so $(INTERMEDIATE_DIR) + +#-------------------------------------------------------------------------------- +# PAL build +# +# Build the PAL (Platform Abstraction Layer) +# +# Doing a 'make clean' in PAL directory will force us to rebuild. 
+ +$(SCXPAL_INTERMEDIATE_DIR) : + @echo "========================= Performing Building PAL" + make -C $(SCXPAL_DIR)/build + +#================================================================================ +# File depends.h (compiler dependencies) +#================================================================================ + +$(INCLUDE_DEFINES) : $(BASE_DIR)/build/config.mak + -$(MKPATH) $(@D) + @$(ECHO) "Creating $@" + @$(call pf_fwrite,"/*-------------------------------------------------------------------------------", $@) + @$(call pf_fappend," Copyright (C) 2007-2015 Microsoft Corp. ", $@) + @$(call pf_fappend," ", $@) + @$(call pf_fappend,"*/ ", $@) + @$(call pf_fappend,"/** ", $@) + @$(call pf_fappend," \file ", $@) + @$(call pf_fappend," ", $@) + @$(call pf_fappend," \brief Auto generated file containing build definitions ", $@) + @$(call pf_fappend," ", $@) + @$(call pf_fappend," \author Automated Build System ", $@) + @$(call pf_fappend," ", $@) + @$(call pf_fappend," DO NOT EDIT THIS FILE! ", $@) + @$(call pf_fappend," DO NOT CHECK IN THIS FILE! 
", $@) + @$(call pf_fappend,"*/ ", $@) + @$(call pf_fappend,"/*----------------------------------------------------------------------------*/", $@) + @$(call pf_fappend,"#ifndef DEFINES_H ", $@) + @$(call pf_fappend,"#define DEFINES_H ", $@) + @$(call pf_fappend," ", $@) +ifneq ($(PF_DISTRO),) + @$(call pf_fappend," ", $@) + @$(call pf_fappend,"#ifndef PF_DISTRO_$(PF_DISTRO) ", $@) + @$(call pf_fappend,"#define PF_DISTRO_$(PF_DISTRO) ", $@) + @$(call pf_fappend,"#endif ", $@) +endif +ifneq ($(PF_MAJOR),) + @$(call pf_fappend," ", $@) + @$(call pf_fappend,"#ifndef PF_MAJOR ", $@) + @$(call pf_fappend,"#define PF_MAJOR $(PF_MAJOR) ", $@) + @$(call pf_fappend,"#endif ", $@) +endif +ifneq ($(PF_MINOR),) + @$(call pf_fappend," ", $@) + @$(call pf_fappend,"#ifndef PF_MINOR ", $@) + @$(call pf_fappend,"#define PF_MINOR $(PF_MINOR) ", $@) + @$(call pf_fappend,"#endif ", $@) +endif +ifneq ($(ARCH),) + @$(call pf_fappend," ", $@) + @$(call pf_fappend,"#ifndef $(ARCH) ", $@) + @$(call pf_fappend,"#define $(ARCH) ", $@) + @$(call pf_fappend,"#endif ", $@) +endif +ifeq ($(BUILD_TYPE),Debug) + @$(call pf_fappend," ", $@) + @$(call pf_fappend,"#ifndef _DEBUG ", $@) + @$(call pf_fappend,"#define _DEBUG ", $@) + @$(call pf_fappend,"#endif ", $@) +else + @$(call pf_fappend," ", $@) + @$(call pf_fappend,"#ifndef NDEBUG ", $@) + @$(call pf_fappend,"#define NDEBUG ", $@) + @$(call pf_fappend,"#endif ", $@) +endif + @$(call pf_fappend," ", $@) + @$(call pf_fappend,"#endif /* DEFINES_H */ ", $@) + @$(call pf_fappend,"/*----------------------------E-N-D---O-F---F-I-L-E---------------------------*/", $@) + +#================================================================================ +# Internal functions +#================================================================================ + +# Convert a list of src files with absolute paths under BASE_DIR to corresponding +# object files under intermediate directory +# src_to_obj(list_of_cppfiles) +src_to_obj = $(patsubst $(BASE_DIR)%, 
$(INTERMEDIATE_DIR)%, $(patsubst %.c, %.o, $(patsubst %.cpp, %.o, $(1)))) + +# No default rules, please +.SUFFIX: + +# Rule for compiling cpp files in source tree, ouptut in mirrored intermediate dir +$(INTERMEDIATE_DIR)/%.o : $(BASE_DIR)/%.cpp $(INCLUDE_DEFINES) + $(MKPATH) $(@D) + $(CXX) -c $(CXXFLAGS) $(INCLUDES) -I$( $(TEST_DIR)/providers/TestScriptPath.h + +test : TEST_STATUS $(SCXPAL_INTERMEDIATE_DIR) $(INTERMEDIATE_DIR)/testrunner + @echo "========================= Performing container testrun execution" + $(MKPATH) $(INTERMEDIATE_TESTFILES) + $(COPY) $(TEST_DIR)/scripts/createEnv.sh $(TEST_DIR)/scripts/testrun_wrapper $(INTERMEDIATE_TESTFILES) + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$(OMI_ROOT)/output/lib; cd $(INTERMEDIATE_TESTFILES); ./createEnv.sh + cd $(INTERMEDIATE_TESTFILES); ./testrun_wrapper $(INTERMEDIATE_DIR) + +#-------------------------------------------------------------------------------- +# Build the distribution kit +# +# Build the packages via installbuilder +# +# While the "formal build" only builds ULINUX, we may build something else for DEV purposes. +# Assume we ALWAYS build DPKG, but only build RPM if --enable-ulinux is speified in configure. 
+ +kit : CONTAINERLIB_FILENAME = libcontainer.so +kit : $(OMI_ROOT)/output $(PROVIDER_LIBRARY) fluentbitplugin + +ifeq ($(ULINUX),1) + + @echo "========================= Performing Building RPM and DPKG packages" + $(MKPATH) $(INSTALLER_TMPDIR) + sudo $(RMDIR) $(STAGING_DIR) + $(MKPATH) $(INTERMEDIATE_DIR) + python $(SCXPAL_DIR)/installer/InstallBuilder/installbuilder.py \ + --BASE_DIR=$(BASE_DIR) \ + --TARGET_DIR=$(INTERMEDIATE_DIR) \ + --INTERMEDIATE_DIR=$(INSTALLER_TMPDIR) \ + --STAGING_DIR=$(STAGING_DIR) \ + --BUILD_TYPE=$(BUILD_TYPE) \ + --BUILD_CONFIGURATION=$(BUILD_CONFIGURATION) \ + --PFARCH=$(PF_ARCH) \ + --PFDISTRO=$(PF_DISTRO) \ + --PFMAJOR=$(PF_MAJOR) \ + --PFMINOR=$(PF_MINOR) \ + --VERSION=$(CONTAINER_BUILDVERSION_MAJOR).$(CONTAINER_BUILDVERSION_MINOR).$(CONTAINER_BUILDVERSION_PATCH) \ + --RELEASE=$(CONTAINER_BUILDVERSION_BUILDNR) \ + --CONTAINER_BUILD_LIBRARY=$(CONTAINERLIB_FILENAME) \ + --OUTPUTFILE=$(OUTPUT_PACKAGE_PREFIX) \ + --DATAFILE_PATH=$(BASE_DIR)/installer/datafiles \ + base_container.data linux.data linux_rpm.data + + sudo $(RMDIR) $(STAGING_DIR) + $(MKPATH) $(INTERMEDIATE_DIR) + python $(SCXPAL_DIR)/installer/InstallBuilder/installbuilder.py \ + --BASE_DIR=$(BASE_DIR) \ + --TARGET_DIR=$(INTERMEDIATE_DIR) \ + --INTERMEDIATE_DIR=$(INSTALLER_TMPDIR) \ + --STAGING_DIR=$(STAGING_DIR) \ + --BUILD_TYPE=$(BUILD_TYPE) \ + --BUILD_CONFIGURATION=$(BUILD_CONFIGURATION) \ + --PFARCH=$(PF_ARCH) \ + --PFDISTRO=$(PF_DISTRO) \ + --PFMAJOR=$(PF_MAJOR) \ + --PFMINOR=$(PF_MINOR) \ + --VERSION=$(CONTAINER_BUILDVERSION_MAJOR).$(CONTAINER_BUILDVERSION_MINOR).$(CONTAINER_BUILDVERSION_PATCH) \ + --RELEASE=$(CONTAINER_BUILDVERSION_BUILDNR) \ + --CONTAINER_BUILD_LIBRARY=$(CONTAINERLIB_FILENAME) \ + $(DPKG_LOCATION) \ + --OUTPUTFILE=$(OUTPUT_PACKAGE_PREFIX) \ + --DATAFILE_PATH=$(BASE_DIR)/installer/datafiles \ + base_container.data linux.data linux_dpkg.data + + # Strip the package extension from the package filename + sed -re 's/.rpm$$|.deb$$//' 
$(INTERMEDIATE_DIR)/package_filename > $(INTERMEDIATE_DIR)/package_file.tmp; mv $(INTERMEDIATE_DIR)/package_file.tmp $(INTERMEDIATE_DIR)/package_filename + + # Build the tar file containing both .rpm and .deb packages + cd $(INTERMEDIATE_DIR); tar cvf $(OUTPUT_PACKAGE_PREFIX).tar $(OUTPUT_PACKAGE_PREFIX).rpm $(OUTPUT_PACKAGE_PREFIX).deb + + ../installer/bundle/create_bundle.sh $(PF)_$(PF_DISTRO) $(INTERMEDIATE_DIR) $(OUTPUT_PACKAGE_PREFIX) + # Copy the shell bundle to the target directory + $(MKPATH) $(TARGET_DIR) + cd $(INTERMEDIATE_DIR); $(COPY) `cat $(INTERMEDIATE_DIR)/package_filename`.sh $(TARGET_DIR) + +else + + @echo "========================= Performing Building RPM and DPKG packages" + sudo $(RMDIR) $(STAGING_DIR) + $(MKPATH) $(INTERMEDIATE_DIR) + python $(SCXPAL_DIR)/installer/InstallBuilder/installbuilder.py \ + --BASE_DIR=$(BASE_DIR) \ + --TARGET_DIR=$(INTERMEDIATE_DIR) \ + --INTERMEDIATE_DIR=$(INSTALLER_TMPDIR) \ + --STAGING_DIR=$(STAGING_DIR) \ + --BUILD_TYPE=$(BUILD_TYPE) \ + --BUILD_CONFIGURATION=$(BUILD_CONFIGURATION) \ + --PFARCH=$(PF_ARCH) \ + --PFDISTRO=$(PF_DISTRO) \ + --PFMAJOR=$(PF_MAJOR) \ + --PFMINOR=$(PF_MINOR) \ + --VERSION=$(CONTAINER_BUILDVERSION_MAJOR).$(CONTAINER_BUILDVERSION_MINOR).$(CONTAINER_BUILDVERSION_PATCH) \ + --RELEASE=$(CONTAINER_BUILDVERSION_BUILDNR) \ + --CONTAINER_BUILD_LIBRARY=$(CONTAINERLIB_FILENAME) \ + $(DPKG_LOCATION) \ + --OUTPUTFILE=$(OUTPUT_PACKAGE_PREFIX) \ + --DATAFILE_PATH=$(BASE_DIR)/installer/datafiles \ + base_container.data linux.data linux_dpkg.data + +endif diff --git a/installer/conf/out_oms.conf b/installer/conf/out_oms.conf new file mode 100644 index 000000000..d4b797757 --- /dev/null +++ b/installer/conf/out_oms.conf @@ -0,0 +1,6 @@ +omsadmin_conf_path=/etc/opt/microsoft/omsagent/conf/omsadmin.conf +cert_file_path=/etc/opt/microsoft/omsagent/certs/oms.crt +key_file_path=/etc/opt/microsoft/omsagent/certs/oms.key +container_host_file_path=/var/opt/microsoft/docker-cimprov/state/containerhostname 
+container_inventory_refresh_interval=60 +kube_system_containers_refresh_interval=300 diff --git a/installer/conf/td-agent-bit.conf b/installer/conf/td-agent-bit.conf new file mode 100644 index 000000000..cf490c077 --- /dev/null +++ b/installer/conf/td-agent-bit.conf @@ -0,0 +1,35 @@ +[SERVICE] + Flush 5 + Log_Level info + Parsers_File /etc/td-agent-bit/parsers.conf + Log_File /var/log/fluent-bit.log + +[INPUT] + Name tail + Tag oms.container.log.* + Path /var/log/containers/*.log + DB /var/log/fblogs.db + Parser docker + Mem_Buf_Limit 30m + Path_Key filepath + +[FILTER] + Name record_modifier + Match oms.container.log.* + Whitelist_key log + Whitelist_key stream + Whitelist_key time + Whitelist_key filepath + +[FILTER] + Name modify + Match oms.container.log.* + Rename log LogEntry + Rename stream LogEntrySource + Rename time LogEntryTimeStamp + Rename filepath Filepath + Add_if_not_present SourceSystem Containers + +[OUTPUT] + Name oms + Match oms.container.log.* \ No newline at end of file diff --git a/installer/datafiles/base_container.data b/installer/datafiles/base_container.data index ec0728c01..85a128b2a 100644 --- a/installer/datafiles/base_container.data +++ b/installer/datafiles/base_container.data @@ -37,7 +37,9 @@ MAINTAINER: 'Microsoft Corporation' /opt/microsoft/omsagent/plugin/in_kube_services.rb; source/code/plugin/in_kube_services.rb; 644; root; root /opt/microsoft/omsagent/plugin/in_kube_nodes.rb; source/code/plugin/in_kube_nodes.rb; 644; root; root - +/opt/td-agent-bit/bin/out_oms.so; intermediate/${{BUILD_CONFIGURATION}}/out_oms.so; 755; root; root +/etc/opt/microsoft/docker-cimprov/td-agent-bit.conf; installer/conf/td-agent-bit.conf; 644; root; root +/etc/opt/microsoft/docker-cimprov/out_oms.conf; installer/conf/out_oms.conf; 644; root; root %Links /opt/omi/lib/libcontainer.${{SHLIB_EXT}}; /opt/microsoft/docker-cimprov/lib/libcontainer.${{SHLIB_EXT}}; 644; root; root @@ -76,6 +78,9 @@ MAINTAINER: 'Microsoft Corporation' 
/var/opt/microsoft/docker-cimprov/state/ImageInventory; 755; root; root /var/opt/microsoft/docker-cimprov/log; 755; root; root +/opt/td-agent-bit; 755; root; root;sysdir +/opt/td-agent-bit/bin; 755; root; root;sysdir + %Dependencies %Postinstall_10 diff --git a/source/code/go/src/plugins/Makefile b/source/code/go/src/plugins/Makefile new file mode 100644 index 000000000..dfdc65d81 --- /dev/null +++ b/source/code/go/src/plugins/Makefile @@ -0,0 +1,20 @@ +GITVERSION := 0.1 +UNAME_S := $(shell uname -s) +ifeq ($(UNAME_S),Linux) + BUILDDATE := $(shell date --rfc-3339=seconds) +endif +ifeq ($(UNAME_S),Darwin) + BUILDDATE := $(shell gdate --rfc-3339=seconds) +endif + +fbplugin: + go build -ldflags "-X 'main.revision=$(GITVERSION)' -X 'main.builddate=$(BUILDDATE)'" -buildmode=c-shared -o out_oms.so . + +test: + go test -cover -race -coverprofile=coverage.txt -covermode=atomic + +glide: + glide install + +clean: + rm -rf *.so *.h *~ diff --git a/source/code/go/src/plugins/glide.lock b/source/code/go/src/plugins/glide.lock new file mode 100644 index 000000000..79745820b --- /dev/null +++ b/source/code/go/src/plugins/glide.lock @@ -0,0 +1,209 @@ +hash: a4b073d827b5cbb4a772dada9ff3bcf55c55afc3cda83ddec1e6edcdca8e219a +updated: 2018-09-06T04:07:01.808678175Z +imports: +- name: github.com/fluent/fluent-bit-go + version: c4a158a6e3a793166c6ecfa2d5c80d71eada8959 + subpackages: + - output +- name: github.com/ghodss/yaml + version: 73d445a93680fa1a78ae23a5839bad48f32ba1ee +- name: github.com/gogo/protobuf + version: c0656edd0d9eab7c66d1eb0c568f9039345796f7 + subpackages: + - proto + - sortkeys +- name: github.com/golang/glog + version: 44145f04b68cf362d9c4df2182967c2275eaefed +- name: github.com/golang/protobuf + version: b4deda0973fb4c70b50d226b1af49f3da59f5265 + subpackages: + - proto + - ptypes + - ptypes/any + - ptypes/duration + - ptypes/timestamp +- name: github.com/google/btree + version: 7d79101e329e5a3adf994758c578dab82b90c017 +- name: github.com/google/gofuzz + version: 
44d81051d367757e1c7c6a5a86423ece9afcf63c +- name: github.com/googleapis/gnostic + version: 0c5108395e2debce0d731cf0287ddf7242066aba + subpackages: + - OpenAPIv2 + - compiler + - extensions +- name: github.com/gregjones/httpcache + version: 787624de3eb7bd915c329cba748687a3b22666a6 + subpackages: + - diskcache +- name: github.com/json-iterator/go + version: f2b4162afba35581b6d4a50d3b8f34e33c144682 +- name: github.com/mitchellh/mapstructure + version: fa473d140ef3c6adf42d6b391fe76707f1f243c8 +- name: github.com/modern-go/concurrent + version: bacd9c7ef1dd9b15be4a9909b8ac7a4e313eec94 +- name: github.com/modern-go/reflect2 + version: 05fbef0ca5da472bbf96c9322b84a53edc03c9fd +- name: github.com/peterbourgon/diskv + version: 5f041e8faa004a95c88a202771f4cc3e991971e6 +- name: github.com/ugorji/go + version: 00b869d2f4a5e27445c2d916fa106fc72c106d4c + subpackages: + - codec +- name: golang.org/x/crypto + version: 49796115aa4b964c318aad4f3084fdb41e9aa067 + subpackages: + - ssh/terminal +- name: golang.org/x/net + version: 1c05540f6879653db88113bc4a2b70aec4bd491f + subpackages: + - context + - html + - html/atom + - http2 + - http2/hpack + - idna + - lex/httplex + - websocket +- name: golang.org/x/sys + version: 95c6576299259db960f6c5b9b69ea52422860fce + subpackages: + - unix + - windows +- name: golang.org/x/text + version: b19bf474d317b857955b12035d2c5acb57ce8b01 + subpackages: + - secure/bidirule + - transform + - unicode/bidi + - unicode/norm +- name: golang.org/x/time + version: f51c12702a4d776e4c1fa9b0fabab841babae631 + subpackages: + - rate +- name: gopkg.in/inf.v0 + version: 3887ee99ecf07df5b447e9b00d9c0b2adaa9f3e4 +- name: gopkg.in/yaml.v2 + version: 670d4cfef0544295bc27a114dbac37980d83185a +- name: k8s.io/api + version: 072894a440bdee3a891dea811fe42902311cd2a3 + subpackages: + - admissionregistration/v1alpha1 + - admissionregistration/v1beta1 + - apps/v1 + - apps/v1beta1 + - apps/v1beta2 + - authentication/v1 + - authentication/v1beta1 + - authorization/v1 + - 
authorization/v1beta1 + - autoscaling/v1 + - autoscaling/v2beta1 + - batch/v1 + - batch/v1beta1 + - batch/v2alpha1 + - certificates/v1beta1 + - core/v1 + - events/v1beta1 + - extensions/v1beta1 + - imagepolicy/v1alpha1 + - networking/v1 + - policy/v1beta1 + - rbac/v1 + - rbac/v1alpha1 + - rbac/v1beta1 + - scheduling/v1alpha1 + - scheduling/v1beta1 + - settings/v1alpha1 + - storage/v1 + - storage/v1alpha1 + - storage/v1beta1 +- name: k8s.io/apimachinery + version: 103fd098999dc9c0c88536f5c9ad2e5da39373ae + subpackages: + - pkg/api/errors + - pkg/api/meta + - pkg/api/resource + - pkg/apis/meta/v1 + - pkg/apis/meta/v1/unstructured + - pkg/apis/meta/v1beta1 + - pkg/conversion + - pkg/conversion/queryparams + - pkg/fields + - pkg/labels + - pkg/runtime + - pkg/runtime/schema + - pkg/runtime/serializer + - pkg/runtime/serializer/json + - pkg/runtime/serializer/protobuf + - pkg/runtime/serializer/recognizer + - pkg/runtime/serializer/streaming + - pkg/runtime/serializer/versioning + - pkg/selection + - pkg/types + - pkg/util/clock + - pkg/util/errors + - pkg/util/framer + - pkg/util/intstr + - pkg/util/json + - pkg/util/net + - pkg/util/runtime + - pkg/util/sets + - pkg/util/validation + - pkg/util/validation/field + - pkg/util/wait + - pkg/util/yaml + - pkg/version + - pkg/watch + - third_party/forked/golang/reflect +- name: k8s.io/client-go + version: 7d04d0e2a0a1a4d4a1cd6baa432a2301492e4e65 + subpackages: + - discovery + - kubernetes + - kubernetes/scheme + - kubernetes/typed/admissionregistration/v1alpha1 + - kubernetes/typed/admissionregistration/v1beta1 + - kubernetes/typed/apps/v1 + - kubernetes/typed/apps/v1beta1 + - kubernetes/typed/apps/v1beta2 + - kubernetes/typed/authentication/v1 + - kubernetes/typed/authentication/v1beta1 + - kubernetes/typed/authorization/v1 + - kubernetes/typed/authorization/v1beta1 + - kubernetes/typed/autoscaling/v1 + - kubernetes/typed/autoscaling/v2beta1 + - kubernetes/typed/batch/v1 + - kubernetes/typed/batch/v1beta1 + - 
kubernetes/typed/batch/v2alpha1 + - kubernetes/typed/certificates/v1beta1 + - kubernetes/typed/core/v1 + - kubernetes/typed/events/v1beta1 + - kubernetes/typed/extensions/v1beta1 + - kubernetes/typed/networking/v1 + - kubernetes/typed/policy/v1beta1 + - kubernetes/typed/rbac/v1 + - kubernetes/typed/rbac/v1alpha1 + - kubernetes/typed/rbac/v1beta1 + - kubernetes/typed/scheduling/v1alpha1 + - kubernetes/typed/scheduling/v1beta1 + - kubernetes/typed/settings/v1alpha1 + - kubernetes/typed/storage/v1 + - kubernetes/typed/storage/v1alpha1 + - kubernetes/typed/storage/v1beta1 + - pkg/apis/clientauthentication + - pkg/apis/clientauthentication/v1alpha1 + - pkg/apis/clientauthentication/v1beta1 + - pkg/version + - plugin/pkg/client/auth/exec + - rest + - rest/watch + - tools/clientcmd/api + - tools/metrics + - tools/reference + - transport + - util/cert + - util/connrotation + - util/flowcontrol + - util/integer +testImports: [] diff --git a/source/code/go/src/plugins/glide.yaml b/source/code/go/src/plugins/glide.yaml new file mode 100644 index 000000000..b986ece21 --- /dev/null +++ b/source/code/go/src/plugins/glide.yaml @@ -0,0 +1,15 @@ +package: plugins +import: +- package: github.com/fluent/fluent-bit-go + subpackages: + - output +- package: github.com/mitchellh/mapstructure + version: ^1.0.0 +- package: k8s.io/apimachinery + subpackages: + - pkg/apis/meta/v1 +- package: k8s.io/client-go + version: ^8.0.0 + subpackages: + - kubernetes + - rest diff --git a/source/code/go/src/plugins/oms.go b/source/code/go/src/plugins/oms.go new file mode 100644 index 000000000..49472c74b --- /dev/null +++ b/source/code/go/src/plugins/oms.go @@ -0,0 +1,359 @@ +package main + +import ( + "bytes" + "encoding/json" + "fmt" + "io/ioutil" + "log" + "net/http" + "os" + "strconv" + "strings" + "sync" + "time" + + "github.com/fluent/fluent-bit-go/output" + "github.com/mitchellh/mapstructure" + lumberjack "gopkg.in/natefinch/lumberjack.v2" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + 
"k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" +) + +// DataType for Container Log +const DataType = "CONTAINER_LOG_BLOB" + +// IPName for Container Log +const IPName = "Containers" +const containerInventoryPath = "/var/opt/microsoft/docker-cimprov/state/ContainerInventory" +const defaultContainerInventoryRefreshInterval = 60 +const defaultKubeSystemContainersRefreshInterval = 300 + +var ( + // PluginConfiguration the plugins configuration + PluginConfiguration map[string]string + // HTTPClient for making POST requests to OMSEndpoint + HTTPClient http.Client + // OMSEndpoint ingestion endpoint + OMSEndpoint string + // Computer (Hostname) when ingesting into ContainerLog table + Computer string +) + +var ( + // ImageIDMap caches the container id to image mapping + ImageIDMap map[string]string + // NameIDMap caches the container it to Name mapping + NameIDMap map[string]string + // IgnoreIDSet set of container Ids of kube-system pods + IgnoreIDSet map[string]bool + + // DataUpdateMutex read and write mutex access to the container id set + DataUpdateMutex = &sync.Mutex{} +) + +var ( + // FLBLogger stream + FLBLogger = createLogger() + + // Log wrapper function + Log = FLBLogger.Printf +) + +// ContainerInventory represents the container info +type ContainerInventory struct { + ElementName string `json:"ElementName"` + CreatedTime string `json:"CreatedTime"` + State string `json:"State"` + ExitCode int `json:"ExitCode"` + StartedTime string `json:"StartedTime"` + FinishedTime string `json:"FinishedTime"` + ImageID string `json:"ImageId"` + Image string `json:"Image"` + Repository string `json:"Repository"` + ImageTag string `json:"ImageTag"` + ComposeGroup string `json:"ComposeGroup"` + ContainerHostname string `json:"ContainerHostname"` + Computer string `json:"Computer"` + Command string `json:"Command"` + EnvironmentVar string `json:"EnvironmentVar"` + Ports string `json:"Ports"` + Links string `json:"Links"` +} + +// DataItem represents the object 
corresponding to the json that is sent by fluentbit tail plugin +type DataItem struct { + LogEntry string `json:"LogEntry"` + LogEntrySource string `json:"LogEntrySource"` + LogEntryTimeStamp string `json:"LogEntryTimeStamp"` + ID string `json:"Id"` + Image string `json:"Image"` + Name string `json:"Name"` + SourceSystem string `json:"SourceSystem"` + Computer string `json:"Computer"` + Filepath string `json:"Filepath"` +} + +// ContainerLogBlob represents the object corresponding to the payload that is sent to the ODS end point +type ContainerLogBlob struct { + DataType string `json:"DataType"` + IPName string `json:"IPName"` + DataItems []DataItem `json:"DataItems"` +} + +func populateMaps() { + + Log("Updating ImageIDMap and NameIDMap") + + _imageIDMap := make(map[string]string) + _nameIDMap := make(map[string]string) + files, err := ioutil.ReadDir(containerInventoryPath) + + if err != nil { + Log("error when reading container inventory %s\n", err.Error()) + } + + for _, file := range files { + fullPath := fmt.Sprintf("%s/%s", containerInventoryPath, file.Name()) + fileContent, err := ioutil.ReadFile(fullPath) + if err != nil { + Log("Error reading file content %s", fullPath) + Log(err.Error()) + } + var containerInventory ContainerInventory + unmarshallErr := json.Unmarshal(fileContent, &containerInventory) + + if unmarshallErr != nil { + Log("Unmarshall error when reading file %s %s \n", fullPath, unmarshallErr.Error()) + } + + _imageIDMap[file.Name()] = containerInventory.Image + _nameIDMap[file.Name()] = containerInventory.ElementName + } + Log("Locking to update image and name maps") + DataUpdateMutex.Lock() + ImageIDMap = _imageIDMap + NameIDMap = _nameIDMap + DataUpdateMutex.Unlock() + Log("Unlocking after updating image and name maps") +} + +func createLogger() *log.Logger { + + var logfile *os.File + path := "/var/opt/microsoft/docker-cimprov/log/fluent-bit-out-oms-runtime.log" + if _, err := os.Stat(path); err == nil { + fmt.Printf("File Exists. 
Opening file in append mode...\n") + logfile, err = os.OpenFile(path, os.O_APPEND|os.O_WRONLY, 0600) + if err != nil { + fmt.Printf(err.Error()) + } + } + + if _, err := os.Stat(path); os.IsNotExist(err) { + fmt.Printf("File Doesnt Exist. Creating file...\n") + logfile, err = os.Create(path) + if err != nil { + fmt.Printf(err.Error()) + } + } + + logger := log.New(logfile, "", 0) + + logger.SetOutput(&lumberjack.Logger{ + Filename: path, + MaxSize: 10, //megabytes + MaxBackups: 3, + MaxAge: 28, //days + Compress: true, // false by default + }) + + logger.SetFlags(log.Ltime | log.Lshortfile | log.LstdFlags) + return logger +} + +func updateContainersData() { + + containerInventoryRefreshInterval, err := strconv.Atoi(PluginConfiguration["container_inventory_refresh_interval"]) + if err != nil { + Log("Error Reading Container Inventory Refresh Interval %s", err.Error()) + containerInventoryRefreshInterval = defaultContainerInventoryRefreshInterval + } + Log("containerInventoryRefreshInterval = %d \n", containerInventoryRefreshInterval) + go initMaps(containerInventoryRefreshInterval) + + kubeSystemContainersRefreshInterval, err := strconv.Atoi(PluginConfiguration["kube_system_containers_refresh_interval"]) + if err != nil { + Log("Error Reading Kube System Container Ids Refresh Interval %s", err.Error()) + kubeSystemContainersRefreshInterval = defaultKubeSystemContainersRefreshInterval + } + Log("kubeSystemContainersRefreshInterval = %d \n", kubeSystemContainersRefreshInterval) + + go updateIgnoreContainerIds(kubeSystemContainersRefreshInterval) +} + +func initMaps(refreshInterval int) { + ImageIDMap = make(map[string]string) + NameIDMap = make(map[string]string) + + populateMaps() + + for range time.Tick(time.Second * time.Duration(refreshInterval)) { + populateMaps() + } +} + +func updateIgnoreContainerIds(refreshInterval int) { + IgnoreIDSet = make(map[string]bool) + + updateKubeSystemContainerIDs() + + for range time.Tick(time.Second * 
time.Duration(refreshInterval)) { + updateKubeSystemContainerIDs() + } +} + +func updateKubeSystemContainerIDs() { + + if strings.Compare(os.Getenv("DISABLE_KUBE_SYSTEM_LOG_COLLECTION"), "true") != 0 { + Log("Kube System Log Collection is ENABLED.") + return + } + + Log("Kube System Log Collection is DISABLED. Collecting containerIds to drop their records") + config, err := rest.InClusterConfig() + if err != nil { + Log("Error getting config %s\n", err.Error()) + } + + clientset, err := kubernetes.NewForConfig(config) + if err != nil { + Log("Error getting clientset %s", err.Error()) + } + + pods, err := clientset.CoreV1().Pods("kube-system").List(metav1.ListOptions{}) + if err != nil { + Log("Error getting pods %s\n", err.Error()) + } + + _ignoreIDSet := make(map[string]bool) + for _, pod := range pods.Items { + for _, status := range pod.Status.ContainerStatuses { + lastSlashIndex := strings.LastIndex(status.ContainerID, "/") + _ignoreIDSet[status.ContainerID[lastSlashIndex+1:len(status.ContainerID)]] = true + } + } + + Log("Locking to update kube-system container IDs") + DataUpdateMutex.Lock() + IgnoreIDSet = _ignoreIDSet + DataUpdateMutex.Unlock() + Log("Unlocking after updating kube-system container IDs") +} + +// PostDataHelper sends data to the OMS endpoint +func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { + + start := time.Now() + var dataItems []DataItem + DataUpdateMutex.Lock() + + for _, record := range tailPluginRecords { + + containerID := getContainerIDFromFilePath(toString(record["Filepath"])) + + if containsKey(IgnoreIDSet, containerID) { + continue + } + + var dataItem DataItem + stringMap := make(map[string]string) + + // convert map[interface{}]interface{} to map[string]string + for key, value := range record { + strKey := fmt.Sprintf("%v", key) + strValue := toString(value) + stringMap[strKey] = strValue + } + + stringMap["Id"] = containerID + stringMap["Image"] = ImageIDMap[containerID] + stringMap["Name"] = 
NameIDMap[containerID] + stringMap["Computer"] = Computer + mapstructure.Decode(stringMap, &dataItem) + dataItems = append(dataItems, dataItem) + } + DataUpdateMutex.Unlock() + + if len(dataItems) > 0 { + logEntry := ContainerLogBlob{ + DataType: DataType, + IPName: IPName, + DataItems: dataItems} + + marshalled, err := json.Marshal(logEntry) + req, _ := http.NewRequest("POST", OMSEndpoint, bytes.NewBuffer(marshalled)) + req.Header.Set("Content-Type", "application/json") + + resp, err := HTTPClient.Do(req) + elapsed := time.Since(start) + + if err != nil { + Log("Error when sending request %s \n", err.Error()) + Log("Failed to flush %d records after %s", len(dataItems), elapsed) + return output.FLB_RETRY + } + + if resp == nil || resp.StatusCode != 200 { + if resp != nil { + Log("Status %s Status Code %d", resp.Status, resp.StatusCode) + } + return output.FLB_RETRY + } + + Log("Successfully flushed %d records in %s", len(dataItems), elapsed) + } + + return output.FLB_OK +} + +func containsKey(currentMap map[string]bool, key string) bool { + _, c := currentMap[key] + return c +} + +func toString(s interface{}) string { + value := s.([]uint8) + return string([]byte(value[:])) +} + +func getContainerIDFromFilePath(filepath string) string { + start := strings.LastIndex(filepath, "-") + end := strings.LastIndex(filepath, ".") + return filepath[start+1 : end] +} + +// ReadConfig reads and populates plugin configuration +func ReadConfig(pluginConfPath string) map[string]string { + + pluginConf, err := ReadConfiguration(pluginConfPath) + omsadminConf, err := ReadConfiguration(pluginConf["omsadmin_conf_path"]) + + if err != nil { + Log(err.Error()) + } + + containerHostName, err := ioutil.ReadFile(pluginConf["container_host_file_path"]) + if err != nil { + Log("Error when reading containerHostName file %s", err.Error()) + } + + Computer = strings.TrimSuffix(toString(containerHostName), "\n") + Log("Computer == %s \n", Computer) + + OMSEndpoint = omsadminConf["OMS_ENDPOINT"] 
+ Log("OMSEndpoint %s", OMSEndpoint) + + return pluginConf +} diff --git a/source/code/go/src/plugins/out_oms.go b/source/code/go/src/plugins/out_oms.go new file mode 100644 index 000000000..dad0ede81 --- /dev/null +++ b/source/code/go/src/plugins/out_oms.go @@ -0,0 +1,57 @@ +package main + +import ( + "github.com/fluent/fluent-bit-go/output" +) +import ( + "C" + "unsafe" +) + +//export FLBPluginRegister +func FLBPluginRegister(ctx unsafe.Pointer) int { + return output.FLBPluginRegister(ctx, "oms", "Stdout GO!") +} + +//export FLBPluginInit +// (fluentbit will call this) +// ctx (context) pointer to fluentbit context (state/ c code) +func FLBPluginInit(ctx unsafe.Pointer) int { + Log("Initializing out_oms go plugin for fluentbit") + PluginConfiguration = ReadConfig("/etc/opt/microsoft/docker-cimprov/out_oms.conf") + CreateHTTPClient() + updateContainersData() + return output.FLB_OK +} + +//export FLBPluginFlush +func FLBPluginFlush(data unsafe.Pointer, length C.int, tag *C.char) int { + var count int + var ret int + var record map[interface{}]interface{} + var records []map[interface{}]interface{} + + // Create Fluent Bit decoder + dec := output.NewDecoder(data, int(length)) + + // Iterate Records + count = 0 + for { + // Extract Record + ret, _, record = output.GetRecord(dec) + if ret != 0 { + break + } + records = append(records, record) + count++ + } + return PostDataHelper(records) +} + +// FLBPluginExit exits the plugin +func FLBPluginExit() int { + return output.FLB_OK +} + +func main() { +} diff --git a/source/code/go/src/plugins/utils.go b/source/code/go/src/plugins/utils.go new file mode 100644 index 000000000..0e33f43f9 --- /dev/null +++ b/source/code/go/src/plugins/utils.go @@ -0,0 +1,67 @@ +package main + +import ( + "bufio" + "crypto/tls" + "log" + "net/http" + "os" + "strings" +) + +// ReadConfiguration reads a property file +func ReadConfiguration(filename string) (map[string]string, error) { + config := map[string]string{} + + if len(filename) == 0 
{ + return config, nil + } + + file, err := os.Open(filename) + if err != nil { + log.Fatal(err) + return nil, err + } + defer file.Close() + + scanner := bufio.NewScanner(file) + for scanner.Scan() { + currentLine := scanner.Text() + if equalIndex := strings.Index(currentLine, "="); equalIndex >= 0 { + if key := strings.TrimSpace(currentLine[:equalIndex]); len(key) > 0 { + value := "" + if len(currentLine) > equalIndex { + value = strings.TrimSpace(currentLine[equalIndex+1:]) + } + config[key] = value + } + } + } + + if err := scanner.Err(); err != nil { + log.Fatal(err) + return nil, err + } + + return config, nil +} + +// CreateHTTPClient used to create the client for sending post requests to OMSEndpoint +func CreateHTTPClient() { + + cert, err := tls.LoadX509KeyPair(PluginConfiguration["cert_file_path"], PluginConfiguration["key_file_path"]) + if err != nil { + Log("Error when loading cert %s", err.Error()) + } + + tlsConfig := &tls.Config{ + Certificates: []tls.Certificate{cert}, + } + + tlsConfig.BuildNameToCertificate() + transport := &http.Transport{TLSClientConfig: tlsConfig} + + HTTPClient = http.Client{Transport: transport} + + Log("Successfully created HTTP Client") +} From b02f2ec57e47c68648596ef7487bf320fa5e9331 Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Fri, 14 Sep 2018 11:24:12 -0700 Subject: [PATCH 007/160] Dilipr/glide updates (#127) * Updating glide.* files to include lumberjack --- source/code/go/src/plugins/glide.lock | 6 ++++-- source/code/go/src/plugins/glide.yaml | 2 ++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/source/code/go/src/plugins/glide.lock b/source/code/go/src/plugins/glide.lock index 79745820b..4597b594a 100644 --- a/source/code/go/src/plugins/glide.lock +++ b/source/code/go/src/plugins/glide.lock @@ -1,5 +1,5 @@ -hash: a4b073d827b5cbb4a772dada9ff3bcf55c55afc3cda83ddec1e6edcdca8e219a -updated: 2018-09-06T04:07:01.808678175Z +hash: bb32415f402ab29751f29b8e394bc974cbc31861453d817aaeb94ef83dacc488 
+updated: 2018-09-14T18:14:28.748047598Z imports: - name: github.com/fluent/fluent-bit-go version: c4a158a6e3a793166c6ecfa2d5c80d71eada8959 @@ -83,6 +83,8 @@ imports: - rate - name: gopkg.in/inf.v0 version: 3887ee99ecf07df5b447e9b00d9c0b2adaa9f3e4 +- name: gopkg.in/natefinch/lumberjack.v2 + version: a96e63847dc3c67d17befa69c303767e2f84e54f - name: gopkg.in/yaml.v2 version: 670d4cfef0544295bc27a114dbac37980d83185a - name: k8s.io/api diff --git a/source/code/go/src/plugins/glide.yaml b/source/code/go/src/plugins/glide.yaml index b986ece21..403e1efc4 100644 --- a/source/code/go/src/plugins/glide.yaml +++ b/source/code/go/src/plugins/glide.yaml @@ -5,6 +5,8 @@ import: - output - package: github.com/mitchellh/mapstructure version: ^1.0.0 +- package: gopkg.in/natefinch/lumberjack.v2 + version: ^2.1.0 - package: k8s.io/apimachinery subpackages: - pkg/apis/meta/v1 From e01c67845cd5d99f77b8dafd3e579d933984c3af Mon Sep 17 00:00:00 2001 From: Vishwanath Narasimhan Date: Mon, 17 Sep 2018 15:42:01 -0700 Subject: [PATCH 008/160] containerID="" for pull issues --- source/code/plugin/in_kube_podinventory.rb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/source/code/plugin/in_kube_podinventory.rb b/source/code/plugin/in_kube_podinventory.rb index f478705f6..2cd1e1bc3 100644 --- a/source/code/plugin/in_kube_podinventory.rb +++ b/source/code/plugin/in_kube_podinventory.rb @@ -143,7 +143,8 @@ def parse_and_emit_records(podInventory, serviceList) if !container['containerID'].nil? record['ContainerID'] = container['containerID'].split("//")[1] else - record['ContainerID'] = "00000000-0000-0000-0000-000000000000" + # for containers that have image issues (like invalid image/tag etc..) this will be empty. 
do not make it all 0 + record['ContainerID'] = "" end #keeping this as which is same as InstanceName in perf table record['ContainerName'] = podUid + "/" +container['name'] From b0ba22deaf43c29058d61f0dd76c2c64c34f5ac4 Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Tue, 18 Sep 2018 16:59:46 -0700 Subject: [PATCH 009/160] Using KubeAPI for getting image,name. Adding more logs (#129) * Using KubeAPI for getting image,name. Adding more logs * Moving log file and state file to within the omsagent container * Changing log and state paths --- installer/conf/td-agent-bit.conf | 4 +- source/code/go/src/plugins/oms.go | 105 +++++++++++++------------- source/code/go/src/plugins/out_oms.go | 2 +- 3 files changed, 54 insertions(+), 57 deletions(-) diff --git a/installer/conf/td-agent-bit.conf b/installer/conf/td-agent-bit.conf index cf490c077..84a9fcf94 100644 --- a/installer/conf/td-agent-bit.conf +++ b/installer/conf/td-agent-bit.conf @@ -2,13 +2,13 @@ Flush 5 Log_Level info Parsers_File /etc/td-agent-bit/parsers.conf - Log_File /var/log/fluent-bit.log + Log_File /var/opt/microsoft/docker-cimprov/log/fluent-bit.log [INPUT] Name tail Tag oms.container.log.* Path /var/log/containers/*.log - DB /var/log/fblogs.db + DB /var/opt/microsoft/docker-cimprov/state/fblogs.db Parser docker Mem_Buf_Limit 30m Path_Key filepath diff --git a/source/code/go/src/plugins/oms.go b/source/code/go/src/plugins/oms.go index 49472c74b..c18135dcc 100644 --- a/source/code/go/src/plugins/oms.go +++ b/source/code/go/src/plugins/oms.go @@ -26,7 +26,6 @@ const DataType = "CONTAINER_LOG_BLOB" // IPName for Container Log const IPName = "Containers" -const containerInventoryPath = "/var/opt/microsoft/docker-cimprov/state/ContainerInventory" const defaultContainerInventoryRefreshInterval = 60 const defaultKubeSystemContainersRefreshInterval = 300 @@ -51,6 +50,9 @@ var ( // DataUpdateMutex read and write mutex access to the container id set DataUpdateMutex = &sync.Mutex{} + + // ClientSet for querying 
KubeAPIs + ClientSet *kubernetes.Clientset ) var ( @@ -61,27 +63,6 @@ var ( Log = FLBLogger.Printf ) -// ContainerInventory represents the container info -type ContainerInventory struct { - ElementName string `json:"ElementName"` - CreatedTime string `json:"CreatedTime"` - State string `json:"State"` - ExitCode int `json:"ExitCode"` - StartedTime string `json:"StartedTime"` - FinishedTime string `json:"FinishedTime"` - ImageID string `json:"ImageId"` - Image string `json:"Image"` - Repository string `json:"Repository"` - ImageTag string `json:"ImageTag"` - ComposeGroup string `json:"ComposeGroup"` - ContainerHostname string `json:"ContainerHostname"` - Computer string `json:"Computer"` - Command string `json:"Command"` - EnvironmentVar string `json:"EnvironmentVar"` - Ports string `json:"Ports"` - Links string `json:"Links"` -} - // DataItem represents the object corresponding to the json that is sent by fluentbit tail plugin type DataItem struct { LogEntry string `json:"LogEntry"` @@ -108,29 +89,25 @@ func populateMaps() { _imageIDMap := make(map[string]string) _nameIDMap := make(map[string]string) - files, err := ioutil.ReadDir(containerInventoryPath) + pods, err := ClientSet.CoreV1().Pods("").List(metav1.ListOptions{}) if err != nil { - Log("error when reading container inventory %s\n", err.Error()) + Log("Error getting pods %s\n", err.Error()) } - for _, file := range files { - fullPath := fmt.Sprintf("%s/%s", containerInventoryPath, file.Name()) - fileContent, err := ioutil.ReadFile(fullPath) - if err != nil { - Log("Error reading file content %s", fullPath) - Log(err.Error()) - } - var containerInventory ContainerInventory - unmarshallErr := json.Unmarshal(fileContent, &containerInventory) - - if unmarshallErr != nil { - Log("Unmarshall error when reading file %s %s \n", fullPath, unmarshallErr.Error()) + for _, pod := range pods.Items { + for _, status := range pod.Status.ContainerStatuses { + lastSlashIndex := strings.LastIndex(status.ContainerID, "/") + 
containerID := status.ContainerID[lastSlashIndex+1 : len(status.ContainerID)] + image := status.Image + name := fmt.Sprintf("%s/%s", pod.UID, status.Name) + if containerID != "" { + _imageIDMap[containerID] = image + _nameIDMap[containerID] = name + } } - - _imageIDMap[file.Name()] = containerInventory.Image - _nameIDMap[file.Name()] = containerInventory.ElementName } + Log("Locking to update image and name maps") DataUpdateMutex.Lock() ImageIDMap = _imageIDMap @@ -164,7 +141,7 @@ func createLogger() *log.Logger { logger.SetOutput(&lumberjack.Logger{ Filename: path, MaxSize: 10, //megabytes - MaxBackups: 3, + MaxBackups: 1, MaxAge: 28, //days Compress: true, // false by default }) @@ -222,17 +199,8 @@ func updateKubeSystemContainerIDs() { } Log("Kube System Log Collection is DISABLED. Collecting containerIds to drop their records") - config, err := rest.InClusterConfig() - if err != nil { - Log("Error getting config %s\n", err.Error()) - } - clientset, err := kubernetes.NewForConfig(config) - if err != nil { - Log("Error getting clientset %s", err.Error()) - } - - pods, err := clientset.CoreV1().Pods("kube-system").List(metav1.ListOptions{}) + pods, err := ClientSet.CoreV1().Pods("kube-system").List(metav1.ListOptions{}) if err != nil { Log("Error getting pods %s\n", err.Error()) } @@ -278,8 +246,27 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { } stringMap["Id"] = containerID - stringMap["Image"] = ImageIDMap[containerID] - stringMap["Name"] = NameIDMap[containerID] + + if val, ok := ImageIDMap[containerID]; ok { + stringMap["Image"] = val + } else { + Log("ContainerId %s not present in Map ", containerID) + Log("CurrentMap Snapshot \n") + for k, v := range ImageIDMap { + Log("%s ==> %s", k, v) + } + } + + if val, ok := NameIDMap[containerID]; ok { + stringMap["Name"] = val + } else { + Log("ContainerId %s not present in Map ", containerID) + Log("CurrentMap Snapshot \n") + for k, v := range NameIDMap { + Log("%s ==> %s", k, v) + } + 
} + stringMap["Computer"] = Computer mapstructure.Decode(stringMap, &dataItem) dataItems = append(dataItems, dataItem) @@ -334,8 +321,8 @@ func getContainerIDFromFilePath(filepath string) string { return filepath[start+1 : end] } -// ReadConfig reads and populates plugin configuration -func ReadConfig(pluginConfPath string) map[string]string { +// InitializeConfig reads and populates plugin configuration +func InitializeConfig(pluginConfPath string) map[string]string { pluginConf, err := ReadConfiguration(pluginConfPath) omsadminConf, err := ReadConfiguration(pluginConf["omsadmin_conf_path"]) @@ -355,5 +342,15 @@ func ReadConfig(pluginConfPath string) map[string]string { OMSEndpoint = omsadminConf["OMS_ENDPOINT"] Log("OMSEndpoint %s", OMSEndpoint) + config, err := rest.InClusterConfig() + if err != nil { + Log("Error getting config %s\n", err.Error()) + } + + ClientSet, err = kubernetes.NewForConfig(config) + if err != nil { + Log("Error getting clientset %s", err.Error()) + } + return pluginConf } diff --git a/source/code/go/src/plugins/out_oms.go b/source/code/go/src/plugins/out_oms.go index dad0ede81..8c23f47a8 100644 --- a/source/code/go/src/plugins/out_oms.go +++ b/source/code/go/src/plugins/out_oms.go @@ -18,7 +18,7 @@ func FLBPluginRegister(ctx unsafe.Pointer) int { // ctx (context) pointer to fluentbit context (state/ c code) func FLBPluginInit(ctx unsafe.Pointer) int { Log("Initializing out_oms go plugin for fluentbit") - PluginConfiguration = ReadConfig("/etc/opt/microsoft/docker-cimprov/out_oms.conf") + PluginConfiguration = InitializeConfig("/etc/opt/microsoft/docker-cimprov/out_oms.conf") CreateHTTPClient() updateContainersData() return output.FLB_OK From 97834199721172ba0a67828b19a6f26de1a4b0a0 Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Thu, 27 Sep 2018 14:35:29 -0700 Subject: [PATCH 010/160] Dilipr/mark comments (#130) * Marks Comments + Error Handling * Drop records from files that are not in k8s format * Remove unnecessary log line' * 
Adding Log to the file that doesn't conform to the expected format --- source/code/go/src/plugins/oms.go | 227 ++++++++++++++------------ source/code/go/src/plugins/out_oms.go | 6 +- source/code/go/src/plugins/utils.go | 1 + 3 files changed, 123 insertions(+), 111 deletions(-) diff --git a/source/code/go/src/plugins/oms.go b/source/code/go/src/plugins/oms.go index c18135dcc..2e9e2f3d0 100644 --- a/source/code/go/src/plugins/oms.go +++ b/source/code/go/src/plugins/oms.go @@ -12,7 +12,8 @@ import ( "strings" "sync" "time" - +) +import ( "github.com/fluent/fluent-bit-go/output" "github.com/mitchellh/mapstructure" lumberjack "gopkg.in/natefinch/lumberjack.v2" @@ -24,6 +25,9 @@ import ( // DataType for Container Log const DataType = "CONTAINER_LOG_BLOB" +// ContainerLogPluginConfFilePath --> config file path for container log plugin +const ContainerLogPluginConfFilePath = "/etc/opt/microsoft/docker-cimprov/out_oms.conf" + // IPName for Container Log const IPName = "Containers" const defaultContainerInventoryRefreshInterval = 60 @@ -47,18 +51,22 @@ var ( NameIDMap map[string]string // IgnoreIDSet set of container Ids of kube-system pods IgnoreIDSet map[string]bool - // DataUpdateMutex read and write mutex access to the container id set DataUpdateMutex = &sync.Mutex{} - // ClientSet for querying KubeAPIs ClientSet *kubernetes.Clientset ) +var ( + // KubeSystemContainersRefreshTicker updates the kube-system containers + KubeSystemContainersRefreshTicker = time.NewTicker(time.Second * 300) + // ContainerImageNameRefreshTicker updates the container image and names periodically + ContainerImageNameRefreshTicker = time.NewTicker(time.Second * 60) +) + var ( // FLBLogger stream FLBLogger = createLogger() - // Log wrapper function Log = FLBLogger.Printf ) @@ -83,41 +91,7 @@ type ContainerLogBlob struct { DataItems []DataItem `json:"DataItems"` } -func populateMaps() { - - Log("Updating ImageIDMap and NameIDMap") - - _imageIDMap := make(map[string]string) - _nameIDMap := 
make(map[string]string) - - pods, err := ClientSet.CoreV1().Pods("").List(metav1.ListOptions{}) - if err != nil { - Log("Error getting pods %s\n", err.Error()) - } - - for _, pod := range pods.Items { - for _, status := range pod.Status.ContainerStatuses { - lastSlashIndex := strings.LastIndex(status.ContainerID, "/") - containerID := status.ContainerID[lastSlashIndex+1 : len(status.ContainerID)] - image := status.Image - name := fmt.Sprintf("%s/%s", pod.UID, status.Name) - if containerID != "" { - _imageIDMap[containerID] = image - _nameIDMap[containerID] = name - } - } - } - - Log("Locking to update image and name maps") - DataUpdateMutex.Lock() - ImageIDMap = _imageIDMap - NameIDMap = _nameIDMap - DataUpdateMutex.Unlock() - Log("Unlocking after updating image and name maps") -} - func createLogger() *log.Logger { - var logfile *os.File path := "/var/opt/microsoft/docker-cimprov/log/fluent-bit-out-oms-runtime.log" if _, err := os.Stat(path); err == nil { @@ -150,88 +124,85 @@ func createLogger() *log.Logger { return logger } -func updateContainersData() { +func updateContainerImageNameMaps() { + for ; true; <-ContainerImageNameRefreshTicker.C { + Log("Updating ImageIDMap and NameIDMap") - containerInventoryRefreshInterval, err := strconv.Atoi(PluginConfiguration["container_inventory_refresh_interval"]) - if err != nil { - Log("Error Reading Container Inventory Refresh Interval %s", err.Error()) - containerInventoryRefreshInterval = defaultContainerInventoryRefreshInterval - } - Log("containerInventoryRefreshInterval = %d \n", containerInventoryRefreshInterval) - go initMaps(containerInventoryRefreshInterval) + _imageIDMap := make(map[string]string) + _nameIDMap := make(map[string]string) - kubeSystemContainersRefreshInterval, err := strconv.Atoi(PluginConfiguration["kube_system_containers_refresh_interval"]) - if err != nil { - Log("Error Reading Kube System Container Ids Refresh Interval %s", err.Error()) - kubeSystemContainersRefreshInterval = 
defaultKubeSystemContainersRefreshInterval - } - Log("kubeSystemContainersRefreshInterval = %d \n", kubeSystemContainersRefreshInterval) - - go updateIgnoreContainerIds(kubeSystemContainersRefreshInterval) -} - -func initMaps(refreshInterval int) { - ImageIDMap = make(map[string]string) - NameIDMap = make(map[string]string) - - populateMaps() - - for range time.Tick(time.Second * time.Duration(refreshInterval)) { - populateMaps() - } -} - -func updateIgnoreContainerIds(refreshInterval int) { - IgnoreIDSet = make(map[string]bool) + pods, err := ClientSet.CoreV1().Pods("").List(metav1.ListOptions{}) + if err != nil { + Log("Error getting pods %s\nIt is ok to log here and continue, because the logs will be missing image and Name, but the logs will still have the containerID", err.Error()) + } - updateKubeSystemContainerIDs() + for _, pod := range pods.Items { + for _, status := range pod.Status.ContainerStatuses { + lastSlashIndex := strings.LastIndex(status.ContainerID, "/") + containerID := status.ContainerID[lastSlashIndex+1 : len(status.ContainerID)] + image := status.Image + name := fmt.Sprintf("%s/%s", pod.UID, status.Name) + if containerID != "" { + _imageIDMap[containerID] = image + _nameIDMap[containerID] = name + } + } + } - for range time.Tick(time.Second * time.Duration(refreshInterval)) { - updateKubeSystemContainerIDs() + Log("Locking to update image and name maps") + DataUpdateMutex.Lock() + ImageIDMap = _imageIDMap + NameIDMap = _nameIDMap + DataUpdateMutex.Unlock() + Log("Unlocking after updating image and name maps") } } func updateKubeSystemContainerIDs() { + for ; true; <-KubeSystemContainersRefreshTicker.C { + if strings.Compare(os.Getenv("DISABLE_KUBE_SYSTEM_LOG_COLLECTION"), "true") != 0 { + Log("Kube System Log Collection is ENABLED.") + return + } - if strings.Compare(os.Getenv("DISABLE_KUBE_SYSTEM_LOG_COLLECTION"), "true") != 0 { - Log("Kube System Log Collection is ENABLED.") - return - } - - Log("Kube System Log Collection is DISABLED. 
Collecting containerIds to drop their records") + Log("Kube System Log Collection is DISABLED. Collecting containerIds to drop their records") - pods, err := ClientSet.CoreV1().Pods("kube-system").List(metav1.ListOptions{}) - if err != nil { - Log("Error getting pods %s\n", err.Error()) - } + pods, err := ClientSet.CoreV1().Pods("kube-system").List(metav1.ListOptions{}) + if err != nil { + Log("Error getting pods %s\nIt is ok to log here and continue. Kube-system logs will be collected", err.Error()) + } - _ignoreIDSet := make(map[string]bool) - for _, pod := range pods.Items { - for _, status := range pod.Status.ContainerStatuses { - lastSlashIndex := strings.LastIndex(status.ContainerID, "/") - _ignoreIDSet[status.ContainerID[lastSlashIndex+1:len(status.ContainerID)]] = true + _ignoreIDSet := make(map[string]bool) + for _, pod := range pods.Items { + for _, status := range pod.Status.ContainerStatuses { + lastSlashIndex := strings.LastIndex(status.ContainerID, "/") + _ignoreIDSet[status.ContainerID[lastSlashIndex+1:len(status.ContainerID)]] = true + } } - } - Log("Locking to update kube-system container IDs") - DataUpdateMutex.Lock() - IgnoreIDSet = _ignoreIDSet - DataUpdateMutex.Unlock() - Log("Unlocking after updating kube-system container IDs") + Log("Locking to update kube-system container IDs") + DataUpdateMutex.Lock() + IgnoreIDSet = _ignoreIDSet + DataUpdateMutex.Unlock() + Log("Unlocking after updating kube-system container IDs") + } } // PostDataHelper sends data to the OMS endpoint func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { + defer DataUpdateMutex.Unlock() + start := time.Now() var dataItems []DataItem DataUpdateMutex.Lock() for _, record := range tailPluginRecords { - containerID := getContainerIDFromFilePath(toString(record["Filepath"])) + filepath := toString(record["Filepath"]) + containerID := getContainerIDFromFilePath(filepath) - if containsKey(IgnoreIDSet, containerID) { + if containerID == "" || 
containsKey(IgnoreIDSet, containerID) { continue } @@ -271,7 +242,6 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { mapstructure.Decode(stringMap, &dataItem) dataItems = append(dataItems, dataItem) } - DataUpdateMutex.Unlock() if len(dataItems) > 0 { logEntry := ContainerLogBlob{ @@ -318,39 +288,80 @@ func toString(s interface{}) string { func getContainerIDFromFilePath(filepath string) string { start := strings.LastIndex(filepath, "-") end := strings.LastIndex(filepath, ".") + if start >= end || start == -1 || end == -1 { + // This means the file is not a managed Kubernetes docker log file. + // Drop all records from the file + Log("File %s is not a Kubernetes managed docker log file. Dropping all records from the file", filepath) + return "" + } return filepath[start+1 : end] } -// InitializeConfig reads and populates plugin configuration -func InitializeConfig(pluginConfPath string) map[string]string { +// InitializePlugin reads and populates plugin configuration +func InitializePlugin(pluginConfPath string) { + + IgnoreIDSet = make(map[string]bool) + ImageIDMap = make(map[string]string) + NameIDMap = make(map[string]string) - pluginConf, err := ReadConfiguration(pluginConfPath) - omsadminConf, err := ReadConfiguration(pluginConf["omsadmin_conf_path"]) + pluginConfig, err := ReadConfiguration(pluginConfPath) + if err != nil { + Log("Error Reading plugin config path : %s \n", err.Error()) + log.Fatalf("Error Reading plugin config path : %s \n", err.Error()) + } + omsadminConf, err := ReadConfiguration(pluginConfig["omsadmin_conf_path"]) if err != nil { Log(err.Error()) + log.Fatalf("Error Reading omsadmin configuration %s\n", err.Error()) } + OMSEndpoint = omsadminConf["OMS_ENDPOINT"] + Log("OMSEndpoint %s", OMSEndpoint) - containerHostName, err := ioutil.ReadFile(pluginConf["container_host_file_path"]) + // Initialize image,name map refresh ticker + containerInventoryRefreshInterval, err := 
strconv.Atoi(pluginConfig["container_inventory_refresh_interval"]) if err != nil { - Log("Error when reading containerHostName file %s", err.Error()) + Log("Error Reading Container Inventory Refresh Interval %s", err.Error()) + Log("Using Default Refresh Interval of %d s\n", defaultContainerInventoryRefreshInterval) + containerInventoryRefreshInterval = defaultContainerInventoryRefreshInterval } + Log("containerInventoryRefreshInterval = %d \n", containerInventoryRefreshInterval) + ContainerImageNameRefreshTicker = time.NewTicker(time.Second * time.Duration(containerInventoryRefreshInterval)) + // Initialize Kube System Refresh Ticker + kubeSystemContainersRefreshInterval, err := strconv.Atoi(pluginConfig["kube_system_containers_refresh_interval"]) + if err != nil { + Log("Error Reading Kube System Container Ids Refresh Interval %s", err.Error()) + Log("Using Default Refresh Interval of %d s\n", defaultKubeSystemContainersRefreshInterval) + kubeSystemContainersRefreshInterval = defaultKubeSystemContainersRefreshInterval + } + Log("kubeSystemContainersRefreshInterval = %d \n", kubeSystemContainersRefreshInterval) + KubeSystemContainersRefreshTicker = time.NewTicker(time.Second * time.Duration(kubeSystemContainersRefreshInterval)) + + // Populate Computer field + containerHostName, err := ioutil.ReadFile(pluginConfig["container_host_file_path"]) + if err != nil { + // It is ok to log here and continue, because only the Computer column will be missing, + // which can be deduced from a combination of containerId, and docker logs on the node + Log("Error when reading containerHostName file %s.\n It is ok to log here and continue, because only the Computer column will be missing, which can be deduced from a combination of containerId, and docker logs on the nodes\n", err.Error()) + } Computer = strings.TrimSuffix(toString(containerHostName), "\n") Log("Computer == %s \n", Computer) - OMSEndpoint = omsadminConf["OMS_ENDPOINT"] - Log("OMSEndpoint %s", OMSEndpoint) - + // 
Initialize KubeAPI Client config, err := rest.InClusterConfig() if err != nil { - Log("Error getting config %s\n", err.Error()) + Log("Error getting config %s.\nIt is ok to log here and continue, because the logs will be missing image and Name, but the logs will still have the containerID", err.Error()) } ClientSet, err = kubernetes.NewForConfig(config) if err != nil { - Log("Error getting clientset %s", err.Error()) + Log("Error getting clientset %s.\nIt is ok to log here and continue, because the logs will be missing image and Name, but the logs will still have the containerID", err.Error()) } - return pluginConf + PluginConfiguration = pluginConfig + + CreateHTTPClient() + go updateKubeSystemContainerIDs() + go updateContainerImageNameMaps() } diff --git a/source/code/go/src/plugins/out_oms.go b/source/code/go/src/plugins/out_oms.go index 8c23f47a8..ec9a573d1 100644 --- a/source/code/go/src/plugins/out_oms.go +++ b/source/code/go/src/plugins/out_oms.go @@ -18,9 +18,7 @@ func FLBPluginRegister(ctx unsafe.Pointer) int { // ctx (context) pointer to fluentbit context (state/ c code) func FLBPluginInit(ctx unsafe.Pointer) int { Log("Initializing out_oms go plugin for fluentbit") - PluginConfiguration = InitializeConfig("/etc/opt/microsoft/docker-cimprov/out_oms.conf") - CreateHTTPClient() - updateContainersData() + InitializePlugin(ContainerLogPluginConfFilePath) return output.FLB_OK } @@ -50,6 +48,8 @@ func FLBPluginFlush(data unsafe.Pointer, length C.int, tag *C.char) int { // FLBPluginExit exits the plugin func FLBPluginExit() int { + KubeSystemContainersRefreshTicker.Stop() + ContainerImageNameRefreshTicker.Stop() return output.FLB_OK } diff --git a/source/code/go/src/plugins/utils.go b/source/code/go/src/plugins/utils.go index 0e33f43f9..1ac9b05a9 100644 --- a/source/code/go/src/plugins/utils.go +++ b/source/code/go/src/plugins/utils.go @@ -52,6 +52,7 @@ func CreateHTTPClient() { cert, err := tls.LoadX509KeyPair(PluginConfiguration["cert_file_path"], 
PluginConfiguration["key_file_path"]) if err != nil { Log("Error when loading cert %s", err.Error()) + log.Fatalf("Error when loading cert %s", err.Error()) } tlsConfig := &tls.Config{ From 8e35b7365bab9de6d087718887d5021167617a0d Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Thu, 27 Sep 2018 15:52:13 -0700 Subject: [PATCH 011/160] Rashmi/segfault latest (#132) * adding null checks in all providers * fixing type * fixing type * adding more null checks * update cjson --- source/code/cjson/cJSON.c | 3478 +++++++++++++---- source/code/cjson/cJSON.h | 398 +- ...iner_ContainerInventory_Class_Provider.cpp | 34 +- ...ner_ContainerStatistics_Class_Provider.cpp | 39 +- .../Container_DaemonEvent_Class_Provider.cpp | 6 +- ...ontainer_ImageInventory_Class_Provider.cpp | 19 +- .../Container_Process_Class_Provider.cpp | 2 +- 7 files changed, 3146 insertions(+), 830 deletions(-) diff --git a/source/code/cjson/cJSON.c b/source/code/cjson/cJSON.c index 77dbfe959..c561c7ceb 100755 --- a/source/code/cjson/cJSON.c +++ b/source/code/cjson/cJSON.c @@ -1,770 +1,2930 @@ /* - Copyright (c) 2009 Dave Gamble - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. +Copyright (c) 2009-2017 Dave Gamble and cJSON contributors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. */ /* cJSON */ /* JSON parser in C. 
*/ +/* disable warnings about old C89 functions in MSVC */ +#if !defined(_CRT_SECURE_NO_DEPRECATE) && defined(_MSC_VER) +#define _CRT_SECURE_NO_DEPRECATE +#endif + +#ifdef __GNUC__ +#pragma GCC visibility push(default) +#endif +#if defined(_MSC_VER) +#pragma warning (push) +/* disable warning about single line comments in system headers */ +#pragma warning (disable : 4001) +#endif + #include #include #include #include -#include #include #include + +#ifdef ENABLE_LOCALES +#include +#endif + +#if defined(_MSC_VER) +#pragma warning (pop) +#endif +#ifdef __GNUC__ +#pragma GCC visibility pop +#endif + #include "cJSON.h" -static const char *ep; -const char *cJSON_GetErrorPtr(void) {return ep;} +/* define our own boolean type */ +#define true ((cJSON_bool)1) +#define false ((cJSON_bool)0) -static int cJSON_strcasecmp(const char *s1,const char *s2) +typedef struct { + const unsigned char *json; + size_t position; +} error; +static error global_error = { NULL, 0 }; + +CJSON_PUBLIC(const char *) cJSON_GetErrorPtr(void) { - if (!s1) return (s1==s2)?0:1;if (!s2) return 1; - for(; tolower(*s1) == tolower(*s2); ++s1, ++s2) if(*s1 == 0) return 0; - return tolower(*(const unsigned char *)s1) - tolower(*(const unsigned char *)s2); + return (const char*)(global_error.json + global_error.position); } -static void *(*cJSON_malloc)(size_t sz) = malloc; -static void (*cJSON_free)(void *ptr) = free; +CJSON_PUBLIC(char *) cJSON_GetStringValue(cJSON *item) { + if (!cJSON_IsString(item)) { + return NULL; + } + + return item->valuestring; +} -static char* cJSON_strdup(const char* str) +/* This is a safeguard to prevent copy-pasters from using incompatible C and header files */ +#if (CJSON_VERSION_MAJOR != 1) || (CJSON_VERSION_MINOR != 7) || (CJSON_VERSION_PATCH != 8) +#error cJSON.h and cJSON.c have different versions. Make sure that both have the same. 
+#endif + +CJSON_PUBLIC(const char*) cJSON_Version(void) { - size_t len; - char* copy; + static char version[15]; + sprintf(version, "%i.%i.%i", CJSON_VERSION_MAJOR, CJSON_VERSION_MINOR, CJSON_VERSION_PATCH); - len = strlen(str) + 1; - if (!(copy = (char*)cJSON_malloc(len))) return 0; - memcpy(copy,str,len); - return copy; + return version; } -void cJSON_InitHooks(cJSON_Hooks* hooks) +/* Case insensitive string comparison, doesn't consider two NULL pointers equal though */ +static int case_insensitive_strcmp(const unsigned char *string1, const unsigned char *string2) { - if (!hooks) { /* Reset hooks */ - cJSON_malloc = malloc; - cJSON_free = free; - return; - } + if ((string1 == NULL) || (string2 == NULL)) + { + return 1; + } + + if (string1 == string2) + { + return 0; + } + + for (; tolower(*string1) == tolower(*string2); (void)string1++, string2++) + { + if (*string1 == '\0') + { + return 0; + } + } + + return tolower(*string1) - tolower(*string2); +} - cJSON_malloc = (hooks->malloc_fn)?hooks->malloc_fn:malloc; - cJSON_free = (hooks->free_fn)?hooks->free_fn:free; +typedef struct internal_hooks +{ + void *(CJSON_CDECL *allocate)(size_t size); + void (CJSON_CDECL *deallocate)(void *pointer); + void *(CJSON_CDECL *reallocate)(void *pointer, size_t size); +} internal_hooks; + +#if defined(_MSC_VER) +/* work around MSVC error C2322: '...' address of dillimport '...' is not static */ +static void * CJSON_CDECL internal_malloc(size_t size) +{ + return malloc(size); +} +static void CJSON_CDECL internal_free(void *pointer) +{ + free(pointer); +} +static void * CJSON_CDECL internal_realloc(void *pointer, size_t size) +{ + return realloc(pointer, size); } +#else +#define internal_malloc malloc +#define internal_free free +#define internal_realloc realloc +#endif -/* Internal constructor. 
*/ -static cJSON *cJSON_New_Item(void) +static internal_hooks global_hooks = { internal_malloc, internal_free, internal_realloc }; + +static unsigned char* cJSON_strdup(const unsigned char* string, const internal_hooks * const hooks) { - cJSON* node = (cJSON*)cJSON_malloc(sizeof(cJSON)); - if (node) memset(node,0,sizeof(cJSON)); - return node; + size_t length = 0; + unsigned char *copy = NULL; + + if (string == NULL) + { + return NULL; + } + + length = strlen((const char*)string) + sizeof(""); + copy = (unsigned char*)hooks->allocate(length); + if (copy == NULL) + { + return NULL; + } + memcpy(copy, string, length); + + return copy; } +CJSON_PUBLIC(void) cJSON_InitHooks(cJSON_Hooks* hooks) +{ + if (hooks == NULL) + { + /* Reset hooks */ + global_hooks.allocate = malloc; + global_hooks.deallocate = free; + global_hooks.reallocate = realloc; + return; + } + + global_hooks.allocate = malloc; + if (hooks->malloc_fn != NULL) + { + global_hooks.allocate = hooks->malloc_fn; + } + + global_hooks.deallocate = free; + if (hooks->free_fn != NULL) + { + global_hooks.deallocate = hooks->free_fn; + } + + /* use realloc only if both free and malloc are used */ + global_hooks.reallocate = NULL; + if ((global_hooks.allocate == malloc) && (global_hooks.deallocate == free)) + { + global_hooks.reallocate = realloc; + } +} + +/* Internal constructor. */ +static cJSON *cJSON_New_Item(const internal_hooks * const hooks) +{ + cJSON* node = (cJSON*)hooks->allocate(sizeof(cJSON)); + if (node) + { + memset(node, '\0', sizeof(cJSON)); + } + + return node; +} /* Delete a cJSON structure. 
*/ -void cJSON_Delete(cJSON *c) +CJSON_PUBLIC(void) cJSON_Delete(cJSON *item) { - cJSON *next; - while (c) - { - next=c->next; - if (!(c->type&cJSON_IsReference) && c->child) cJSON_Delete(c->child); - if (!(c->type&cJSON_IsReference) && c->valuestring) cJSON_free(c->valuestring); - if (!(c->type&cJSON_StringIsConst) && c->string) cJSON_free(c->string); - cJSON_free(c); - c=next; - } + cJSON *next = NULL; + while (item != NULL) + { + next = item->next; + if (!(item->type & cJSON_IsReference) && (item->child != NULL)) + { + cJSON_Delete(item->child); + } + if (!(item->type & cJSON_IsReference) && (item->valuestring != NULL)) + { + global_hooks.deallocate(item->valuestring); + } + if (!(item->type & cJSON_StringIsConst) && (item->string != NULL)) + { + global_hooks.deallocate(item->string); + } + global_hooks.deallocate(item); + item = next; + } } -/* Parse the input text to generate a number, and populate the result into item. */ -static const char *parse_number(cJSON *item,const char *num) +/* get the decimal point character of the current locale */ +static unsigned char get_decimal_point(void) { - double n=0,sign=1,scale=0;int subscale=0,signsubscale=1; +#ifdef ENABLE_LOCALES + struct lconv *lconv = localeconv(); + return (unsigned char)lconv->decimal_point[0]; +#else + return '.'; +#endif +} - if (*num=='-') sign=-1,num++; /* Has sign? */ - if (*num=='0') num++; /* is zero */ - if (*num>='1' && *num<='9') do n=(n*10.0)+(*num++ -'0'); while (*num>='0' && *num<='9'); /* Number? */ - if (*num=='.' && num[1]>='0' && num[1]<='9') {num++; do n=(n*10.0)+(*num++ -'0'),scale--; while (*num>='0' && *num<='9');} /* Fractional part? */ - if (*num=='e' || *num=='E') /* Exponent? */ - { num++;if (*num=='+') num++; else if (*num=='-') signsubscale=-1,num++; /* With sign? */ - while (*num>='0' && *num<='9') subscale=(subscale*10)+(*num++ - '0'); /* Number? 
*/ - } +typedef struct +{ + const unsigned char *content; + size_t length; + size_t offset; + size_t depth; /* How deeply nested (in arrays/objects) is the input at the current offset. */ + internal_hooks hooks; +} parse_buffer; + +/* check if the given size is left to read in a given parse buffer (starting with 1) */ +#define can_read(buffer, size) ((buffer != NULL) && (((buffer)->offset + size) <= (buffer)->length)) +/* check if the buffer can be accessed at the given index (starting with 0) */ +#define can_access_at_index(buffer, index) ((buffer != NULL) && (((buffer)->offset + index) < (buffer)->length)) +#define cannot_access_at_index(buffer, index) (!can_access_at_index(buffer, index)) +/* get a pointer to the buffer at the position */ +#define buffer_at_offset(buffer) ((buffer)->content + (buffer)->offset) - n=sign*n*pow(10.0,(scale+subscale*signsubscale)); /* number = +/- number.fraction * 10^+/- exponent */ - - item->valuedouble=n; - item->valueint=(int)n; - item->type=cJSON_Number; - return num; +/* Parse the input text to generate a number, and populate the result into item. */ +static cJSON_bool parse_number(cJSON * const item, parse_buffer * const input_buffer) +{ + double number = 0; + unsigned char *after_end = NULL; + unsigned char number_c_string[64]; + unsigned char decimal_point = get_decimal_point(); + size_t i = 0; + + if ((input_buffer == NULL) || (input_buffer->content == NULL)) + { + return false; + } + + /* copy the number into a temporary buffer and replace '.' 
with the decimal point + * of the current locale (for strtod) + * This also takes care of '\0' not necessarily being available for marking the end of the input */ + for (i = 0; (i < (sizeof(number_c_string) - 1)) && can_access_at_index(input_buffer, i); i++) + { + switch (buffer_at_offset(input_buffer)[i]) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + case '+': + case '-': + case 'e': + case 'E': + number_c_string[i] = buffer_at_offset(input_buffer)[i]; + break; + + case '.': + number_c_string[i] = decimal_point; + break; + + default: + goto loop_end; + } + } +loop_end: + number_c_string[i] = '\0'; + + number = strtod((const char*)number_c_string, (char**)&after_end); + if (number_c_string == after_end) + { + return false; /* parse_error */ + } + + item->valuedouble = number; + + /* use saturation in case of overflow */ + if (number >= INT_MAX) + { + item->valueint = INT_MAX; + } + else if (number <= INT_MIN) + { + item->valueint = INT_MIN; + } + else + { + item->valueint = (int)number; + } + + item->type = cJSON_Number; + + input_buffer->offset += (size_t)(after_end - number_c_string); + return true; } -static int pow2gt (int x) { --x; x|=x>>1; x|=x>>2; x|=x>>4; x|=x>>8; x|=x>>16; return x+1; } +/* don't ask me, but the original cJSON_SetNumberValue returns an integer or double */ +CJSON_PUBLIC(double) cJSON_SetNumberHelper(cJSON *object, double number) +{ + if (number >= INT_MAX) + { + object->valueint = INT_MAX; + } + else if (number <= INT_MIN) + { + object->valueint = INT_MIN; + } + else + { + object->valueint = (int)number; + } + + return object->valuedouble = number; +} -typedef struct {char *buffer; int length; int offset; } printbuffer; +typedef struct +{ + unsigned char *buffer; + size_t length; + size_t offset; + size_t depth; /* current nesting depth (for formatted printing) */ + cJSON_bool noalloc; + cJSON_bool format; /* is this print a formatted print */ + internal_hooks 
hooks; +} printbuffer; + +/* realloc printbuffer if necessary to have at least "needed" bytes more */ +static unsigned char* ensure(printbuffer * const p, size_t needed) +{ + unsigned char *newbuffer = NULL; + size_t newsize = 0; + + if ((p == NULL) || (p->buffer == NULL)) + { + return NULL; + } + + if ((p->length > 0) && (p->offset >= p->length)) + { + /* make sure that offset is valid */ + return NULL; + } + + if (needed > INT_MAX) + { + /* sizes bigger than INT_MAX are currently not supported */ + return NULL; + } + + needed += p->offset + 1; + if (needed <= p->length) + { + return p->buffer + p->offset; + } + + if (p->noalloc) { + return NULL; + } + + /* calculate new buffer size */ + if (needed > (INT_MAX / 2)) + { + /* overflow of int, use INT_MAX if possible */ + if (needed <= INT_MAX) + { + newsize = INT_MAX; + } + else + { + return NULL; + } + } + else + { + newsize = needed * 2; + } + + if (p->hooks.reallocate != NULL) + { + /* reallocate with realloc if available */ + newbuffer = (unsigned char*)p->hooks.reallocate(p->buffer, newsize); + if (newbuffer == NULL) + { + p->hooks.deallocate(p->buffer); + p->length = 0; + p->buffer = NULL; + + return NULL; + } + } + else + { + /* otherwise reallocate manually */ + newbuffer = (unsigned char*)p->hooks.allocate(newsize); + if (!newbuffer) + { + p->hooks.deallocate(p->buffer); + p->length = 0; + p->buffer = NULL; + + return NULL; + } + if (newbuffer) + { + memcpy(newbuffer, p->buffer, p->offset + 1); + } + p->hooks.deallocate(p->buffer); + } + p->length = newsize; + p->buffer = newbuffer; + + return newbuffer + p->offset; +} -static char* ensure(printbuffer *p,int needed) +/* calculate the new length of the string in a printbuffer and update the offset */ +static void update_offset(printbuffer * const buffer) { - char *newbuffer;int newsize; - if (!p || !p->buffer) return 0; - needed+=p->offset; - if (needed<=p->length) return p->buffer+p->offset; + const unsigned char *buffer_pointer = NULL; + if ((buffer == 
NULL) || (buffer->buffer == NULL)) + { + return; + } + buffer_pointer = buffer->buffer + buffer->offset; + + buffer->offset += strlen((const char*)buffer_pointer); +} - newsize=pow2gt(needed); - newbuffer=(char*)cJSON_malloc(newsize); - if (!newbuffer) {cJSON_free(p->buffer);p->length=0,p->buffer=0;return 0;} - if (newbuffer) memcpy(newbuffer,p->buffer,p->length); - cJSON_free(p->buffer); - p->length=newsize; - p->buffer=newbuffer; - return newbuffer+p->offset; +/* Render the number nicely from the given item into a string. */ +static cJSON_bool print_number(const cJSON * const item, printbuffer * const output_buffer) +{ + unsigned char *output_pointer = NULL; + double d = item->valuedouble; + int length = 0; + size_t i = 0; + unsigned char number_buffer[26]; /* temporary buffer to print the number into */ + unsigned char decimal_point = get_decimal_point(); + double test; + + if (output_buffer == NULL) + { + return false; + } + + /* This checks for NaN and Infinity */ + if ((d * 0) != 0) + { + length = sprintf((char*)number_buffer, "null"); + } + else + { + /* Try 15 decimal places of precision to avoid nonsignificant nonzero digits */ + length = sprintf((char*)number_buffer, "%1.15g", d); + + /* Check whether the original double can be recovered */ + if ((sscanf((char*)number_buffer, "%lg", &test) != 1) || ((double)test != d)) + { + /* If not, print with 17 decimal places of precision */ + length = sprintf((char*)number_buffer, "%1.17g", d); + } + } + + /* sprintf failed or buffer overrun occured */ + if ((length < 0) || (length >(int)(sizeof(number_buffer) - 1))) + { + return false; + } + + /* reserve appropriate space in the output */ + output_pointer = ensure(output_buffer, (size_t)length + sizeof("")); + if (output_pointer == NULL) + { + return false; + } + + /* copy the printed number to the output and replace locale + * dependent decimal point with '.' 
*/ + for (i = 0; i < ((size_t)length); i++) + { + if (number_buffer[i] == decimal_point) + { + output_pointer[i] = '.'; + continue; + } + + output_pointer[i] = number_buffer[i]; + } + output_pointer[i] = '\0'; + + output_buffer->offset += (size_t)length; + + return true; } -static int update(printbuffer *p) +/* parse 4 digit hexadecimal number */ +static unsigned parse_hex4(const unsigned char * const input) { - char *str; - if (!p || !p->buffer) return 0; - str=p->buffer+p->offset; - return p->offset+strlen(str); + unsigned int h = 0; + size_t i = 0; + + for (i = 0; i < 4; i++) + { + /* parse digit */ + if ((input[i] >= '0') && (input[i] <= '9')) + { + h += (unsigned int)input[i] - '0'; + } + else if ((input[i] >= 'A') && (input[i] <= 'F')) + { + h += (unsigned int)10 + input[i] - 'A'; + } + else if ((input[i] >= 'a') && (input[i] <= 'f')) + { + h += (unsigned int)10 + input[i] - 'a'; + } + else /* invalid */ + { + return 0; + } + + if (i < 3) + { + /* shift left to make place for the next nibble */ + h = h << 4; + } + } + + return h; } -/* Render the number nicely from the given item into a string. */ -static char *print_number(cJSON *item,printbuffer *p) -{ - char *str=0; - double d=item->valuedouble; - if (d==0) - { - if (p) str=ensure(p,2); - else str=(char*)cJSON_malloc(2); /* special case for 0. */ - if (str) strcpy(str,"0"); - } - else if (fabs(((double)item->valueint)-d)<=DBL_EPSILON && d<=INT_MAX && d>=INT_MIN) - { - if (p) str=ensure(p,21); - else str=(char*)cJSON_malloc(21); /* 2^64+1 can be represented in 21 chars. */ - if (str) sprintf(str,"%d",item->valueint); - } - else - { - if (p) str=ensure(p,64); - else str=(char*)cJSON_malloc(64); /* This is a nice tradeoff. 
*/ - if (str) - { - if (fabs(floor(d)-d)<=DBL_EPSILON && fabs(d)<1.0e60)sprintf(str,"%.0f",d); - else if (fabs(d)<1.0e-6 || fabs(d)>1.0e9) sprintf(str,"%e",d); - else sprintf(str,"%f",d); - } - } - return str; -} - -static unsigned parse_hex4(const char *str) -{ - unsigned h=0; - if (*str>='0' && *str<='9') h+=(*str)-'0'; else if (*str>='A' && *str<='F') h+=10+(*str)-'A'; else if (*str>='a' && *str<='f') h+=10+(*str)-'a'; else return 0; - h=h<<4;str++; - if (*str>='0' && *str<='9') h+=(*str)-'0'; else if (*str>='A' && *str<='F') h+=10+(*str)-'A'; else if (*str>='a' && *str<='f') h+=10+(*str)-'a'; else return 0; - h=h<<4;str++; - if (*str>='0' && *str<='9') h+=(*str)-'0'; else if (*str>='A' && *str<='F') h+=10+(*str)-'A'; else if (*str>='a' && *str<='f') h+=10+(*str)-'a'; else return 0; - h=h<<4;str++; - if (*str>='0' && *str<='9') h+=(*str)-'0'; else if (*str>='A' && *str<='F') h+=10+(*str)-'A'; else if (*str>='a' && *str<='f') h+=10+(*str)-'a'; else return 0; - return h; -} - -/* Parse the input text into an unescaped cstring, and populate item. */ -static const unsigned char firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; -static const char *parse_string(cJSON *item,const char *str) -{ - const char *ptr=str+1;char *ptr2;char *out;int len=0;unsigned uc,uc2; - if (*str!='\"') {ep=str;return 0;} /* not a string! */ - - while (*ptr!='\"' && *ptr && ++len) if (*ptr++ == '\\') ptr++; /* Skip escaped quotes. */ - - out=(char*)cJSON_malloc(len+1); /* This is how long we need for the string, roughly. */ - if (!out) return 0; - - ptr=str+1;ptr2=out; - while (*ptr!='\"' && *ptr) - { - if (*ptr!='\\') *ptr2++=*ptr++; - else - { - ptr++; - switch (*ptr) - { - case 'b': *ptr2++='\b'; break; - case 'f': *ptr2++='\f'; break; - case 'n': *ptr2++='\n'; break; - case 'r': *ptr2++='\r'; break; - case 't': *ptr2++='\t'; break; - case 'u': /* transcode utf16 to utf8. */ - uc=parse_hex4(ptr+1);ptr+=4; /* get the unicode char. 
*/ - - if ((uc>=0xDC00 && uc<=0xDFFF) || uc==0) break; /* check for invalid. */ - - if (uc>=0xD800 && uc<=0xDBFF) /* UTF16 surrogate pairs. */ - { - if (ptr[1]!='\\' || ptr[2]!='u') break; /* missing second-half of surrogate. */ - uc2=parse_hex4(ptr+3);ptr+=6; - if (uc2<0xDC00 || uc2>0xDFFF) break; /* invalid second-half of surrogate. */ - uc=0x10000 + (((uc&0x3FF)<<10) | (uc2&0x3FF)); - } - - len=4;if (uc<0x80) len=1;else if (uc<0x800) len=2;else if (uc<0x10000) len=3; ptr2+=len; - - switch (len) { - case 4: *--ptr2 =((uc | 0x80) & 0xBF); uc >>= 6; - case 3: *--ptr2 =((uc | 0x80) & 0xBF); uc >>= 6; - case 2: *--ptr2 =((uc | 0x80) & 0xBF); uc >>= 6; - case 1: *--ptr2 =(uc | firstByteMark[len]); - } - ptr2+=len; - break; - default: *ptr2++=*ptr; break; - } - ptr++; - } - } - *ptr2=0; - if (*ptr=='\"') ptr++; - item->valuestring=out; - item->type=cJSON_String; - return ptr; +/* converts a UTF-16 literal to UTF-8 +* A literal can be one or two sequences of the form \uXXXX */ +static unsigned char utf16_literal_to_utf8(const unsigned char * const input_pointer, const unsigned char * const input_end, unsigned char **output_pointer) +{ + long unsigned int codepoint = 0; + unsigned int first_code = 0; + const unsigned char *first_sequence = input_pointer; + unsigned char utf8_length = 0; + unsigned char utf8_position = 0; + unsigned char sequence_length = 0; + unsigned char first_byte_mark = 0; + + if ((input_end - first_sequence) < 6) + { + /* input ends unexpectedly */ + goto fail; + } + + /* get the first utf16 sequence */ + first_code = parse_hex4(first_sequence + 2); + + /* check that the code is valid */ + if (((first_code >= 0xDC00) && (first_code <= 0xDFFF))) + { + goto fail; + } + + /* UTF16 surrogate pair */ + if ((first_code >= 0xD800) && (first_code <= 0xDBFF)) + { + const unsigned char *second_sequence = first_sequence + 6; + unsigned int second_code = 0; + sequence_length = 12; /* \uXXXX\uXXXX */ + + if ((input_end - second_sequence) < 6) + { + /* input ends 
unexpectedly */ + goto fail; + } + + if ((second_sequence[0] != '\\') || (second_sequence[1] != 'u')) + { + /* missing second half of the surrogate pair */ + goto fail; + } + + /* get the second utf16 sequence */ + second_code = parse_hex4(second_sequence + 2); + /* check that the code is valid */ + if ((second_code < 0xDC00) || (second_code > 0xDFFF)) + { + /* invalid second half of the surrogate pair */ + goto fail; + } + + + /* calculate the unicode codepoint from the surrogate pair */ + codepoint = 0x10000 + (((first_code & 0x3FF) << 10) | (second_code & 0x3FF)); + } + else + { + sequence_length = 6; /* \uXXXX */ + codepoint = first_code; + } + + /* encode as UTF-8 + * takes at maximum 4 bytes to encode: + * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ + if (codepoint < 0x80) + { + /* normal ascii, encoding 0xxxxxxx */ + utf8_length = 1; + } + else if (codepoint < 0x800) + { + /* two bytes, encoding 110xxxxx 10xxxxxx */ + utf8_length = 2; + first_byte_mark = 0xC0; /* 11000000 */ + } + else if (codepoint < 0x10000) + { + /* three bytes, encoding 1110xxxx 10xxxxxx 10xxxxxx */ + utf8_length = 3; + first_byte_mark = 0xE0; /* 11100000 */ + } + else if (codepoint <= 0x10FFFF) + { + /* four bytes, encoding 1110xxxx 10xxxxxx 10xxxxxx 10xxxxxx */ + utf8_length = 4; + first_byte_mark = 0xF0; /* 11110000 */ + } + else + { + /* invalid unicode codepoint */ + goto fail; + } + + /* encode as utf8 */ + for (utf8_position = (unsigned char)(utf8_length - 1); utf8_position > 0; utf8_position--) + { + /* 10xxxxxx */ + (*output_pointer)[utf8_position] = (unsigned char)((codepoint | 0x80) & 0xBF); + codepoint >>= 6; + } + /* encode first byte */ + if (utf8_length > 1) + { + (*output_pointer)[0] = (unsigned char)((codepoint | first_byte_mark) & 0xFF); + } + else + { + (*output_pointer)[0] = (unsigned char)(codepoint & 0x7F); + } + + *output_pointer += utf8_length; + + return sequence_length; + +fail: + return 0; +} + +/* Parse the input text into an unescaped cinput, and populate item. 
*/ +static cJSON_bool parse_string(cJSON * const item, parse_buffer * const input_buffer) +{ + const unsigned char *input_pointer = buffer_at_offset(input_buffer) + 1; + const unsigned char *input_end = buffer_at_offset(input_buffer) + 1; + unsigned char *output_pointer = NULL; + unsigned char *output = NULL; + + /* not a string */ + if (buffer_at_offset(input_buffer)[0] != '\"') + { + goto fail; + } + + { + /* calculate approximate size of the output (overestimate) */ + size_t allocation_length = 0; + size_t skipped_bytes = 0; + while (((size_t)(input_end - input_buffer->content) < input_buffer->length) && (*input_end != '\"')) + { + /* is escape sequence */ + if (input_end[0] == '\\') + { + if ((size_t)(input_end + 1 - input_buffer->content) >= input_buffer->length) + { + /* prevent buffer overflow when last input character is a backslash */ + goto fail; + } + skipped_bytes++; + input_end++; + } + input_end++; + } + if (((size_t)(input_end - input_buffer->content) >= input_buffer->length) || (*input_end != '\"')) + { + goto fail; /* string ended unexpectedly */ + } + + /* This is at most how much we need for the output */ + allocation_length = (size_t)(input_end - buffer_at_offset(input_buffer)) - skipped_bytes; + output = (unsigned char*)input_buffer->hooks.allocate(allocation_length + sizeof("")); + if (output == NULL) + { + goto fail; /* allocation failure */ + } + } + + output_pointer = output; + /* loop through the string literal */ + while (input_pointer < input_end) + { + if (*input_pointer != '\\') + { + *output_pointer++ = *input_pointer++; + } + /* escape sequence */ + else + { + unsigned char sequence_length = 2; + if ((input_end - input_pointer) < 1) + { + goto fail; + } + + switch (input_pointer[1]) + { + case 'b': + *output_pointer++ = '\b'; + break; + case 'f': + *output_pointer++ = '\f'; + break; + case 'n': + *output_pointer++ = '\n'; + break; + case 'r': + *output_pointer++ = '\r'; + break; + case 't': + *output_pointer++ = '\t'; + break; + case 
'\"': + case '\\': + case '/': + *output_pointer++ = input_pointer[1]; + break; + + /* UTF-16 literal */ + case 'u': + sequence_length = utf16_literal_to_utf8(input_pointer, input_end, &output_pointer); + if (sequence_length == 0) + { + /* failed to convert UTF16-literal to UTF-8 */ + goto fail; + } + break; + + default: + goto fail; + } + input_pointer += sequence_length; + } + } + + /* zero terminate the output */ + *output_pointer = '\0'; + + item->type = cJSON_String; + item->valuestring = (char*)output; + + input_buffer->offset = (size_t)(input_end - input_buffer->content); + input_buffer->offset++; + + return true; + +fail: + if (output != NULL) + { + input_buffer->hooks.deallocate(output); + } + + if (input_pointer != NULL) + { + input_buffer->offset = (size_t)(input_pointer - input_buffer->content); + } + + return false; } /* Render the cstring provided to an escaped version that can be printed. */ -static char *print_string_ptr(const char *str,printbuffer *p) -{ - const char *ptr;char *ptr2,*out;int len=0,flag=0;unsigned char token; - - for (ptr=str;*ptr;ptr++) flag|=((*ptr>0 && *ptr<32)||(*ptr=='\"')||(*ptr=='\\'))?1:0; - if (!flag) - { - len=ptr-str; - if (p) out=ensure(p,len+3); - else out=(char*)cJSON_malloc(len+3); - if (!out) return 0; - ptr2=out;*ptr2++='\"'; - strcpy(ptr2,str); - ptr2[len]='\"'; - ptr2[len+1]=0; - return out; - } - - if (!str) - { - if (p) out=ensure(p,3); - else out=(char*)cJSON_malloc(3); - if (!out) return 0; - strcpy(out,"\"\""); - return out; - } - ptr=str;while ((token=*ptr) && ++len) {if (strchr("\"\\\b\f\n\r\t",token)) len++; else if (token<32) len+=5;ptr++;} - - if (p) out=ensure(p,len+3); - else out=(char*)cJSON_malloc(len+3); - if (!out) return 0; - - ptr2=out;ptr=str; - *ptr2++='\"'; - while (*ptr) - { - if ((unsigned char)*ptr>31 && *ptr!='\"' && *ptr!='\\') *ptr2++=*ptr++; - else - { - *ptr2++='\\'; - switch (token=*ptr++) - { - case '\\': *ptr2++='\\'; break; - case '\"': *ptr2++='\"'; break; - case '\b': 
*ptr2++='b'; break; - case '\f': *ptr2++='f'; break; - case '\n': *ptr2++='n'; break; - case '\r': *ptr2++='r'; break; - case '\t': *ptr2++='t'; break; - default: sprintf(ptr2,"u%04x",token);ptr2+=5; break; /* escape and print */ - } - } - } - *ptr2++='\"';*ptr2++=0; - return out; -} -/* Invote print_string_ptr (which is useful) on an item. */ -static char *print_string(cJSON *item,printbuffer *p) {return print_string_ptr(item->valuestring,p);} +static cJSON_bool print_string_ptr(const unsigned char * const input, printbuffer * const output_buffer) +{ + const unsigned char *input_pointer = NULL; + unsigned char *output = NULL; + unsigned char *output_pointer = NULL; + size_t output_length = 0; + /* numbers of additional characters needed for escaping */ + size_t escape_characters = 0; + + if (output_buffer == NULL) + { + return false; + } + + /* empty string */ + if (input == NULL) + { + output = ensure(output_buffer, sizeof("\"\"")); + if (output == NULL) + { + return false; + } + strcpy((char*)output, "\"\""); + + return true; + } + + /* set "flag" to 1 if something needs to be escaped */ + for (input_pointer = input; *input_pointer; input_pointer++) + { + switch (*input_pointer) + { + case '\"': + case '\\': + case '\b': + case '\f': + case '\n': + case '\r': + case '\t': + /* one character escape sequence */ + escape_characters++; + break; + default: + if (*input_pointer < 32) + { + /* UTF-16 escape sequence uXXXX */ + escape_characters += 5; + } + break; + } + } + output_length = (size_t)(input_pointer - input) + escape_characters; + + output = ensure(output_buffer, output_length + sizeof("\"\"")); + if (output == NULL) + { + return false; + } + + /* no characters have to be escaped */ + if (escape_characters == 0) + { + output[0] = '\"'; + memcpy(output + 1, input, output_length); + output[output_length + 1] = '\"'; + output[output_length + 2] = '\0'; + + return true; + } + + output[0] = '\"'; + output_pointer = output + 1; + /* copy the string */ + for 
(input_pointer = input; *input_pointer != '\0'; (void)input_pointer++, output_pointer++) + { + if ((*input_pointer > 31) && (*input_pointer != '\"') && (*input_pointer != '\\')) + { + /* normal character, copy */ + *output_pointer = *input_pointer; + } + else + { + /* character needs to be escaped */ + *output_pointer++ = '\\'; + switch (*input_pointer) + { + case '\\': + *output_pointer = '\\'; + break; + case '\"': + *output_pointer = '\"'; + break; + case '\b': + *output_pointer = 'b'; + break; + case '\f': + *output_pointer = 'f'; + break; + case '\n': + *output_pointer = 'n'; + break; + case '\r': + *output_pointer = 'r'; + break; + case '\t': + *output_pointer = 't'; + break; + default: + /* escape and print as unicode codepoint */ + sprintf((char*)output_pointer, "u%04x", *input_pointer); + output_pointer += 4; + break; + } + } + } + output[output_length + 1] = '\"'; + output[output_length + 2] = '\0'; + + return true; +} + +/* Invoke print_string_ptr (which is useful) on an item. */ +static cJSON_bool print_string(const cJSON * const item, printbuffer * const p) +{ + return print_string_ptr((unsigned char*)item->valuestring, p); +} /* Predeclare these prototypes. 
*/ -static const char *parse_value(cJSON *item,const char *value); -static char *print_value(cJSON *item,int depth,int fmt,printbuffer *p); -static const char *parse_array(cJSON *item,const char *value); -static char *print_array(cJSON *item,int depth,int fmt,printbuffer *p); -static const char *parse_object(cJSON *item,const char *value); -static char *print_object(cJSON *item,int depth,int fmt,printbuffer *p); +static cJSON_bool parse_value(cJSON * const item, parse_buffer * const input_buffer); +static cJSON_bool print_value(const cJSON * const item, printbuffer * const output_buffer); +static cJSON_bool parse_array(cJSON * const item, parse_buffer * const input_buffer); +static cJSON_bool print_array(const cJSON * const item, printbuffer * const output_buffer); +static cJSON_bool parse_object(cJSON * const item, parse_buffer * const input_buffer); +static cJSON_bool print_object(const cJSON * const item, printbuffer * const output_buffer); /* Utility to jump whitespace and cr/lf */ -static const char *skip(const char *in) {while (in && *in && (unsigned char)*in<=32) in++; return in;} +static parse_buffer *buffer_skip_whitespace(parse_buffer * const buffer) +{ + if ((buffer == NULL) || (buffer->content == NULL)) + { + return NULL; + } + + while (can_access_at_index(buffer, 0) && (buffer_at_offset(buffer)[0] <= 32)) + { + buffer->offset++; + } + + if (buffer->offset == buffer->length) + { + buffer->offset--; + } + + return buffer; +} -/* Parse an object - create a new root, and populate. 
*/ -cJSON *cJSON_ParseWithOpts(const char *value,const char **return_parse_end,int require_null_terminated) +/* skip the UTF-8 BOM (byte order mark) if it is at the beginning of a buffer */ +static parse_buffer *skip_utf8_bom(parse_buffer * const buffer) { - const char *end=0; - cJSON *c=cJSON_New_Item(); - ep=0; - if (!c) return 0; /* memory fail */ + if ((buffer == NULL) || (buffer->content == NULL) || (buffer->offset != 0)) + { + return NULL; + } - end=parse_value(c,skip(value)); - if (!end) {cJSON_Delete(c);return 0;} /* parse failure. ep is set. */ + if (can_access_at_index(buffer, 4) && (strncmp((const char*)buffer_at_offset(buffer), "\xEF\xBB\xBF", 3) == 0)) + { + buffer->offset += 3; + } - /* if we require null-terminated JSON without appended garbage, skip and then check for a null terminator */ - if (require_null_terminated) {end=skip(end);if (*end) {cJSON_Delete(c);ep=end;return 0;}} - if (return_parse_end) *return_parse_end=end; - return c; + return buffer; } -/* Default options for cJSON_Parse */ -cJSON *cJSON_Parse(const char *value) {return cJSON_ParseWithOpts(value,0,0);} -/* Render a cJSON item/entity/structure to text. */ -char *cJSON_Print(cJSON *item) {return print_value(item,0,1,0);} -char *cJSON_PrintUnformatted(cJSON *item) {return print_value(item,0,0,0);} +/* Parse an object - create a new root, and populate. 
*/ +CJSON_PUBLIC(cJSON *) cJSON_ParseWithOpts(const char *value, const char **return_parse_end, cJSON_bool require_null_terminated) +{ + parse_buffer buffer = { 0, 0, 0, 0,{ 0, 0, 0 } }; + cJSON *item = NULL; + + /* reset error position */ + global_error.json = NULL; + global_error.position = 0; + + if (value == NULL) + { + goto fail; + } + + buffer.content = (const unsigned char*)value; + buffer.length = strlen((const char*)value) + sizeof(""); + buffer.offset = 0; + buffer.hooks = global_hooks; + + item = cJSON_New_Item(&global_hooks); + if (item == NULL) /* memory fail */ + { + goto fail; + } + + if (!parse_value(item, buffer_skip_whitespace(skip_utf8_bom(&buffer)))) + { + /* parse failure. ep is set. */ + goto fail; + } + + /* if we require null-terminated JSON without appended garbage, skip and then check for a null terminator */ + if (require_null_terminated) + { + buffer_skip_whitespace(&buffer); + if ((buffer.offset >= buffer.length) || buffer_at_offset(&buffer)[0] != '\0') + { + goto fail; + } + } + if (return_parse_end) + { + *return_parse_end = (const char*)buffer_at_offset(&buffer); + } + + return item; + +fail: + if (item != NULL) + { + cJSON_Delete(item); + } + + if (value != NULL) + { + error local_error; + local_error.json = (const unsigned char*)value; + local_error.position = 0; + + if (buffer.offset < buffer.length) + { + local_error.position = buffer.offset; + } + else if (buffer.length > 0) + { + local_error.position = buffer.length - 1; + } + + if (return_parse_end != NULL) + { + *return_parse_end = (const char*)local_error.json + local_error.position; + } + + global_error = local_error; + } + + return NULL; +} -char *cJSON_PrintBuffered(cJSON *item,int prebuffer,int fmt) +/* Default options for cJSON_Parse */ +CJSON_PUBLIC(cJSON *) cJSON_Parse(const char *value) { - printbuffer p; - p.buffer=(char*)cJSON_malloc(prebuffer); - p.length=prebuffer; - p.offset=0; - return print_value(item,0,fmt,&p); - return p.buffer; + return 
cJSON_ParseWithOpts(value, 0, 0); } +#define cjson_min(a, b) ((a < b) ? a : b) -/* Parser core - when encountering text, process appropriately. */ -static const char *parse_value(cJSON *item,const char *value) +static unsigned char *print(const cJSON * const item, cJSON_bool format, const internal_hooks * const hooks) { - if (!value) return 0; /* Fail on null. */ - if (!strncmp(value,"null",4)) { item->type=cJSON_NULL; return value+4; } - if (!strncmp(value,"false",5)) { item->type=cJSON_False; return value+5; } - if (!strncmp(value,"true",4)) { item->type=cJSON_True; item->valueint=1; return value+4; } - if (*value=='\"') { return parse_string(item,value); } - if (*value=='-' || (*value>='0' && *value<='9')) { return parse_number(item,value); } - if (*value=='[') { return parse_array(item,value); } - if (*value=='{') { return parse_object(item,value); } + static const size_t default_buffer_size = 256; + printbuffer buffer[1]; + unsigned char *printed = NULL; + + memset(buffer, 0, sizeof(buffer)); + + /* create buffer */ + buffer->buffer = (unsigned char*)hooks->allocate(default_buffer_size); + buffer->length = default_buffer_size; + buffer->format = format; + buffer->hooks = *hooks; + if (buffer->buffer == NULL) + { + goto fail; + } + + /* print the value */ + if (!print_value(item, buffer)) + { + goto fail; + } + update_offset(buffer); + + /* check if reallocate is available */ + if (hooks->reallocate != NULL) + { + printed = (unsigned char*)hooks->reallocate(buffer->buffer, buffer->offset + 1); + if (printed == NULL) { + goto fail; + } + buffer->buffer = NULL; + } + else /* otherwise copy the JSON over to a new buffer */ + { + printed = (unsigned char*)hooks->allocate(buffer->offset + 1); + if (printed == NULL) + { + goto fail; + } + memcpy(printed, buffer->buffer, cjson_min(buffer->length, buffer->offset + 1)); + printed[buffer->offset] = '\0'; /* just to be sure */ + + /* free the buffer */ + hooks->deallocate(buffer->buffer); + } + + return printed; + +fail: 
+ if (buffer->buffer != NULL) + { + hooks->deallocate(buffer->buffer); + } + + if (printed != NULL) + { + hooks->deallocate(printed); + } + + return NULL; +} - ep=value;return 0; /* failure. */ +/* Render a cJSON item/entity/structure to text. */ +CJSON_PUBLIC(char *) cJSON_Print(const cJSON *item) +{ + return (char*)print(item, true, &global_hooks); } -/* Render a value to text. */ -static char *print_value(cJSON *item,int depth,int fmt,printbuffer *p) -{ - char *out=0; - if (!item) return 0; - if (p) - { - switch ((item->type)&255) - { - case cJSON_NULL: {out=ensure(p,5); if (out) strcpy(out,"null"); break;} - case cJSON_False: {out=ensure(p,6); if (out) strcpy(out,"false"); break;} - case cJSON_True: {out=ensure(p,5); if (out) strcpy(out,"true"); break;} - case cJSON_Number: out=print_number(item,p);break; - case cJSON_String: out=print_string(item,p);break; - case cJSON_Array: out=print_array(item,depth,fmt,p);break; - case cJSON_Object: out=print_object(item,depth,fmt,p);break; - } - } - else - { - switch ((item->type)&255) - { - case cJSON_NULL: out=cJSON_strdup("null"); break; - case cJSON_False: out=cJSON_strdup("false");break; - case cJSON_True: out=cJSON_strdup("true"); break; - case cJSON_Number: out=print_number(item,0);break; - case cJSON_String: out=print_string(item,0);break; - case cJSON_Array: out=print_array(item,depth,fmt,0);break; - case cJSON_Object: out=print_object(item,depth,fmt,0);break; - } - } - return out; +CJSON_PUBLIC(char *) cJSON_PrintUnformatted(const cJSON *item) +{ + return (char*)print(item, false, &global_hooks); } -/* Build an array from input text. */ -static const char *parse_array(cJSON *item,const char *value) +CJSON_PUBLIC(char *) cJSON_PrintBuffered(const cJSON *item, int prebuffer, cJSON_bool fmt) { - cJSON *child; - if (*value!='[') {ep=value;return 0;} /* not an array! 
*/ + printbuffer p = { 0, 0, 0, 0, 0, 0,{ 0, 0, 0 } }; + + if (prebuffer < 0) + { + return NULL; + } + + p.buffer = (unsigned char*)global_hooks.allocate((size_t)prebuffer); + if (!p.buffer) + { + return NULL; + } + + p.length = (size_t)prebuffer; + p.offset = 0; + p.noalloc = false; + p.format = fmt; + p.hooks = global_hooks; + + if (!print_value(item, &p)) + { + global_hooks.deallocate(p.buffer); + return NULL; + } + + return (char*)p.buffer; +} - item->type=cJSON_Array; - value=skip(value+1); - if (*value==']') return value+1; /* empty array. */ +CJSON_PUBLIC(cJSON_bool) cJSON_PrintPreallocated(cJSON *item, char *buf, const int len, const cJSON_bool fmt) +{ + printbuffer p = { 0, 0, 0, 0, 0, 0,{ 0, 0, 0 } }; - item->child=child=cJSON_New_Item(); - if (!item->child) return 0; /* memory fail */ - value=skip(parse_value(child,skip(value))); /* skip any spacing, get the value. */ - if (!value) return 0; + if ((len < 0) || (buf == NULL)) + { + return false; + } - while (*value==',') - { - cJSON *new_item; - if (!(new_item=cJSON_New_Item())) return 0; /* memory fail */ - child->next=new_item;new_item->prev=child;child=new_item; - value=skip(parse_value(child,skip(value+1))); - if (!value) return 0; /* memory fail */ - } + p.buffer = (unsigned char*)buf; + p.length = (size_t)len; + p.offset = 0; + p.noalloc = true; + p.format = fmt; + p.hooks = global_hooks; + + return print_value(item, &p); +} - if (*value==']') return value+1; /* end of array */ - ep=value;return 0; /* malformed. */ +/* Parser core - when encountering text, process appropriately. 
*/ +static cJSON_bool parse_value(cJSON * const item, parse_buffer * const input_buffer) +{ + if ((input_buffer == NULL) || (input_buffer->content == NULL)) + { + return false; /* no input */ + } + + /* parse the different types of values */ + /* null */ + if (can_read(input_buffer, 4) && (strncmp((const char*)buffer_at_offset(input_buffer), "null", 4) == 0)) + { + item->type = cJSON_NULL; + input_buffer->offset += 4; + return true; + } + /* false */ + if (can_read(input_buffer, 5) && (strncmp((const char*)buffer_at_offset(input_buffer), "false", 5) == 0)) + { + item->type = cJSON_False; + input_buffer->offset += 5; + return true; + } + /* true */ + if (can_read(input_buffer, 4) && (strncmp((const char*)buffer_at_offset(input_buffer), "true", 4) == 0)) + { + item->type = cJSON_True; + item->valueint = 1; + input_buffer->offset += 4; + return true; + } + /* string */ + if (can_access_at_index(input_buffer, 0) && (buffer_at_offset(input_buffer)[0] == '\"')) + { + return parse_string(item, input_buffer); + } + /* number */ + if (can_access_at_index(input_buffer, 0) && ((buffer_at_offset(input_buffer)[0] == '-') || ((buffer_at_offset(input_buffer)[0] >= '0') && (buffer_at_offset(input_buffer)[0] <= '9')))) + { + return parse_number(item, input_buffer); + } + /* array */ + if (can_access_at_index(input_buffer, 0) && (buffer_at_offset(input_buffer)[0] == '[')) + { + return parse_array(item, input_buffer); + } + /* object */ + if (can_access_at_index(input_buffer, 0) && (buffer_at_offset(input_buffer)[0] == '{')) + { + return parse_object(item, input_buffer); + } + + return false; +} + +/* Render a value to text. 
*/ +static cJSON_bool print_value(const cJSON * const item, printbuffer * const output_buffer) +{ + unsigned char *output = NULL; + + if ((item == NULL) || (output_buffer == NULL)) + { + return false; + } + + switch ((item->type) & 0xFF) + { + case cJSON_NULL: + output = ensure(output_buffer, 5); + if (output == NULL) + { + return false; + } + strcpy((char*)output, "null"); + return true; + + case cJSON_False: + output = ensure(output_buffer, 6); + if (output == NULL) + { + return false; + } + strcpy((char*)output, "false"); + return true; + + case cJSON_True: + output = ensure(output_buffer, 5); + if (output == NULL) + { + return false; + } + strcpy((char*)output, "true"); + return true; + + case cJSON_Number: + return print_number(item, output_buffer); + + case cJSON_Raw: + { + size_t raw_length = 0; + if (item->valuestring == NULL) + { + return false; + } + + raw_length = strlen(item->valuestring) + sizeof(""); + output = ensure(output_buffer, raw_length); + if (output == NULL) + { + return false; + } + memcpy(output, item->valuestring, raw_length); + return true; + } + + case cJSON_String: + return print_string(item, output_buffer); + + case cJSON_Array: + return print_array(item, output_buffer); + + case cJSON_Object: + return print_object(item, output_buffer); + + default: + return false; + } +} + +/* Build an array from input text. 
*/ +static cJSON_bool parse_array(cJSON * const item, parse_buffer * const input_buffer) +{ + cJSON *head = NULL; /* head of the linked list */ + cJSON *current_item = NULL; + + if (input_buffer->depth >= CJSON_NESTING_LIMIT) + { + return false; /* to deeply nested */ + } + input_buffer->depth++; + + if (buffer_at_offset(input_buffer)[0] != '[') + { + /* not an array */ + goto fail; + } + + input_buffer->offset++; + buffer_skip_whitespace(input_buffer); + if (can_access_at_index(input_buffer, 0) && (buffer_at_offset(input_buffer)[0] == ']')) + { + /* empty array */ + goto success; + } + + /* check if we skipped to the end of the buffer */ + if (cannot_access_at_index(input_buffer, 0)) + { + input_buffer->offset--; + goto fail; + } + + /* step back to character in front of the first element */ + input_buffer->offset--; + /* loop through the comma separated array elements */ + do + { + /* allocate next item */ + cJSON *new_item = cJSON_New_Item(&(input_buffer->hooks)); + if (new_item == NULL) + { + goto fail; /* allocation failure */ + } + + /* attach next item to list */ + if (head == NULL) + { + /* start the linked list */ + current_item = head = new_item; + } + else + { + /* add to the end and advance */ + current_item->next = new_item; + new_item->prev = current_item; + current_item = new_item; + } + + /* parse next value */ + input_buffer->offset++; + buffer_skip_whitespace(input_buffer); + if (!parse_value(current_item, input_buffer)) + { + goto fail; /* failed to parse value */ + } + buffer_skip_whitespace(input_buffer); + } while (can_access_at_index(input_buffer, 0) && (buffer_at_offset(input_buffer)[0] == ',')); + + if (cannot_access_at_index(input_buffer, 0) || buffer_at_offset(input_buffer)[0] != ']') + { + goto fail; /* expected end of array */ + } + +success: + input_buffer->depth--; + + item->type = cJSON_Array; + item->child = head; + + input_buffer->offset++; + + return true; + +fail: + if (head != NULL) + { + cJSON_Delete(head); + } + + return 
false; } /* Render an array to text */ -static char *print_array(cJSON *item,int depth,int fmt,printbuffer *p) -{ - char **entries; - char *out=0,*ptr,*ret;int len=5; - cJSON *child=item->child; - int numentries=0,i=0,fail=0; - size_t tmplen=0; - - /* How many entries in the array? */ - while (child) numentries++,child=child->next; - /* Explicitly handle numentries==0 */ - if (!numentries) - { - if (p) out=ensure(p,3); - else out=(char*)cJSON_malloc(3); - if (out) strcpy(out,"[]"); - return out; - } - - if (p) - { - /* Compose the output array. */ - i=p->offset; - ptr=ensure(p,1);if (!ptr) return 0; *ptr='['; p->offset++; - child=item->child; - while (child && !fail) - { - print_value(child,depth+1,fmt,p); - p->offset=update(p); - if (child->next) {len=fmt?2:1;ptr=ensure(p,len+1);if (!ptr) return 0;*ptr++=',';if(fmt)*ptr++=' ';*ptr=0;p->offset+=len;} - child=child->next; - } - ptr=ensure(p,2);if (!ptr) return 0; *ptr++=']';*ptr=0; - out=(p->buffer)+i; - } - else - { - /* Allocate an array to hold the values for each */ - entries=(char**)cJSON_malloc(numentries*sizeof(char*)); - if (!entries) return 0; - memset(entries,0,numentries*sizeof(char*)); - /* Retrieve all the results: */ - child=item->child; - while (child && !fail) - { - ret=print_value(child,depth+1,fmt,0); - entries[i++]=ret; - if (ret) len+=strlen(ret)+2+(fmt?1:0); else fail=1; - child=child->next; - } - - /* If we didn't fail, try to malloc the output string */ - if (!fail) out=(char*)cJSON_malloc(len); - /* If that fails, we fail. */ - if (!out) fail=1; - - /* Handle failure. */ - if (fail) - { - for (i=0;ichild; + + if (output_buffer == NULL) + { + return false; + } + + /* Compose the output array. 
*/ + /* opening square bracket */ + output_pointer = ensure(output_buffer, 1); + if (output_pointer == NULL) + { + return false; + } + + *output_pointer = '['; + output_buffer->offset++; + output_buffer->depth++; + + while (current_element != NULL) + { + if (!print_value(current_element, output_buffer)) + { + return false; + } + update_offset(output_buffer); + if (current_element->next) + { + length = (size_t)(output_buffer->format ? 2 : 1); + output_pointer = ensure(output_buffer, length + 1); + if (output_pointer == NULL) + { + return false; + } + *output_pointer++ = ','; + if (output_buffer->format) + { + *output_pointer++ = ' '; + } + *output_pointer = '\0'; + output_buffer->offset += length; + } + current_element = current_element->next; + } + + output_pointer = ensure(output_buffer, 2); + if (output_pointer == NULL) + { + return false; + } + *output_pointer++ = ']'; + *output_pointer = '\0'; + output_buffer->depth--; + + return true; } /* Build an object from the text. */ -static const char *parse_object(cJSON *item,const char *value) -{ - cJSON *child; - if (*value!='{') {ep=value;return 0;} /* not an object! */ - - item->type=cJSON_Object; - value=skip(value+1); - if (*value=='}') return value+1; /* empty array. */ - - item->child=child=cJSON_New_Item(); - if (!item->child) return 0; - value=skip(parse_string(child,skip(value))); - if (!value) return 0; - child->string=child->valuestring;child->valuestring=0; - if (*value!=':') {ep=value;return 0;} /* fail! */ - value=skip(parse_value(child,skip(value+1))); /* skip any spacing, get the value. */ - if (!value) return 0; - - while (*value==',') - { - cJSON *new_item; - if (!(new_item=cJSON_New_Item())) return 0; /* memory fail */ - child->next=new_item;new_item->prev=child;child=new_item; - value=skip(parse_string(child,skip(value+1))); - if (!value) return 0; - child->string=child->valuestring;child->valuestring=0; - if (*value!=':') {ep=value;return 0;} /* fail! 
*/ - value=skip(parse_value(child,skip(value+1))); /* skip any spacing, get the value. */ - if (!value) return 0; - } - - if (*value=='}') return value+1; /* end of array */ - ep=value;return 0; /* malformed. */ +static cJSON_bool parse_object(cJSON * const item, parse_buffer * const input_buffer) +{ + cJSON *head = NULL; /* linked list head */ + cJSON *current_item = NULL; + + if (input_buffer->depth >= CJSON_NESTING_LIMIT) + { + return false; /* to deeply nested */ + } + input_buffer->depth++; + + if (cannot_access_at_index(input_buffer, 0) || (buffer_at_offset(input_buffer)[0] != '{')) + { + goto fail; /* not an object */ + } + + input_buffer->offset++; + buffer_skip_whitespace(input_buffer); + if (can_access_at_index(input_buffer, 0) && (buffer_at_offset(input_buffer)[0] == '}')) + { + goto success; /* empty object */ + } + + /* check if we skipped to the end of the buffer */ + if (cannot_access_at_index(input_buffer, 0)) + { + input_buffer->offset--; + goto fail; + } + + /* step back to character in front of the first element */ + input_buffer->offset--; + /* loop through the comma separated array elements */ + do + { + /* allocate next item */ + cJSON *new_item = cJSON_New_Item(&(input_buffer->hooks)); + if (new_item == NULL) + { + goto fail; /* allocation failure */ + } + + /* attach next item to list */ + if (head == NULL) + { + /* start the linked list */ + current_item = head = new_item; + } + else + { + /* add to the end and advance */ + current_item->next = new_item; + new_item->prev = current_item; + current_item = new_item; + } + + /* parse the name of the child */ + input_buffer->offset++; + buffer_skip_whitespace(input_buffer); + if (!parse_string(current_item, input_buffer)) + { + goto fail; /* faile to parse name */ + } + buffer_skip_whitespace(input_buffer); + + /* swap valuestring and string, because we parsed the name */ + current_item->string = current_item->valuestring; + current_item->valuestring = NULL; + + if 
(cannot_access_at_index(input_buffer, 0) || (buffer_at_offset(input_buffer)[0] != ':')) + { + goto fail; /* invalid object */ + } + + /* parse the value */ + input_buffer->offset++; + buffer_skip_whitespace(input_buffer); + if (!parse_value(current_item, input_buffer)) + { + goto fail; /* failed to parse value */ + } + buffer_skip_whitespace(input_buffer); + } while (can_access_at_index(input_buffer, 0) && (buffer_at_offset(input_buffer)[0] == ',')); + + if (cannot_access_at_index(input_buffer, 0) || (buffer_at_offset(input_buffer)[0] != '}')) + { + goto fail; /* expected end of object */ + } + +success: + input_buffer->depth--; + + item->type = cJSON_Object; + item->child = head; + + input_buffer->offset++; + return true; + +fail: + if (head != NULL) + { + cJSON_Delete(head); + } + + return false; } /* Render an object to text. */ -static char *print_object(cJSON *item,int depth,int fmt,printbuffer *p) -{ - char **entries=0,**names=0; - char *out=0,*ptr,*ret,*str;int len=7,i=0,j; - cJSON *child=item->child; - int numentries=0,fail=0; - size_t tmplen=0; - /* Count the number of entries. 
*/ - while (child) numentries++,child=child->next; - /* Explicitly handle empty object case */ - if (!numentries) - { - if (p) out=ensure(p,fmt?depth+4:3); - else out=(char*)cJSON_malloc(fmt?depth+4:3); - if (!out) return 0; - ptr=out;*ptr++='{'; - if (fmt) {*ptr++='\n';for (i=0;ioffset; - len=fmt?2:1; ptr=ensure(p,len+1); if (!ptr) return 0; - *ptr++='{'; if (fmt) *ptr++='\n'; *ptr=0; p->offset+=len; - child=item->child;depth++; - while (child) - { - if (fmt) - { - ptr=ensure(p,depth); if (!ptr) return 0; - for (j=0;joffset+=depth; - } - print_string_ptr(child->string,p); - p->offset=update(p); - - len=fmt?2:1; - ptr=ensure(p,len); if (!ptr) return 0; - *ptr++=':';if (fmt) *ptr++='\t'; - p->offset+=len; - - print_value(child,depth,fmt,p); - p->offset=update(p); - - len=(fmt?1:0)+(child->next?1:0); - ptr=ensure(p,len+1); if (!ptr) return 0; - if (child->next) *ptr++=','; - if (fmt) *ptr++='\n';*ptr=0; - p->offset+=len; - child=child->next; - } - ptr=ensure(p,fmt?(depth+1):2); if (!ptr) return 0; - if (fmt) for (i=0;ibuffer)+i; - } - else - { - /* Allocate space for the names and the objects */ - entries=(char**)cJSON_malloc(numentries*sizeof(char*)); - if (!entries) return 0; - names=(char**)cJSON_malloc(numentries*sizeof(char*)); - if (!names) {cJSON_free(entries);return 0;} - memset(entries,0,sizeof(char*)*numentries); - memset(names,0,sizeof(char*)*numentries); - - /* Collect all the results into our arrays: */ - child=item->child;depth++;if (fmt) len+=depth; - while (child) - { - names[i]=str=print_string_ptr(child->string,0); - entries[i++]=ret=print_value(child,depth,fmt,0); - if (str && ret) len+=strlen(ret)+strlen(str)+2+(fmt?2+depth:0); else fail=1; - child=child->next; - } - - /* Try to allocate the output string */ - if (!fail) out=(char*)cJSON_malloc(len); - if (!out) fail=1; - - /* Handle failure */ - if (fail) - { - for (i=0;ichild; + + if (output_buffer == NULL) + { + return false; + } + + /* Compose the output: */ + length = 
(size_t)(output_buffer->format ? 2 : 1); /* fmt: {\n */ + output_pointer = ensure(output_buffer, length + 1); + if (output_pointer == NULL) + { + return false; + } + + *output_pointer++ = '{'; + output_buffer->depth++; + if (output_buffer->format) + { + *output_pointer++ = '\n'; + } + output_buffer->offset += length; + + while (current_item) + { + if (output_buffer->format) + { + size_t i; + output_pointer = ensure(output_buffer, output_buffer->depth); + if (output_pointer == NULL) + { + return false; + } + for (i = 0; i < output_buffer->depth; i++) + { + *output_pointer++ = '\t'; + } + output_buffer->offset += output_buffer->depth; + } + + /* print key */ + if (!print_string_ptr((unsigned char*)current_item->string, output_buffer)) + { + return false; + } + update_offset(output_buffer); + + length = (size_t)(output_buffer->format ? 2 : 1); + output_pointer = ensure(output_buffer, length); + if (output_pointer == NULL) + { + return false; + } + *output_pointer++ = ':'; + if (output_buffer->format) + { + *output_pointer++ = '\t'; + } + output_buffer->offset += length; + + /* print value */ + if (!print_value(current_item, output_buffer)) + { + return false; + } + update_offset(output_buffer); + + /* print comma if not last */ + length = (size_t)((output_buffer->format ? 1 : 0) + (current_item->next ? 1 : 0)); + output_pointer = ensure(output_buffer, length + 1); + if (output_pointer == NULL) + { + return false; + } + if (current_item->next) + { + *output_pointer++ = ','; + } + + if (output_buffer->format) + { + *output_pointer++ = '\n'; + } + *output_pointer = '\0'; + output_buffer->offset += length; + + current_item = current_item->next; + } + + output_pointer = ensure(output_buffer, output_buffer->format ? 
(output_buffer->depth + 1) : 2); + if (output_pointer == NULL) + { + return false; + } + if (output_buffer->format) + { + size_t i; + for (i = 0; i < (output_buffer->depth - 1); i++) + { + *output_pointer++ = '\t'; + } + } + *output_pointer++ = '}'; + *output_pointer = '\0'; + output_buffer->depth--; + + return true; } /* Get Array size/item / object item. */ -int cJSON_GetArraySize(cJSON *array) {cJSON *c=array->child;int i=0;while(c)i++,c=c->next;return i;} -cJSON *cJSON_GetArrayItem(cJSON *array,int item) +CJSON_PUBLIC(int) cJSON_GetArraySize(const cJSON *array) +{ + cJSON *child = NULL; + size_t size = 0; + + if (array == NULL) + { + return 0; + } + + child = array->child; + + while (child != NULL) + { + size++; + child = child->next; + } + + /* FIXME: Can overflow here. Cannot be fixed without breaking the API */ + + return (int)size; +} + +static cJSON* get_array_item(const cJSON *array, size_t index) +{ + cJSON *current_child = NULL; + + if (array == NULL) + { + return NULL; + } + + current_child = array->child; + while ((current_child != NULL) && (index > 0)) + { + index--; + current_child = current_child->next; + } + + return current_child; +} + +CJSON_PUBLIC(cJSON *) cJSON_GetArrayItem(const cJSON *array, int index) { - cJSON *c = (array != NULL) ? array->child : NULL; - while ((c != NULL) && (item > 0)) - { - item--; - c = c->next; - } + if (index < 0) + { + return NULL; + } - return c; + return get_array_item(array, (size_t)index); } -cJSON *cJSON_GetObjectItem(cJSON *object, const char *string) +static cJSON *get_object_item(const cJSON * const object, const char * const name, const cJSON_bool case_sensitive) { - cJSON *c = (object != NULL) ? 
object->child : NULL; - while ((c != NULL) && (cJSON_strcasecmp(c->string, string))) - { - c = c->next; - } - return c; + cJSON *current_element = NULL; + + if ((object == NULL) || (name == NULL)) + { + return NULL; + } + + current_element = object->child; + if (case_sensitive) + { + while ((current_element != NULL) && (strcmp(name, current_element->string) != 0)) + { + current_element = current_element->next; + } + } + else + { + while ((current_element != NULL) && (case_insensitive_strcmp((const unsigned char*)name, (const unsigned char*)(current_element->string)) != 0)) + { + current_element = current_element->next; + } + } + + return current_element; +} + +CJSON_PUBLIC(cJSON *) cJSON_GetObjectItem(const cJSON * const object, const char * const string) +{ + return get_object_item(object, string, false); +} + +CJSON_PUBLIC(cJSON *) cJSON_GetObjectItemCaseSensitive(const cJSON * const object, const char * const string) +{ + return get_object_item(object, string, true); +} + +CJSON_PUBLIC(cJSON_bool) cJSON_HasObjectItem(const cJSON *object, const char *string) +{ + return cJSON_GetObjectItem(object, string) ? 1 : 0; } /* Utility for array list handling. */ -static void suffix_object(cJSON *prev,cJSON *item) {prev->next=item;item->prev=prev;} +static void suffix_object(cJSON *prev, cJSON *item) +{ + prev->next = item; + item->prev = prev; +} + /* Utility for handling references. 
*/ -static cJSON *create_reference(cJSON *item) {cJSON *ref=cJSON_New_Item();if (!ref) return 0;memcpy(ref,item,sizeof(cJSON));ref->string=0;ref->type|=cJSON_IsReference;ref->next=ref->prev=0;return ref;} +static cJSON *create_reference(const cJSON *item, const internal_hooks * const hooks) +{ + cJSON *reference = NULL; + if (item == NULL) + { + return NULL; + } + + reference = cJSON_New_Item(hooks); + if (reference == NULL) + { + return NULL; + } + + memcpy(reference, item, sizeof(cJSON)); + reference->string = NULL; + reference->type |= cJSON_IsReference; + reference->next = reference->prev = NULL; + return reference; +} + +static cJSON_bool add_item_to_array(cJSON *array, cJSON *item) +{ + cJSON *child = NULL; + + if ((item == NULL) || (array == NULL)) + { + return false; + } + + child = array->child; + + if (child == NULL) + { + /* list is empty, start new one */ + array->child = item; + } + else + { + /* append to the end */ + while (child->next) + { + child = child->next; + } + suffix_object(child, item); + } + + return true; +} /* Add item to array/object. 
*/ -void cJSON_AddItemToArray(cJSON *array, cJSON *item) {cJSON *c=array->child;if (!item) return; if (!c) {array->child=item;} else {while (c && c->next) c=c->next; suffix_object(c,item);}} -void cJSON_AddItemToObject(cJSON *object,const char *string,cJSON *item) {if (!item) return; if (item->string) cJSON_free(item->string);item->string=cJSON_strdup(string);cJSON_AddItemToArray(object,item);} -void cJSON_AddItemToObjectCS(cJSON *object,const char *string,cJSON *item) {if (!item) return; if (!(item->type&cJSON_StringIsConst) && item->string) cJSON_free(item->string);item->string=(char*)string;item->type|=cJSON_StringIsConst;cJSON_AddItemToArray(object,item);} -void cJSON_AddItemReferenceToArray(cJSON *array, cJSON *item) {cJSON_AddItemToArray(array,create_reference(item));} -void cJSON_AddItemReferenceToObject(cJSON *object,const char *string,cJSON *item) {cJSON_AddItemToObject(object,string,create_reference(item));} - -cJSON *cJSON_DetachItemFromArray(cJSON *array,int which) {cJSON *c=array->child;while (c && which>0) c=c->next,which--;if (!c) return 0; - if (c->prev) c->prev->next=c->next;if (c->next) c->next->prev=c->prev;if (c==array->child) array->child=c->next;c->prev=c->next=0;return c;} -void cJSON_DeleteItemFromArray(cJSON *array,int which) {cJSON_Delete(cJSON_DetachItemFromArray(array,which));} -cJSON *cJSON_DetachItemFromObject(cJSON *object,const char *string) {int i=0;cJSON *c=object->child;while (c && cJSON_strcasecmp(c->string,string)) i++,c=c->next;if (c) return cJSON_DetachItemFromArray(object,i);return 0;} -void cJSON_DeleteItemFromObject(cJSON *object,const char *string) {cJSON_Delete(cJSON_DetachItemFromObject(object,string));} +CJSON_PUBLIC(void) cJSON_AddItemToArray(cJSON *array, cJSON *item) +{ + add_item_to_array(array, item); +} + +#if defined(__clang__) || (defined(__GNUC__) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 5)))) +#pragma GCC diagnostic push +#endif +#ifdef __GNUC__ +#pragma GCC diagnostic ignored "-Wcast-qual" 
+#endif +/* helper function to cast away const */ +static void* cast_away_const(const void* string) +{ + return (void*)string; +} +#if defined(__clang__) || (defined(__GNUC__) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 5)))) +#pragma GCC diagnostic pop +#endif + + +static cJSON_bool add_item_to_object(cJSON * const object, const char * const string, cJSON * const item, const internal_hooks * const hooks, const cJSON_bool constant_key) +{ + char *new_key = NULL; + int new_type = cJSON_Invalid; + + if ((object == NULL) || (string == NULL) || (item == NULL)) + { + return false; + } + + if (constant_key) + { + new_key = (char*)cast_away_const(string); + new_type = item->type | cJSON_StringIsConst; + } + else + { + new_key = (char*)cJSON_strdup((const unsigned char*)string, hooks); + if (new_key == NULL) + { + return false; + } + + new_type = item->type & ~cJSON_StringIsConst; + } + + if (!(item->type & cJSON_StringIsConst) && (item->string != NULL)) + { + hooks->deallocate(item->string); + } + + item->string = new_key; + item->type = new_type; + + return add_item_to_array(object, item); +} + +CJSON_PUBLIC(void) cJSON_AddItemToObject(cJSON *object, const char *string, cJSON *item) +{ + add_item_to_object(object, string, item, &global_hooks, false); +} + +/* Add an item to an object with constant string as key */ +CJSON_PUBLIC(void) cJSON_AddItemToObjectCS(cJSON *object, const char *string, cJSON *item) +{ + add_item_to_object(object, string, item, &global_hooks, true); +} + +CJSON_PUBLIC(void) cJSON_AddItemReferenceToArray(cJSON *array, cJSON *item) +{ + if (array == NULL) + { + return; + } + + add_item_to_array(array, create_reference(item, &global_hooks)); +} + +CJSON_PUBLIC(void) cJSON_AddItemReferenceToObject(cJSON *object, const char *string, cJSON *item) +{ + if ((object == NULL) || (string == NULL)) + { + return; + } + + add_item_to_object(object, string, create_reference(item, &global_hooks), &global_hooks, false); +} + +CJSON_PUBLIC(cJSON*) 
cJSON_AddNullToObject(cJSON * const object, const char * const name) +{ + cJSON *null = cJSON_CreateNull(); + if (add_item_to_object(object, name, null, &global_hooks, false)) + { + return null; + } + + cJSON_Delete(null); + return NULL; +} + +CJSON_PUBLIC(cJSON*) cJSON_AddTrueToObject(cJSON * const object, const char * const name) +{ + cJSON *true_item = cJSON_CreateTrue(); + if (add_item_to_object(object, name, true_item, &global_hooks, false)) + { + return true_item; + } + + cJSON_Delete(true_item); + return NULL; +} + +CJSON_PUBLIC(cJSON*) cJSON_AddFalseToObject(cJSON * const object, const char * const name) +{ + cJSON *false_item = cJSON_CreateFalse(); + if (add_item_to_object(object, name, false_item, &global_hooks, false)) + { + return false_item; + } + + cJSON_Delete(false_item); + return NULL; +} + +CJSON_PUBLIC(cJSON*) cJSON_AddBoolToObject(cJSON * const object, const char * const name, const cJSON_bool boolean) +{ + cJSON *bool_item = cJSON_CreateBool(boolean); + if (add_item_to_object(object, name, bool_item, &global_hooks, false)) + { + return bool_item; + } + + cJSON_Delete(bool_item); + return NULL; +} + +CJSON_PUBLIC(cJSON*) cJSON_AddNumberToObject(cJSON * const object, const char * const name, const double number) +{ + cJSON *number_item = cJSON_CreateNumber(number); + if (add_item_to_object(object, name, number_item, &global_hooks, false)) + { + return number_item; + } + + cJSON_Delete(number_item); + return NULL; +} + +CJSON_PUBLIC(cJSON*) cJSON_AddStringToObject(cJSON * const object, const char * const name, const char * const string) +{ + cJSON *string_item = cJSON_CreateString(string); + if (add_item_to_object(object, name, string_item, &global_hooks, false)) + { + return string_item; + } + + cJSON_Delete(string_item); + return NULL; +} + +CJSON_PUBLIC(cJSON*) cJSON_AddRawToObject(cJSON * const object, const char * const name, const char * const raw) +{ + cJSON *raw_item = cJSON_CreateRaw(raw); + if (add_item_to_object(object, name, raw_item, 
&global_hooks, false)) + { + return raw_item; + } + + cJSON_Delete(raw_item); + return NULL; +} + +CJSON_PUBLIC(cJSON*) cJSON_AddObjectToObject(cJSON * const object, const char * const name) +{ + cJSON *object_item = cJSON_CreateObject(); + if (add_item_to_object(object, name, object_item, &global_hooks, false)) + { + return object_item; + } + + cJSON_Delete(object_item); + return NULL; +} + +CJSON_PUBLIC(cJSON*) cJSON_AddArrayToObject(cJSON * const object, const char * const name) +{ + cJSON *array = cJSON_CreateArray(); + if (add_item_to_object(object, name, array, &global_hooks, false)) + { + return array; + } + + cJSON_Delete(array); + return NULL; +} + +CJSON_PUBLIC(cJSON *) cJSON_DetachItemViaPointer(cJSON *parent, cJSON * const item) +{ + if ((parent == NULL) || (item == NULL)) + { + return NULL; + } + + if (item->prev != NULL) + { + /* not the first element */ + item->prev->next = item->next; + } + if (item->next != NULL) + { + /* not the last element */ + item->next->prev = item->prev; + } + + if (item == parent->child) + { + /* first element */ + parent->child = item->next; + } + /* make sure the detached item doesn't point anywhere anymore */ + item->prev = NULL; + item->next = NULL; + + return item; +} + +CJSON_PUBLIC(cJSON *) cJSON_DetachItemFromArray(cJSON *array, int which) +{ + if (which < 0) + { + return NULL; + } + + return cJSON_DetachItemViaPointer(array, get_array_item(array, (size_t)which)); +} + +CJSON_PUBLIC(void) cJSON_DeleteItemFromArray(cJSON *array, int which) +{ + cJSON_Delete(cJSON_DetachItemFromArray(array, which)); +} + +CJSON_PUBLIC(cJSON *) cJSON_DetachItemFromObject(cJSON *object, const char *string) +{ + cJSON *to_detach = cJSON_GetObjectItem(object, string); + + return cJSON_DetachItemViaPointer(object, to_detach); +} + +CJSON_PUBLIC(cJSON *) cJSON_DetachItemFromObjectCaseSensitive(cJSON *object, const char *string) +{ + cJSON *to_detach = cJSON_GetObjectItemCaseSensitive(object, string); + + return 
cJSON_DetachItemViaPointer(object, to_detach); +} + +CJSON_PUBLIC(void) cJSON_DeleteItemFromObject(cJSON *object, const char *string) +{ + cJSON_Delete(cJSON_DetachItemFromObject(object, string)); +} + +CJSON_PUBLIC(void) cJSON_DeleteItemFromObjectCaseSensitive(cJSON *object, const char *string) +{ + cJSON_Delete(cJSON_DetachItemFromObjectCaseSensitive(object, string)); +} /* Replace array/object items with new ones. */ -void cJSON_InsertItemInArray(cJSON *array,int which,cJSON *newitem) {cJSON *c=array->child;while (c && which>0) c=c->next,which--;if (!c) {cJSON_AddItemToArray(array,newitem);return;} - newitem->next=c;newitem->prev=c->prev;c->prev=newitem;if (c==array->child) array->child=newitem; else newitem->prev->next=newitem;} -void cJSON_ReplaceItemInArray(cJSON *array,int which,cJSON *newitem) {cJSON *c=array->child;while (c && which>0) c=c->next,which--;if (!c) return; - newitem->next=c->next;newitem->prev=c->prev;if (newitem->next) newitem->next->prev=newitem; - if (c==array->child) array->child=newitem; else newitem->prev->next=newitem;c->next=c->prev=0;cJSON_Delete(c);} -void cJSON_ReplaceItemInObject(cJSON *object,const char *string,cJSON *newitem){int i=0;cJSON *c=object->child;while(c && cJSON_strcasecmp(c->string,string))i++,c=c->next;if(c){newitem->string=cJSON_strdup(string);cJSON_ReplaceItemInArray(object,i,newitem);}} +CJSON_PUBLIC(void) cJSON_InsertItemInArray(cJSON *array, int which, cJSON *newitem) +{ + cJSON *after_inserted = NULL; + + if (which < 0) + { + return; + } + + after_inserted = get_array_item(array, (size_t)which); + if (after_inserted == NULL) + { + add_item_to_array(array, newitem); + return; + } + + newitem->next = after_inserted; + newitem->prev = after_inserted->prev; + after_inserted->prev = newitem; + if (after_inserted == array->child) + { + array->child = newitem; + } + else + { + newitem->prev->next = newitem; + } +} + +CJSON_PUBLIC(cJSON_bool) cJSON_ReplaceItemViaPointer(cJSON * const parent, cJSON * const item, cJSON * 
replacement) +{ + if ((parent == NULL) || (replacement == NULL) || (item == NULL)) + { + return false; + } + + if (replacement == item) + { + return true; + } + + replacement->next = item->next; + replacement->prev = item->prev; + + if (replacement->next != NULL) + { + replacement->next->prev = replacement; + } + if (replacement->prev != NULL) + { + replacement->prev->next = replacement; + } + if (parent->child == item) + { + parent->child = replacement; + } + + item->next = NULL; + item->prev = NULL; + cJSON_Delete(item); + + return true; +} + +CJSON_PUBLIC(void) cJSON_ReplaceItemInArray(cJSON *array, int which, cJSON *newitem) +{ + if (which < 0) + { + return; + } + + cJSON_ReplaceItemViaPointer(array, get_array_item(array, (size_t)which), newitem); +} + +static cJSON_bool replace_item_in_object(cJSON *object, const char *string, cJSON *replacement, cJSON_bool case_sensitive) +{ + if ((replacement == NULL) || (string == NULL)) + { + return false; + } + + /* replace the name in the replacement */ + if (!(replacement->type & cJSON_StringIsConst) && (replacement->string != NULL)) + { + cJSON_free(replacement->string); + } + replacement->string = (char*)cJSON_strdup((const unsigned char*)string, &global_hooks); + replacement->type &= ~cJSON_StringIsConst; + + cJSON_ReplaceItemViaPointer(object, get_object_item(object, string, case_sensitive), replacement); + + return true; +} + +CJSON_PUBLIC(void) cJSON_ReplaceItemInObject(cJSON *object, const char *string, cJSON *newitem) +{ + replace_item_in_object(object, string, newitem, false); +} + +CJSON_PUBLIC(void) cJSON_ReplaceItemInObjectCaseSensitive(cJSON *object, const char *string, cJSON *newitem) +{ + replace_item_in_object(object, string, newitem, true); +} /* Create basic types: */ -cJSON *cJSON_CreateNull(void) {cJSON *item=cJSON_New_Item();if(item)item->type=cJSON_NULL;return item;} -cJSON *cJSON_CreateTrue(void) {cJSON *item=cJSON_New_Item();if(item)item->type=cJSON_True;return item;} -cJSON 
*cJSON_CreateFalse(void) {cJSON *item=cJSON_New_Item();if(item)item->type=cJSON_False;return item;} -cJSON *cJSON_CreateBool(int b) {cJSON *item=cJSON_New_Item();if(item)item->type=b?cJSON_True:cJSON_False;return item;} -cJSON *cJSON_CreateNumber(double num) {cJSON *item=cJSON_New_Item();if(item){item->type=cJSON_Number;item->valuedouble=num;item->valueint=(int)num;}return item;} -cJSON *cJSON_CreateString(const char *string) {cJSON *item=cJSON_New_Item();if(item){item->type=cJSON_String;item->valuestring=cJSON_strdup(string);}return item;} -cJSON *cJSON_CreateArray(void) {cJSON *item=cJSON_New_Item();if(item)item->type=cJSON_Array;return item;} -cJSON *cJSON_CreateObject(void) {cJSON *item=cJSON_New_Item();if(item)item->type=cJSON_Object;return item;} +CJSON_PUBLIC(cJSON *) cJSON_CreateNull(void) +{ + cJSON *item = cJSON_New_Item(&global_hooks); + if (item) + { + item->type = cJSON_NULL; + } + + return item; +} + +CJSON_PUBLIC(cJSON *) cJSON_CreateTrue(void) +{ + cJSON *item = cJSON_New_Item(&global_hooks); + if (item) + { + item->type = cJSON_True; + } + + return item; +} + +CJSON_PUBLIC(cJSON *) cJSON_CreateFalse(void) +{ + cJSON *item = cJSON_New_Item(&global_hooks); + if (item) + { + item->type = cJSON_False; + } + + return item; +} + +CJSON_PUBLIC(cJSON *) cJSON_CreateBool(cJSON_bool b) +{ + cJSON *item = cJSON_New_Item(&global_hooks); + if (item) + { + item->type = b ? 
cJSON_True : cJSON_False; + } + + return item; +} + +CJSON_PUBLIC(cJSON *) cJSON_CreateNumber(double num) +{ + cJSON *item = cJSON_New_Item(&global_hooks); + if (item) + { + item->type = cJSON_Number; + item->valuedouble = num; + + /* use saturation in case of overflow */ + if (num >= INT_MAX) + { + item->valueint = INT_MAX; + } + else if (num <= INT_MIN) + { + item->valueint = INT_MIN; + } + else + { + item->valueint = (int)num; + } + } + + return item; +} + +CJSON_PUBLIC(cJSON *) cJSON_CreateString(const char *string) +{ + cJSON *item = cJSON_New_Item(&global_hooks); + if (item) + { + item->type = cJSON_String; + item->valuestring = (char*)cJSON_strdup((const unsigned char*)string, &global_hooks); + if (!item->valuestring) + { + cJSON_Delete(item); + return NULL; + } + } + + return item; +} + +CJSON_PUBLIC(cJSON *) cJSON_CreateStringReference(const char *string) +{ + cJSON *item = cJSON_New_Item(&global_hooks); + if (item != NULL) + { + item->type = cJSON_String | cJSON_IsReference; + item->valuestring = (char*)cast_away_const(string); + } + + return item; +} + +CJSON_PUBLIC(cJSON *) cJSON_CreateObjectReference(const cJSON *child) +{ + cJSON *item = cJSON_New_Item(&global_hooks); + if (item != NULL) { + item->type = cJSON_Object | cJSON_IsReference; + item->child = (cJSON*)cast_away_const(child); + } + + return item; +} + +CJSON_PUBLIC(cJSON *) cJSON_CreateArrayReference(const cJSON *child) { + cJSON *item = cJSON_New_Item(&global_hooks); + if (item != NULL) { + item->type = cJSON_Array | cJSON_IsReference; + item->child = (cJSON*)cast_away_const(child); + } + + return item; +} + +CJSON_PUBLIC(cJSON *) cJSON_CreateRaw(const char *raw) +{ + cJSON *item = cJSON_New_Item(&global_hooks); + if (item) + { + item->type = cJSON_Raw; + item->valuestring = (char*)cJSON_strdup((const unsigned char*)raw, &global_hooks); + if (!item->valuestring) + { + cJSON_Delete(item); + return NULL; + } + } + + return item; +} + +CJSON_PUBLIC(cJSON *) cJSON_CreateArray(void) +{ + cJSON 
*item = cJSON_New_Item(&global_hooks); + if (item) + { + item->type = cJSON_Array; + } + + return item; +} + +CJSON_PUBLIC(cJSON *) cJSON_CreateObject(void) +{ + cJSON *item = cJSON_New_Item(&global_hooks); + if (item) + { + item->type = cJSON_Object; + } + + return item; +} /* Create Arrays: */ -cJSON *cJSON_CreateIntArray(const int *numbers,int count) {int i;cJSON *n=0,*p=0,*a=cJSON_CreateArray();for(i=0;a && ichild=n;else suffix_object(p,n);p=n;}return a;} -cJSON *cJSON_CreateFloatArray(const float *numbers,int count) {int i;cJSON *n=0,*p=0,*a=cJSON_CreateArray();for(i=0;a && ichild=n;else suffix_object(p,n);p=n;}return a;} -cJSON *cJSON_CreateDoubleArray(const double *numbers,int count) {int i;cJSON *n=0,*p=0,*a=cJSON_CreateArray();for(i=0;a && ichild=n;else suffix_object(p,n);p=n;}return a;} -cJSON *cJSON_CreateStringArray(const char **strings,int count) {int i;cJSON *n=0,*p=0,*a=cJSON_CreateArray();for(i=0;a && ichild=n;else suffix_object(p,n);p=n;}return a;} +CJSON_PUBLIC(cJSON *) cJSON_CreateIntArray(const int *numbers, int count) +{ + size_t i = 0; + cJSON *n = NULL; + cJSON *p = NULL; + cJSON *a = NULL; + + if ((count < 0) || (numbers == NULL)) + { + return NULL; + } + + a = cJSON_CreateArray(); + for (i = 0; a && (i < (size_t)count); i++) + { + n = cJSON_CreateNumber(numbers[i]); + if (!n) + { + cJSON_Delete(a); + return NULL; + } + if (!i) + { + a->child = n; + } + else + { + suffix_object(p, n); + } + p = n; + } + + return a; +} + +CJSON_PUBLIC(cJSON *) cJSON_CreateFloatArray(const float *numbers, int count) +{ + size_t i = 0; + cJSON *n = NULL; + cJSON *p = NULL; + cJSON *a = NULL; + + if ((count < 0) || (numbers == NULL)) + { + return NULL; + } + + a = cJSON_CreateArray(); + + for (i = 0; a && (i < (size_t)count); i++) + { + n = cJSON_CreateNumber((double)numbers[i]); + if (!n) + { + cJSON_Delete(a); + return NULL; + } + if (!i) + { + a->child = n; + } + else + { + suffix_object(p, n); + } + p = n; + } + + return a; +} + +CJSON_PUBLIC(cJSON *) 
cJSON_CreateDoubleArray(const double *numbers, int count) +{ + size_t i = 0; + cJSON *n = NULL; + cJSON *p = NULL; + cJSON *a = NULL; + + if ((count < 0) || (numbers == NULL)) + { + return NULL; + } + + a = cJSON_CreateArray(); + + for (i = 0; a && (i < (size_t)count); i++) + { + n = cJSON_CreateNumber(numbers[i]); + if (!n) + { + cJSON_Delete(a); + return NULL; + } + if (!i) + { + a->child = n; + } + else + { + suffix_object(p, n); + } + p = n; + } + + return a; +} + +CJSON_PUBLIC(cJSON *) cJSON_CreateStringArray(const char **strings, int count) +{ + size_t i = 0; + cJSON *n = NULL; + cJSON *p = NULL; + cJSON *a = NULL; + + if ((count < 0) || (strings == NULL)) + { + return NULL; + } + + a = cJSON_CreateArray(); + + for (i = 0; a && (i < (size_t)count); i++) + { + n = cJSON_CreateString(strings[i]); + if (!n) + { + cJSON_Delete(a); + return NULL; + } + if (!i) + { + a->child = n; + } + else + { + suffix_object(p, n); + } + p = n; + } + + return a; +} /* Duplication */ -cJSON *cJSON_Duplicate(cJSON *item,int recurse) -{ - cJSON *newitem,*cptr,*nptr=0,*newchild; - /* Bail on bad ptr */ - if (!item) return 0; - /* Create new item */ - newitem=cJSON_New_Item(); - if (!newitem) return 0; - /* Copy over all vars */ - newitem->type=item->type&(~cJSON_IsReference),newitem->valueint=item->valueint,newitem->valuedouble=item->valuedouble; - if (item->valuestring) {newitem->valuestring=cJSON_strdup(item->valuestring); if (!newitem->valuestring) {cJSON_Delete(newitem);return 0;}} - if (item->string) {newitem->string=cJSON_strdup(item->string); if (!newitem->string) {cJSON_Delete(newitem);return 0;}} - /* If non-recursive, then we're done! */ - if (!recurse) return newitem; - /* Walk the ->next chain for the child. 
*/ - cptr=item->child; - while (cptr) - { - newchild=cJSON_Duplicate(cptr,1); /* Duplicate (with recurse) each item in the ->next chain */ - if (!newchild) {cJSON_Delete(newitem);return 0;} - if (nptr) {nptr->next=newchild,newchild->prev=nptr;nptr=newchild;} /* If newitem->child already set, then crosswire ->prev and ->next and move on */ - else {newitem->child=newchild;nptr=newchild;} /* Set newitem->child and move to it */ - cptr=cptr->next; - } - return newitem; -} - -void cJSON_Minify(char *json) -{ - char *into=json; - while (*json) - { - if (*json==' ') json++; - else if (*json=='\t') json++; /* Whitespace characters. */ - else if (*json=='\r') json++; - else if (*json=='\n') json++; - else if (*json=='/' && json[1]=='/') while (*json && *json!='\n') json++; /* double-slash comments, to end of line. */ - else if (*json=='/' && json[1]=='*') {while (*json && !(*json=='*' && json[1]=='/')) json++;json+=2;} /* multiline comments. */ - else if (*json=='\"'){*into++=*json++;while (*json && *json!='\"'){if (*json=='\\') *into++=*json++;*into++=*json++;}*into++=*json++;} /* string literals, which are \" sensitive. */ - else *into++=*json++; /* All other characters. */ - } - *into=0; /* and null-terminate. */ +CJSON_PUBLIC(cJSON *) cJSON_Duplicate(const cJSON *item, cJSON_bool recurse) +{ + cJSON *newitem = NULL; + cJSON *child = NULL; + cJSON *next = NULL; + cJSON *newchild = NULL; + + /* Bail on bad ptr */ + if (!item) + { + goto fail; + } + /* Create new item */ + newitem = cJSON_New_Item(&global_hooks); + if (!newitem) + { + goto fail; + } + /* Copy over all vars */ + newitem->type = item->type & (~cJSON_IsReference); + newitem->valueint = item->valueint; + newitem->valuedouble = item->valuedouble; + if (item->valuestring) + { + newitem->valuestring = (char*)cJSON_strdup((unsigned char*)item->valuestring, &global_hooks); + if (!newitem->valuestring) + { + goto fail; + } + } + if (item->string) + { + newitem->string = (item->type&cJSON_StringIsConst) ? 
item->string : (char*)cJSON_strdup((unsigned char*)item->string, &global_hooks); + if (!newitem->string) + { + goto fail; + } + } + /* If non-recursive, then we're done! */ + if (!recurse) + { + return newitem; + } + /* Walk the ->next chain for the child. */ + child = item->child; + while (child != NULL) + { + newchild = cJSON_Duplicate(child, true); /* Duplicate (with recurse) each item in the ->next chain */ + if (!newchild) + { + goto fail; + } + if (next != NULL) + { + /* If newitem->child already set, then crosswire ->prev and ->next and move on */ + next->next = newchild; + newchild->prev = next; + next = newchild; + } + else + { + /* Set newitem->child and move to it */ + newitem->child = newchild; + next = newchild; + } + child = child->next; + } + + return newitem; + +fail: + if (newitem != NULL) + { + cJSON_Delete(newitem); + } + + return NULL; +} + +CJSON_PUBLIC(void) cJSON_Minify(char *json) +{ + unsigned char *into = (unsigned char*)json; + + if (json == NULL) + { + return; + } + + while (*json) + { + if (*json == ' ') + { + json++; + } + else if (*json == '\t') + { + /* Whitespace characters. */ + json++; + } + else if (*json == '\r') + { + json++; + } + else if (*json == '\n') + { + json++; + } + else if ((*json == '/') && (json[1] == '/')) + { + /* double-slash comments, to end of line. */ + while (*json && (*json != '\n')) + { + json++; + } + } + else if ((*json == '/') && (json[1] == '*')) + { + /* multiline comments. */ + while (*json && !((*json == '*') && (json[1] == '/'))) + { + json++; + } + json += 2; + } + else if (*json == '\"') + { + /* string literals, which are \" sensitive. */ + *into++ = (unsigned char)*json++; + while (*json && (*json != '\"')) + { + if (*json == '\\') + { + *into++ = (unsigned char)*json++; + } + *into++ = (unsigned char)*json++; + } + *into++ = (unsigned char)*json++; + } + else + { + /* All other characters. */ + *into++ = (unsigned char)*json++; + } + } + + /* and null-terminate. 
*/ + *into = '\0'; } +CJSON_PUBLIC(cJSON_bool) cJSON_IsInvalid(const cJSON * const item) +{ + if (item == NULL) + { + return false; + } + + return (item->type & 0xFF) == cJSON_Invalid; +} + +CJSON_PUBLIC(cJSON_bool) cJSON_IsFalse(const cJSON * const item) +{ + if (item == NULL) + { + return false; + } + + return (item->type & 0xFF) == cJSON_False; +} + +CJSON_PUBLIC(cJSON_bool) cJSON_IsTrue(const cJSON * const item) +{ + if (item == NULL) + { + return false; + } + + return (item->type & 0xff) == cJSON_True; +} + + +CJSON_PUBLIC(cJSON_bool) cJSON_IsBool(const cJSON * const item) +{ + if (item == NULL) + { + return false; + } + + return (item->type & (cJSON_True | cJSON_False)) != 0; +} +CJSON_PUBLIC(cJSON_bool) cJSON_IsNull(const cJSON * const item) +{ + if (item == NULL) + { + return false; + } + + return (item->type & 0xFF) == cJSON_NULL; +} + +CJSON_PUBLIC(cJSON_bool) cJSON_IsNumber(const cJSON * const item) +{ + if (item == NULL) + { + return false; + } + + return (item->type & 0xFF) == cJSON_Number; +} + +CJSON_PUBLIC(cJSON_bool) cJSON_IsString(const cJSON * const item) +{ + if (item == NULL) + { + return false; + } + + return (item->type & 0xFF) == cJSON_String; +} + +CJSON_PUBLIC(cJSON_bool) cJSON_IsArray(const cJSON * const item) +{ + if (item == NULL) + { + return false; + } + + return (item->type & 0xFF) == cJSON_Array; +} + +CJSON_PUBLIC(cJSON_bool) cJSON_IsObject(const cJSON * const item) +{ + if (item == NULL) + { + return false; + } + + return (item->type & 0xFF) == cJSON_Object; +} + +CJSON_PUBLIC(cJSON_bool) cJSON_IsRaw(const cJSON * const item) +{ + if (item == NULL) + { + return false; + } + + return (item->type & 0xFF) == cJSON_Raw; +} + +CJSON_PUBLIC(cJSON_bool) cJSON_Compare(const cJSON * const a, const cJSON * const b, const cJSON_bool case_sensitive) +{ + if ((a == NULL) || (b == NULL) || ((a->type & 0xFF) != (b->type & 0xFF)) || cJSON_IsInvalid(a)) + { + return false; + } + + /* check if type is valid */ + switch (a->type & 0xFF) + { + case 
cJSON_False: + case cJSON_True: + case cJSON_NULL: + case cJSON_Number: + case cJSON_String: + case cJSON_Raw: + case cJSON_Array: + case cJSON_Object: + break; + + default: + return false; + } + + /* identical objects are equal */ + if (a == b) + { + return true; + } + + switch (a->type & 0xFF) + { + /* in these cases and equal type is enough */ + case cJSON_False: + case cJSON_True: + case cJSON_NULL: + return true; + + case cJSON_Number: + if (a->valuedouble == b->valuedouble) + { + return true; + } + return false; + + case cJSON_String: + case cJSON_Raw: + if ((a->valuestring == NULL) || (b->valuestring == NULL)) + { + return false; + } + if (strcmp(a->valuestring, b->valuestring) == 0) + { + return true; + } + + return false; + + case cJSON_Array: + { + cJSON *a_element = a->child; + cJSON *b_element = b->child; + + for (; (a_element != NULL) && (b_element != NULL);) + { + if (!cJSON_Compare(a_element, b_element, case_sensitive)) + { + return false; + } + + a_element = a_element->next; + b_element = b_element->next; + } + + /* one of the arrays is longer than the other */ + if (a_element != b_element) { + return false; + } + + return true; + } + + case cJSON_Object: + { + cJSON *a_element = NULL; + cJSON *b_element = NULL; + cJSON_ArrayForEach(a_element, a) + { + /* TODO This has O(n^2) runtime, which is horrible! 
*/ + b_element = get_object_item(b, a_element->string, case_sensitive); + if (b_element == NULL) + { + return false; + } + + if (!cJSON_Compare(a_element, b_element, case_sensitive)) + { + return false; + } + } + + /* doing this twice, once on a and b to prevent true comparison if a subset of b + * TODO: Do this the proper way, this is just a fix for now */ + cJSON_ArrayForEach(b_element, b) + { + a_element = get_object_item(a, b_element->string, case_sensitive); + if (a_element == NULL) + { + return false; + } + + if (!cJSON_Compare(b_element, a_element, case_sensitive)) + { + return false; + } + } + + return true; + } + + default: + return false; + } +} + +CJSON_PUBLIC(void *) cJSON_malloc(size_t size) +{ + return global_hooks.allocate(size); +} + +CJSON_PUBLIC(void) cJSON_free(void *object) +{ + global_hooks.deallocate(object); +} \ No newline at end of file diff --git a/source/code/cjson/cJSON.h b/source/code/cjson/cJSON.h index 662948612..d4a2dfed3 100644 --- a/source/code/cjson/cJSON.h +++ b/source/code/cjson/cJSON.h @@ -1,147 +1,285 @@ /* - Copyright (c) 2009 Dave Gamble - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. +Copyright (c) 2009-2017 Dave Gamble and cJSON contributors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. */ #ifndef cJSON__h #define cJSON__h + #ifdef __cplusplus extern "C" { #endif -/* cJSON Types: */ -#define cJSON_False 0 -#define cJSON_True 1 -#define cJSON_NULL 2 -#define cJSON_Number 3 -#define cJSON_String 4 -#define cJSON_Array 5 -#define cJSON_Object 6 - +#if !defined(__WINDOWS__) && (defined(WIN32) || defined(WIN64) || defined(_MSC_VER) || defined(_WIN32)) +#define __WINDOWS__ +#endif + +#ifdef __WINDOWS__ + + /* When compiling for windows, we specify a specific calling convention to avoid issues where we are being called from a project with a different default calling convention. 
For windows you have 3 define options: + + CJSON_HIDE_SYMBOLS - Define this in the case where you don't want to ever dllexport symbols + CJSON_EXPORT_SYMBOLS - Define this on library build when you want to dllexport symbols (default) + CJSON_IMPORT_SYMBOLS - Define this if you want to dllimport symbol + + For *nix builds that support visibility attribute, you can define similar behavior by + + setting default visibility to hidden by adding + -fvisibility=hidden (for gcc) + or + -xldscope=hidden (for sun cc) + to CFLAGS + + then using the CJSON_API_VISIBILITY flag to "export" the same symbols the way CJSON_EXPORT_SYMBOLS does + + */ + +#define CJSON_CDECL __cdecl +#define CJSON_STDCALL __stdcall + + /* export symbols by default, this is necessary for copy pasting the C and header file */ +#if !defined(CJSON_HIDE_SYMBOLS) && !defined(CJSON_IMPORT_SYMBOLS) && !defined(CJSON_EXPORT_SYMBOLS) +#define CJSON_EXPORT_SYMBOLS +#endif + +#if defined(CJSON_HIDE_SYMBOLS) +#define CJSON_PUBLIC(type) type CJSON_STDCALL +#elif defined(CJSON_EXPORT_SYMBOLS) +#define CJSON_PUBLIC(type) __declspec(dllexport) type CJSON_STDCALL +#elif defined(CJSON_IMPORT_SYMBOLS) +#define CJSON_PUBLIC(type) __declspec(dllimport) type CJSON_STDCALL +#endif +#else /* !__WINDOWS__ */ +#define CJSON_CDECL +#define CJSON_STDCALL + +#if (defined(__GNUC__) || defined(__SUNPRO_CC) || defined (__SUNPRO_C)) && defined(CJSON_API_VISIBILITY) +#define CJSON_PUBLIC(type) __attribute__((visibility("default"))) type +#else +#define CJSON_PUBLIC(type) type +#endif +#endif + + /* project version */ +#define CJSON_VERSION_MAJOR 1 +#define CJSON_VERSION_MINOR 7 +#define CJSON_VERSION_PATCH 8 + +#include + + /* cJSON Types: */ +#define cJSON_Invalid (0) +#define cJSON_False (1 << 0) +#define cJSON_True (1 << 1) +#define cJSON_NULL (1 << 2) +#define cJSON_Number (1 << 3) +#define cJSON_String (1 << 4) +#define cJSON_Array (1 << 5) +#define cJSON_Object (1 << 6) +#define cJSON_Raw (1 << 7) /* raw json */ + #define 
cJSON_IsReference 256 #define cJSON_StringIsConst 512 -/* The cJSON structure: */ - typedef struct cJSON { - struct cJSON *next,*prev; /* next/prev allow you to walk array/object chains. Alternatively, use GetArraySize/GetArrayItem/GetObjectItem */ - struct cJSON *child; /* An array or object item will have a child pointer pointing to a chain of the items in the array/object. */ - - int type; /* The type of the item, as above. */ - - char *valuestring; /* The item's string, if type==cJSON_String */ - int valueint; /* The item's number, if type==cJSON_Number */ - double valuedouble; /* The item's number, if type==cJSON_Number */ - - char *string; /* The item's name string, if this item is the child of, or is in the list of subitems of an object. */ - } cJSON; - - typedef struct cJSON_Hooks { - void *(*malloc_fn)(size_t sz); - void (*free_fn)(void *ptr); - } cJSON_Hooks; - -/* Supply malloc, realloc and free functions to cJSON */ - extern void cJSON_InitHooks(cJSON_Hooks* hooks); - - -/* Supply a block of JSON, and this returns a cJSON object you can interrogate. Call cJSON_Delete when finished. */ - extern cJSON *cJSON_Parse(const char *value); -/* Render a cJSON entity to text for transfer/storage. Free the char* when finished. */ - extern char *cJSON_Print(cJSON *item); -/* Render a cJSON entity to text for transfer/storage without any formatting. Free the char* when finished. */ - extern char *cJSON_PrintUnformatted(cJSON *item); -/* Render a cJSON entity to text using a buffered strategy. prebuffer is a guess at the final size. guessing well reduces reallocation. fmt=0 gives unformatted, =1 gives formatted */ - extern char *cJSON_PrintBuffered(cJSON *item,int prebuffer,int fmt); -/* Delete a cJSON entity and all subentities. */ - extern void cJSON_Delete(cJSON *c); - -/* Returns the number of items in an array (or object). */ - extern int cJSON_GetArraySize(cJSON *array); -/* Retrieve item number "item" from array "array". Returns NULL if unsuccessful. 
*/ - extern cJSON *cJSON_GetArrayItem(cJSON *array,int item); -/* Get item "string" from object. Case insensitive. */ - extern cJSON *cJSON_GetObjectItem(cJSON *object,const char *string); - -/* For analysing failed parses. This returns a pointer to the parse error. You'll probably need to look a few chars back to make sense of it. Defined when cJSON_Parse() returns 0. 0 when cJSON_Parse() succeeds. */ - extern const char *cJSON_GetErrorPtr(void); - -/* These calls create a cJSON item of the appropriate type. */ - extern cJSON *cJSON_CreateNull(void); - extern cJSON *cJSON_CreateTrue(void); - extern cJSON *cJSON_CreateFalse(void); - extern cJSON *cJSON_CreateBool(int b); - extern cJSON *cJSON_CreateNumber(double num); - extern cJSON *cJSON_CreateString(const char *string); - extern cJSON *cJSON_CreateArray(void); - extern cJSON *cJSON_CreateObject(void); - -/* These utilities create an Array of count items. */ - extern cJSON *cJSON_CreateIntArray(const int *numbers,int count); - extern cJSON *cJSON_CreateFloatArray(const float *numbers,int count); - extern cJSON *cJSON_CreateDoubleArray(const double *numbers,int count); - extern cJSON *cJSON_CreateStringArray(const char **strings,int count); - -/* Append item to the specified array/object. */ - extern void cJSON_AddItemToArray(cJSON *array, cJSON *item); - extern void cJSON_AddItemToObject(cJSON *object,const char *string,cJSON *item); - extern void cJSON_AddItemToObjectCS(cJSON *object,const char *string,cJSON *item); /* Use this when string is definitely const (i.e. a literal, or as good as), and will definitely survive the cJSON object */ -/* Append reference to item to the specified array/object. Use this when you want to add an existing cJSON to a new cJSON, but don't want to corrupt your existing cJSON. 
*/ - extern void cJSON_AddItemReferenceToArray(cJSON *array, cJSON *item); - extern void cJSON_AddItemReferenceToObject(cJSON *object,const char *string,cJSON *item); - -/* Remove/Detatch items from Arrays/Objects. */ - extern cJSON *cJSON_DetachItemFromArray(cJSON *array,int which); - extern void cJSON_DeleteItemFromArray(cJSON *array,int which); - extern cJSON *cJSON_DetachItemFromObject(cJSON *object,const char *string); - extern void cJSON_DeleteItemFromObject(cJSON *object,const char *string); - -/* Update array items. */ - extern void cJSON_InsertItemInArray(cJSON *array,int which,cJSON *newitem); /* Shifts pre-existing items to the right. */ - extern void cJSON_ReplaceItemInArray(cJSON *array,int which,cJSON *newitem); - extern void cJSON_ReplaceItemInObject(cJSON *object,const char *string,cJSON *newitem); - -/* Duplicate a cJSON item */ - extern cJSON *cJSON_Duplicate(cJSON *item,int recurse); -/* Duplicate will create a new, identical cJSON item to the one you pass, in new memory that will - need to be released. With recurse!=0, it will duplicate any children connected to the item. - The item->next and ->prev pointers are always zero on return from Duplicate. */ - -/* ParseWithOpts allows you to require (and check) that the JSON is null terminated, and to retrieve the pointer to the final byte parsed. */ - extern cJSON *cJSON_ParseWithOpts(const char *value,const char **return_parse_end,int require_null_terminated); - - extern void cJSON_Minify(char *json); - -/* Macros for creating things quickly. 
*/ -#define cJSON_AddNullToObject(object,name) cJSON_AddItemToObject(object, name, cJSON_CreateNull()) -#define cJSON_AddTrueToObject(object,name) cJSON_AddItemToObject(object, name, cJSON_CreateTrue()) -#define cJSON_AddFalseToObject(object,name) cJSON_AddItemToObject(object, name, cJSON_CreateFalse()) -#define cJSON_AddBoolToObject(object,name,b) cJSON_AddItemToObject(object, name, cJSON_CreateBool(b)) -#define cJSON_AddNumberToObject(object,name,n) cJSON_AddItemToObject(object, name, cJSON_CreateNumber(n)) -#define cJSON_AddStringToObject(object,name,s) cJSON_AddItemToObject(object, name, cJSON_CreateString(s)) - -/* When assigning an integer value, it needs to be propagated to valuedouble too. */ -#define cJSON_SetIntValue(object,val) ((object)?(object)->valueint=(object)->valuedouble=(val):(val)) -#define cJSON_SetNumberValue(object,val) ((object)?(object)->valueint=(object)->valuedouble=(val):(val)) + /* The cJSON structure: */ + typedef struct cJSON + { + /* next/prev allow you to walk array/object chains. Alternatively, use GetArraySize/GetArrayItem/GetObjectItem */ + struct cJSON *next; + struct cJSON *prev; + /* An array or object item will have a child pointer pointing to a chain of the items in the array/object. */ + struct cJSON *child; + + /* The type of the item, as above. */ + int type; + + /* The item's string, if type==cJSON_String and type == cJSON_Raw */ + char *valuestring; + /* writing to valueint is DEPRECATED, use cJSON_SetNumberValue instead */ + int valueint; + /* The item's number, if type==cJSON_Number */ + double valuedouble; + + /* The item's name string, if this item is the child of, or is in the list of subitems of an object. */ + char *string; + } cJSON; + + typedef struct cJSON_Hooks + { + /* malloc/free are CDECL on Windows regardless of the default calling convention of the compiler, so ensure the hooks allow passing those functions directly. 
*/ + void *(CJSON_CDECL *malloc_fn)(size_t sz); + void (CJSON_CDECL *free_fn)(void *ptr); + } cJSON_Hooks; + + typedef int cJSON_bool; + + /* Limits how deeply nested arrays/objects can be before cJSON rejects to parse them. + * This is to prevent stack overflows. */ +#ifndef CJSON_NESTING_LIMIT +#define CJSON_NESTING_LIMIT 1000 +#endif + + /* returns the version of cJSON as a string */ + CJSON_PUBLIC(const char*) cJSON_Version(void); + + /* Supply malloc, realloc and free functions to cJSON */ + CJSON_PUBLIC(void) cJSON_InitHooks(cJSON_Hooks* hooks); + + /* Memory Management: the caller is always responsible to free the results from all variants of cJSON_Parse (with cJSON_Delete) and cJSON_Print (with stdlib free, cJSON_Hooks.free_fn, or cJSON_free as appropriate). The exception is cJSON_PrintPreallocated, where the caller has full responsibility of the buffer. */ + /* Supply a block of JSON, and this returns a cJSON object you can interrogate. */ + CJSON_PUBLIC(cJSON *) cJSON_Parse(const char *value); + /* ParseWithOpts allows you to require (and check) that the JSON is null terminated, and to retrieve the pointer to the final byte parsed. */ + /* If you supply a ptr in return_parse_end and parsing fails, then return_parse_end will contain a pointer to the error so will match cJSON_GetErrorPtr(). */ + CJSON_PUBLIC(cJSON *) cJSON_ParseWithOpts(const char *value, const char **return_parse_end, cJSON_bool require_null_terminated); + + /* Render a cJSON entity to text for transfer/storage. */ + CJSON_PUBLIC(char *) cJSON_Print(const cJSON *item); + /* Render a cJSON entity to text for transfer/storage without any formatting. */ + CJSON_PUBLIC(char *) cJSON_PrintUnformatted(const cJSON *item); + /* Render a cJSON entity to text using a buffered strategy. prebuffer is a guess at the final size. guessing well reduces reallocation. 
fmt=0 gives unformatted, =1 gives formatted */ + CJSON_PUBLIC(char *) cJSON_PrintBuffered(const cJSON *item, int prebuffer, cJSON_bool fmt); + /* Render a cJSON entity to text using a buffer already allocated in memory with given length. Returns 1 on success and 0 on failure. */ + /* NOTE: cJSON is not always 100% accurate in estimating how much memory it will use, so to be safe allocate 5 bytes more than you actually need */ + CJSON_PUBLIC(cJSON_bool) cJSON_PrintPreallocated(cJSON *item, char *buffer, const int length, const cJSON_bool format); + /* Delete a cJSON entity and all subentities. */ + CJSON_PUBLIC(void) cJSON_Delete(cJSON *c); + + /* Returns the number of items in an array (or object). */ + CJSON_PUBLIC(int) cJSON_GetArraySize(const cJSON *array); + /* Retrieve item number "index" from array "array". Returns NULL if unsuccessful. */ + CJSON_PUBLIC(cJSON *) cJSON_GetArrayItem(const cJSON *array, int index); + /* Get item "string" from object. Case insensitive. */ + CJSON_PUBLIC(cJSON *) cJSON_GetObjectItem(const cJSON * const object, const char * const string); + CJSON_PUBLIC(cJSON *) cJSON_GetObjectItemCaseSensitive(const cJSON * const object, const char * const string); + CJSON_PUBLIC(cJSON_bool) cJSON_HasObjectItem(const cJSON *object, const char *string); + /* For analysing failed parses. This returns a pointer to the parse error. You'll probably need to look a few chars back to make sense of it. Defined when cJSON_Parse() returns 0. 0 when cJSON_Parse() succeeds. 
*/ + CJSON_PUBLIC(const char *) cJSON_GetErrorPtr(void); + + /* Check if the item is a string and return its valuestring */ + CJSON_PUBLIC(char *) cJSON_GetStringValue(cJSON *item); + + /* These functions check the type of an item */ + CJSON_PUBLIC(cJSON_bool) cJSON_IsInvalid(const cJSON * const item); + CJSON_PUBLIC(cJSON_bool) cJSON_IsFalse(const cJSON * const item); + CJSON_PUBLIC(cJSON_bool) cJSON_IsTrue(const cJSON * const item); + CJSON_PUBLIC(cJSON_bool) cJSON_IsBool(const cJSON * const item); + CJSON_PUBLIC(cJSON_bool) cJSON_IsNull(const cJSON * const item); + CJSON_PUBLIC(cJSON_bool) cJSON_IsNumber(const cJSON * const item); + CJSON_PUBLIC(cJSON_bool) cJSON_IsString(const cJSON * const item); + CJSON_PUBLIC(cJSON_bool) cJSON_IsArray(const cJSON * const item); + CJSON_PUBLIC(cJSON_bool) cJSON_IsObject(const cJSON * const item); + CJSON_PUBLIC(cJSON_bool) cJSON_IsRaw(const cJSON * const item); + + /* These calls create a cJSON item of the appropriate type. */ + CJSON_PUBLIC(cJSON *) cJSON_CreateNull(void); + CJSON_PUBLIC(cJSON *) cJSON_CreateTrue(void); + CJSON_PUBLIC(cJSON *) cJSON_CreateFalse(void); + CJSON_PUBLIC(cJSON *) cJSON_CreateBool(cJSON_bool boolean); + CJSON_PUBLIC(cJSON *) cJSON_CreateNumber(double num); + CJSON_PUBLIC(cJSON *) cJSON_CreateString(const char *string); + /* raw json */ + CJSON_PUBLIC(cJSON *) cJSON_CreateRaw(const char *raw); + CJSON_PUBLIC(cJSON *) cJSON_CreateArray(void); + CJSON_PUBLIC(cJSON *) cJSON_CreateObject(void); + + /* Create a string where valuestring references a string so + * it will not be freed by cJSON_Delete */ + CJSON_PUBLIC(cJSON *) cJSON_CreateStringReference(const char *string); + /* Create an object/arrray that only references it's elements so + * they will not be freed by cJSON_Delete */ + CJSON_PUBLIC(cJSON *) cJSON_CreateObjectReference(const cJSON *child); + CJSON_PUBLIC(cJSON *) cJSON_CreateArrayReference(const cJSON *child); + + /* These utilities create an Array of count items. 
*/ + CJSON_PUBLIC(cJSON *) cJSON_CreateIntArray(const int *numbers, int count); + CJSON_PUBLIC(cJSON *) cJSON_CreateFloatArray(const float *numbers, int count); + CJSON_PUBLIC(cJSON *) cJSON_CreateDoubleArray(const double *numbers, int count); + CJSON_PUBLIC(cJSON *) cJSON_CreateStringArray(const char **strings, int count); + + /* Append item to the specified array/object. */ + CJSON_PUBLIC(void) cJSON_AddItemToArray(cJSON *array, cJSON *item); + CJSON_PUBLIC(void) cJSON_AddItemToObject(cJSON *object, const char *string, cJSON *item); + /* Use this when string is definitely const (i.e. a literal, or as good as), and will definitely survive the cJSON object. + * WARNING: When this function was used, make sure to always check that (item->type & cJSON_StringIsConst) is zero before + * writing to `item->string` */ + CJSON_PUBLIC(void) cJSON_AddItemToObjectCS(cJSON *object, const char *string, cJSON *item); + /* Append reference to item to the specified array/object. Use this when you want to add an existing cJSON to a new cJSON, but don't want to corrupt your existing cJSON. */ + CJSON_PUBLIC(void) cJSON_AddItemReferenceToArray(cJSON *array, cJSON *item); + CJSON_PUBLIC(void) cJSON_AddItemReferenceToObject(cJSON *object, const char *string, cJSON *item); + + /* Remove/Detatch items from Arrays/Objects. */ + CJSON_PUBLIC(cJSON *) cJSON_DetachItemViaPointer(cJSON *parent, cJSON * const item); + CJSON_PUBLIC(cJSON *) cJSON_DetachItemFromArray(cJSON *array, int which); + CJSON_PUBLIC(void) cJSON_DeleteItemFromArray(cJSON *array, int which); + CJSON_PUBLIC(cJSON *) cJSON_DetachItemFromObject(cJSON *object, const char *string); + CJSON_PUBLIC(cJSON *) cJSON_DetachItemFromObjectCaseSensitive(cJSON *object, const char *string); + CJSON_PUBLIC(void) cJSON_DeleteItemFromObject(cJSON *object, const char *string); + CJSON_PUBLIC(void) cJSON_DeleteItemFromObjectCaseSensitive(cJSON *object, const char *string); + + /* Update array items. 
*/ + CJSON_PUBLIC(void) cJSON_InsertItemInArray(cJSON *array, int which, cJSON *newitem); /* Shifts pre-existing items to the right. */ + CJSON_PUBLIC(cJSON_bool) cJSON_ReplaceItemViaPointer(cJSON * const parent, cJSON * const item, cJSON * replacement); + CJSON_PUBLIC(void) cJSON_ReplaceItemInArray(cJSON *array, int which, cJSON *newitem); + CJSON_PUBLIC(void) cJSON_ReplaceItemInObject(cJSON *object, const char *string, cJSON *newitem); + CJSON_PUBLIC(void) cJSON_ReplaceItemInObjectCaseSensitive(cJSON *object, const char *string, cJSON *newitem); + + /* Duplicate a cJSON item */ + CJSON_PUBLIC(cJSON *) cJSON_Duplicate(const cJSON *item, cJSON_bool recurse); + /* Duplicate will create a new, identical cJSON item to the one you pass, in new memory that will + need to be released. With recurse!=0, it will duplicate any children connected to the item. + The item->next and ->prev pointers are always zero on return from Duplicate. */ + /* Recursively compare two cJSON items for equality. If either a or b is NULL or invalid, they will be considered unequal. + * case_sensitive determines if object keys are treated case sensitive (1) or case insensitive (0) */ + CJSON_PUBLIC(cJSON_bool) cJSON_Compare(const cJSON * const a, const cJSON * const b, const cJSON_bool case_sensitive); + + + CJSON_PUBLIC(void) cJSON_Minify(char *json); + + /* Helper functions for creating and adding items to an object at the same time. + * They return the added item or NULL on failure. 
*/ + CJSON_PUBLIC(cJSON*) cJSON_AddNullToObject(cJSON * const object, const char * const name); + CJSON_PUBLIC(cJSON*) cJSON_AddTrueToObject(cJSON * const object, const char * const name); + CJSON_PUBLIC(cJSON*) cJSON_AddFalseToObject(cJSON * const object, const char * const name); + CJSON_PUBLIC(cJSON*) cJSON_AddBoolToObject(cJSON * const object, const char * const name, const cJSON_bool boolean); + CJSON_PUBLIC(cJSON*) cJSON_AddNumberToObject(cJSON * const object, const char * const name, const double number); + CJSON_PUBLIC(cJSON*) cJSON_AddStringToObject(cJSON * const object, const char * const name, const char * const string); + CJSON_PUBLIC(cJSON*) cJSON_AddRawToObject(cJSON * const object, const char * const name, const char * const raw); + CJSON_PUBLIC(cJSON*) cJSON_AddObjectToObject(cJSON * const object, const char * const name); + CJSON_PUBLIC(cJSON*) cJSON_AddArrayToObject(cJSON * const object, const char * const name); + + /* When assigning an integer value, it needs to be propagated to valuedouble too. */ +#define cJSON_SetIntValue(object, number) ((object) ? (object)->valueint = (object)->valuedouble = (number) : (number)) + /* helper for the cJSON_SetNumberValue macro */ + CJSON_PUBLIC(double) cJSON_SetNumberHelper(cJSON *object, double number); +#define cJSON_SetNumberValue(object, number) ((object != NULL) ? cJSON_SetNumberHelper(object, (double)number) : (number)) + + /* Macro for iterating over an array or object */ +#define cJSON_ArrayForEach(element, array) for(element = (array != NULL) ? 
(array)->child : NULL; element != NULL; element = element->next) + + /* malloc/free objects using the malloc/free functions that have been set with cJSON_InitHooks */ + CJSON_PUBLIC(void *) cJSON_malloc(size_t size); + CJSON_PUBLIC(void) cJSON_free(void *object); #ifdef __cplusplus } #endif -#endif + +#endif \ No newline at end of file diff --git a/source/code/providers/Container_ContainerInventory_Class_Provider.cpp b/source/code/providers/Container_ContainerInventory_Class_Provider.cpp index 7fdd746a1..68c13053a 100644 --- a/source/code/providers/Container_ContainerInventory_Class_Provider.cpp +++ b/source/code/providers/Container_ContainerInventory_Class_Provider.cpp @@ -103,11 +103,11 @@ class ContainerQuery { cJSON* entry = cJSON_GetArrayItem(response[0], i); - if (entry) + if (entry != NULL) { cJSON* tags = cJSON_GetObjectItem(entry, "RepoTags"); - if (tags && cJSON_GetArraySize(tags)) + if ((tags != NULL) && cJSON_GetArraySize(tags)) { string value = ""; cJSON* arrItem = cJSON_GetArrayItem(tags, 0); @@ -168,7 +168,7 @@ class ContainerQuery try { cJSON* config = cJSON_GetObjectItem(entry, "Config"); - if (config) + if (config != NULL) { // Hostname of container string hostnamevalue = ""; @@ -232,11 +232,11 @@ class ContainerQuery // Compose group instance.ComposeGroup_value(""); - if (labels) + if (labels != NULL) { cJSON* groupName = cJSON_GetObjectItem(labels, "com.docker.compose.project"); - if (groupName) + if (groupName != NULL) { instance.ComposeGroup_value(groupName->valuestring); } @@ -244,7 +244,10 @@ class ContainerQuery } else { - syslog(LOG_WARNING, "Attempt in ObtainContainerConfig to get container %s config information returned null", cJSON_GetObjectItem(entry, "Id")->valuestring); + if (cJSON_GetObjectItem(entry, "Id") != NULL) + { + syslog(LOG_WARNING, "Attempt in ObtainContainerConfig to get container %s config information returned null", cJSON_GetObjectItem(entry, "Id")->valuestring); + } } } catch (std::exception &e) @@ -268,7 +271,7 @@ 
class ContainerQuery try { cJSON* state = cJSON_GetObjectItem(entry, "State"); - if (state) + if (state != NULL) { cJSON* objItem = cJSON_GetObjectItem(state, "ExitCode"); if (objItem != NULL) @@ -278,7 +281,10 @@ class ContainerQuery if (exitCode < 0) { exitCode = 128; - syslog(LOG_NOTICE, "Container %s returned negative exit code", cJSON_GetObjectItem(entry, "Id")->valuestring); + if (cJSON_GetObjectItem(entry, "Id") != NULL) + { + syslog(LOG_NOTICE, "Container %s returned negative exit code", cJSON_GetObjectItem(entry, "Id")->valuestring); + } } instance.ExitCode_value(exitCode); @@ -328,7 +334,10 @@ class ContainerQuery } else { - syslog(LOG_WARNING, "Attempt in ObtainContainerState to get container %s state information returned null", cJSON_GetObjectItem(entry, "Id")->valuestring); + if (cJSON_GetObjectItem(entry, "Id")) + { + syslog(LOG_WARNING, "Attempt in ObtainContainerState to get container %s state information returned null", cJSON_GetObjectItem(entry, "Id")->valuestring); + } } } catch (std::exception &e) @@ -352,7 +361,7 @@ class ContainerQuery try { cJSON* hostConfig = cJSON_GetObjectItem(entry, "HostConfig"); - if (hostConfig) + if (hostConfig != NULL) { // Links cJSON* objItem = cJSON_GetObjectItem(hostConfig, "Links"); @@ -372,7 +381,10 @@ class ContainerQuery } else { - syslog(LOG_WARNING, "Attempt in ObtainContainerHostConfig to get container %s host config information returned null", cJSON_GetObjectItem(entry, "Id")->valuestring); + if (cJSON_GetObjectItem(entry, "Id")) + { + syslog(LOG_WARNING, "Attempt in ObtainContainerHostConfig to get container %s host config information returned null", cJSON_GetObjectItem(entry, "Id")->valuestring); + } } } catch (std::exception &e) diff --git a/source/code/providers/Container_ContainerStatistics_Class_Provider.cpp b/source/code/providers/Container_ContainerStatistics_Class_Provider.cpp index c43057ec7..08b68b1d8 100644 --- a/source/code/providers/Container_ContainerStatistics_Class_Provider.cpp +++ 
b/source/code/providers/Container_ContainerStatistics_Class_Provider.cpp @@ -34,17 +34,17 @@ class StatsQuery int totalRx = 0; int totalTx = 0; - if (stats) + if (stats != NULL) { cJSON* network = cJSON_GetObjectItem(stats, "networks"); - if (network) + if (network != NULL) { // Docker 1.9+ network = network->child; // Sum the number of bytes from each NIC if there is more than one - while (network) + while (network != NULL) { cJSON* objItem = cJSON_GetObjectItem(network, "rx_bytes"); if (objItem != NULL) { @@ -66,7 +66,7 @@ class StatsQuery { // Docker 1.8.x network = cJSON_GetObjectItem(stats, "network"); - if (network) + if (network != NULL) { cJSON* objItem = cJSON_GetObjectItem(network, "rx_bytes"); if (objItem != NULL) { @@ -110,7 +110,7 @@ class StatsQuery static void TrySetContainerMemoryData(Container_ContainerStatistics_Class& instance, cJSON* stats) { try { - if (stats) + if (stats != NULL) { cJSON* memory_stats = cJSON_GetObjectItem(stats, "memory_stats"); if (memory_stats != NULL) { @@ -150,27 +150,27 @@ class StatsQuery instance.DiskBytesRead_value(0); instance.DiskBytesWritten_value(0); - if (stats) + if (stats != NULL) { cJSON* blkio_stats = cJSON_GetObjectItem(stats, "blkio_stats"); - if (blkio_stats) + if (blkio_stats != NULL) { cJSON* values = cJSON_GetObjectItem(blkio_stats, "io_service_bytes_recursive"); bool readFlag = false; bool writeFlag = false; - for (int i = 0; values && !(readFlag && writeFlag) && i < cJSON_GetArraySize(values); i++) + for (int i = 0; values != NULL && !(readFlag && writeFlag) && i < cJSON_GetArraySize(values); i++) { cJSON* entry = cJSON_GetArrayItem(values, i); - if (entry) + if (entry != NULL) { cJSON* op = cJSON_GetObjectItem(entry, "op"); cJSON* rawValue = cJSON_GetObjectItem(entry, "value"); - if (op && rawValue) + if ((op != NULL) && (rawValue != NULL)) { if (!strcmp(op->valuestring, "Read")) { @@ -215,15 +215,15 @@ class StatsQuery result["system"] = 0; try { - if (stats) + if (stats != NULL) { cJSON* cpu_stats 
= cJSON_GetObjectItem(stats, "cpu_stats"); - if (cpu_stats) + if (cpu_stats != NULL) { cJSON* cpu_usage = cJSON_GetObjectItem(cpu_stats, "cpu_usage"); - if (cpu_usage) + if (cpu_usage != NULL) { cJSON* objItem = cJSON_GetObjectItem(cpu_usage, "total_usage"); if (objItem != NULL) { @@ -269,15 +269,15 @@ class StatsQuery instance.CPUTotal_value(0); instance.CPUTotalPct_value(0); - if (stats) + if (stats != NULL) { cJSON* cpu_stats = cJSON_GetObjectItem(stats, "cpu_stats"); - if (cpu_stats) + if (cpu_stats != NULL) { cJSON* cpu_usage = cJSON_GetObjectItem(cpu_stats, "cpu_usage"); - if (cpu_usage) + if (cpu_usage != NULL) { cJSON* totalUsageItem = cJSON_GetObjectItem(cpu_usage, "total_usage"); cJSON* systemCpuUsageItem = cJSON_GetObjectItem(cpu_stats, "system_cpu_usage"); @@ -333,7 +333,7 @@ class StatsQuery { cJSON* entry = cJSON_GetArrayItem(response[0], i); - if (entry) + if (entry != NULL) { // New perf entry Container_ContainerStatistics_Class instance; @@ -396,7 +396,10 @@ class StatsQuery // See http://docs.docker.com/engine/reference/api/docker_remote_api_v1.21/#get-container-stats-based-on-resource-usage for example output if (!subResponse.empty() && subResponse[0]) { - TrySetContainerCpuData(result[i], subResponse[0], previousStatsList[i]); + if (i < previousStatsList.size()) + { + TrySetContainerCpuData(result[i], subResponse[0], previousStatsList[i]); + } // Set container name in 'InstanceName' field of Perf data. 
result[i].InstanceID_value(result[i].ElementName_value()); diff --git a/source/code/providers/Container_DaemonEvent_Class_Provider.cpp b/source/code/providers/Container_DaemonEvent_Class_Provider.cpp index d5d2ce6f2..bf2ab3b53 100644 --- a/source/code/providers/Container_DaemonEvent_Class_Provider.cpp +++ b/source/code/providers/Container_DaemonEvent_Class_Provider.cpp @@ -137,11 +137,11 @@ class EventQuery { cJSON* entry = cJSON_GetArrayItem(response[0], i); - if (entry) + if (entry != NULL) { cJSON* nameField = cJSON_GetObjectItem(entry, "Names"); - if (nameField && cJSON_GetArraySize(nameField)) + if ((nameField != NULL) && cJSON_GetArraySize(nameField)) { // Docker API documentation says that this field contains the short ID but that is not the case; use full ID instead cJSON* objItem = cJSON_GetObjectItem(entry, "Id"); @@ -239,7 +239,7 @@ class EventQuery cJSON* entry = cJSON_GetArrayItem(response[0], i); // the newer versions of the API may return objects that do not have status or id - if (entry && cJSON_GetObjectItem(entry, "status") != NULL && cJSON_GetObjectItem(entry, "id") != NULL) + if ((entry != NULL) && cJSON_GetObjectItem(entry, "status") != NULL && cJSON_GetObjectItem(entry, "id") != NULL) { // New inventory entry Container_DaemonEvent_Class instance; diff --git a/source/code/providers/Container_ImageInventory_Class_Provider.cpp b/source/code/providers/Container_ImageInventory_Class_Provider.cpp index 3cc088683..01d1c639c 100644 --- a/source/code/providers/Container_ImageInventory_Class_Provider.cpp +++ b/source/code/providers/Container_ImageInventory_Class_Provider.cpp @@ -35,7 +35,7 @@ class InventoryQuery string result = ""; try { - if (tags && cJSON_GetArraySize(tags)) + if ((tags != NULL) && cJSON_GetArraySize(tags)) { bool flag = false; @@ -164,7 +164,7 @@ class InventoryQuery try { cJSON* state = cJSON_GetObjectItem(entry, "State"); - if (state) + if (state != NULL) { cJSON* objItem = cJSON_GetObjectItem(entry, "Image"); if (objItem != NULL) 
@@ -173,10 +173,10 @@ class InventoryQuery { string id = string(objItem->valuestring); - if (cJSON_GetObjectItem(state, "Running")->valueint) + if (cJSON_GetObjectItem(state, "Running") != NULL && cJSON_GetObjectItem(state, "Running")->valueint) { // Running container - if (cJSON_GetObjectItem(state, "Paused")->valueint) + if (cJSON_GetObjectItem(state, "Paused") != NULL && cJSON_GetObjectItem(state, "Paused")->valueint) { // Paused container instances[idTable[id]].Paused_value(instances[idTable[id]].Paused_value() + 1); @@ -188,7 +188,7 @@ class InventoryQuery } else { - if (cJSON_GetObjectItem(state, "ExitCode")->valueint) + if (cJSON_GetObjectItem(state, "ExitCode") != NULL && cJSON_GetObjectItem(state, "ExitCode")->valueint) { // Container exited nonzero instances[idTable[id]].Failed_value(instances[idTable[id]].Failed_value() + 1); @@ -206,7 +206,10 @@ class InventoryQuery } else { - syslog(LOG_WARNING, "Attempt in ObtainContainerState to get container %s state information returned null", cJSON_GetObjectItem(entry, "Id")->valuestring); + if (cJSON_GetObjectItem(entry, "Id") != NULL) + { + syslog(LOG_WARNING, "Attempt in ObtainContainerState to get container %s state information returned null", cJSON_GetObjectItem(entry, "Id")->valuestring); + } } } catch (std::exception &e) @@ -239,7 +242,7 @@ class InventoryQuery { cJSON* entry = cJSON_GetArrayItem(response[0], i); - if (entry) + if (entry != NULL) { cJSON* objItem = cJSON_GetObjectItem(entry, "Id"); if (objItem != NULL) @@ -321,7 +324,7 @@ class InventoryQuery { cJSON* entry = cJSON_GetArrayItem(response[0], i); - if (entry) + if (entry != NULL) { // New inventory entry Container_ImageInventory_Class instance; diff --git a/source/code/providers/Container_Process_Class_Provider.cpp b/source/code/providers/Container_Process_Class_Provider.cpp index 76b15bdfc..9adc4edcd 100644 --- a/source/code/providers/Container_Process_Class_Provider.cpp +++ b/source/code/providers/Container_Process_Class_Provider.cpp @@ 
-55,7 +55,7 @@ class ContainerProcessQuery for (int i = 0; i < cJSON_GetArraySize(dockerPsResponse[0]); i++) { cJSON* containerEntry = cJSON_GetArrayItem(dockerPsResponse[0], i); - if (containerEntry) + if (containerEntry != NULL) { cJSON* objItem = cJSON_GetObjectItem(containerEntry, "Id"); if (objItem != NULL) From 4b630215824d85d568fd384b1bbee071996bec1a Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Thu, 27 Sep 2018 16:10:59 -0700 Subject: [PATCH 012/160] Adding a missed null check (#135) --- .../code/providers/Container_DaemonEvent_Class_Provider.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/source/code/providers/Container_DaemonEvent_Class_Provider.cpp b/source/code/providers/Container_DaemonEvent_Class_Provider.cpp index bf2ab3b53..51e253d73 100644 --- a/source/code/providers/Container_DaemonEvent_Class_Provider.cpp +++ b/source/code/providers/Container_DaemonEvent_Class_Provider.cpp @@ -289,7 +289,10 @@ class EventQuery else { // Image event - instance.ElementName_value(cJSON_GetObjectItem(entry, "id")->valuestring); + if (cJSON_GetObjectItem(entry, "id") != NULL) + { + instance.ElementName_value(cJSON_GetObjectItem(entry, "id")->valuestring); + } instance.Id_value(""); instance.ContainerName_value(""); } From 8b964fd7ee54948b7374ed44f3253d0d89ceb443 Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Thu, 27 Sep 2018 17:01:04 -0700 Subject: [PATCH 013/160] reusing some variables (#136) --- ...iner_ContainerInventory_Class_Provider.cpp | 26 ++++++++++++------- .../Container_DaemonEvent_Class_Provider.cpp | 5 ++-- ...ontainer_ImageInventory_Class_Provider.cpp | 16 +++++++----- 3 files changed, 30 insertions(+), 17 deletions(-) diff --git a/source/code/providers/Container_ContainerInventory_Class_Provider.cpp b/source/code/providers/Container_ContainerInventory_Class_Provider.cpp index 68c13053a..ded8fb869 100644 --- a/source/code/providers/Container_ContainerInventory_Class_Provider.cpp +++ 
b/source/code/providers/Container_ContainerInventory_Class_Provider.cpp @@ -210,7 +210,11 @@ class ContainerQuery correctedstring = stringToTruncate + "\"]"; } instance.EnvironmentVar_value(correctedstring.c_str()); - syslog(LOG_WARNING, "Environment variable truncated for container %s", cJSON_GetObjectItem(entry, "Id")->valuestring); + cJSON* idItem = cJSON_GetObjectItem(entry, "Id"); + if (idItem != NULL) + { + syslog(LOG_WARNING, "Environment variable truncated for container %s", idItem->valuestring); + } } else { instance.EnvironmentVar_value(strcmp(env, "null") ? env : ""); @@ -244,9 +248,10 @@ class ContainerQuery } else { - if (cJSON_GetObjectItem(entry, "Id") != NULL) + cJSON* idItem = cJSON_GetObjectItem(entry, "Id"); + if (idItem != NULL) { - syslog(LOG_WARNING, "Attempt in ObtainContainerConfig to get container %s config information returned null", cJSON_GetObjectItem(entry, "Id")->valuestring); + syslog(LOG_WARNING, "Attempt in ObtainContainerConfig to get container %s config information returned null", idItem->valuestring); } } } @@ -281,9 +286,10 @@ class ContainerQuery if (exitCode < 0) { exitCode = 128; - if (cJSON_GetObjectItem(entry, "Id") != NULL) + cJSON* idItem = cJSON_GetObjectItem(entry, "Id"); + if (idItem != NULL) { - syslog(LOG_NOTICE, "Container %s returned negative exit code", cJSON_GetObjectItem(entry, "Id")->valuestring); + syslog(LOG_NOTICE, "Container %s returned negative exit code", idItem->valuestring); } } @@ -334,9 +340,10 @@ class ContainerQuery } else { - if (cJSON_GetObjectItem(entry, "Id")) + cJSON* idItem = cJSON_GetObjectItem(entry, "Id"); + if (idItem) { - syslog(LOG_WARNING, "Attempt in ObtainContainerState to get container %s state information returned null", cJSON_GetObjectItem(entry, "Id")->valuestring); + syslog(LOG_WARNING, "Attempt in ObtainContainerState to get container %s state information returned null", idItem->valuestring); } } } @@ -381,9 +388,10 @@ class ContainerQuery } else { - if 
(cJSON_GetObjectItem(entry, "Id")) + cJSON* idItem = cJSON_GetObjectItem(entry, "Id"); + if (idItem != NULL) { - syslog(LOG_WARNING, "Attempt in ObtainContainerHostConfig to get container %s host config information returned null", cJSON_GetObjectItem(entry, "Id")->valuestring); + syslog(LOG_WARNING, "Attempt in ObtainContainerHostConfig to get container %s host config information returned null", idItem->valuestring); } } } diff --git a/source/code/providers/Container_DaemonEvent_Class_Provider.cpp b/source/code/providers/Container_DaemonEvent_Class_Provider.cpp index 51e253d73..0c28e4769 100644 --- a/source/code/providers/Container_DaemonEvent_Class_Provider.cpp +++ b/source/code/providers/Container_DaemonEvent_Class_Provider.cpp @@ -288,10 +288,11 @@ class EventQuery } else { + cJSON* idItem = cJSON_GetObjectItem(entry, "id"); // Image event - if (cJSON_GetObjectItem(entry, "id") != NULL) + if (idItem != NULL) { - instance.ElementName_value(cJSON_GetObjectItem(entry, "id")->valuestring); + instance.ElementName_value(idItem->valuestring); } instance.Id_value(""); instance.ContainerName_value(""); diff --git a/source/code/providers/Container_ImageInventory_Class_Provider.cpp b/source/code/providers/Container_ImageInventory_Class_Provider.cpp index 01d1c639c..f5742ef5f 100644 --- a/source/code/providers/Container_ImageInventory_Class_Provider.cpp +++ b/source/code/providers/Container_ImageInventory_Class_Provider.cpp @@ -173,10 +173,12 @@ class InventoryQuery { string id = string(objItem->valuestring); - if (cJSON_GetObjectItem(state, "Running") != NULL && cJSON_GetObjectItem(state, "Running")->valueint) + cJSON* runningItem = cJSON_GetObjectItem(state, "Running"); + if (runningItem != NULL && runningItem->valueint) { // Running container - if (cJSON_GetObjectItem(state, "Paused") != NULL && cJSON_GetObjectItem(state, "Paused")->valueint) + cJSON* pausedItem = cJSON_GetObjectItem(state, "Paused"); + if (pausedItem != NULL && pausedItem->valueint) { // Paused 
container instances[idTable[id]].Paused_value(instances[idTable[id]].Paused_value() + 1); @@ -188,7 +190,8 @@ class InventoryQuery } else { - if (cJSON_GetObjectItem(state, "ExitCode") != NULL && cJSON_GetObjectItem(state, "ExitCode")->valueint) + cJSON* exitCodeItem = cJSON_GetObjectItem(state, "ExitCode"); + if (exitCodeItem != NULL && exitCodeItem->valueint) { // Container exited nonzero instances[idTable[id]].Failed_value(instances[idTable[id]].Failed_value() + 1); @@ -206,9 +209,10 @@ class InventoryQuery } else { - if (cJSON_GetObjectItem(entry, "Id") != NULL) + cJSON* idItem = cJSON_GetObjectItem(entry, "Id"); + if (idItem != NULL) { - syslog(LOG_WARNING, "Attempt in ObtainContainerState to get container %s state information returned null", cJSON_GetObjectItem(entry, "Id")->valuestring); + syslog(LOG_WARNING, "Attempt in ObtainContainerState to get container %s state information returned null", idItem->valuestring); } } } @@ -263,7 +267,7 @@ class InventoryQuery } else { - syslog(LOG_WARNING, "API call in AggregateContainerStatus to inspect container %s returned null", cJSON_GetObjectItem(entry, "Id")->valuestring); + syslog(LOG_WARNING, "API call in AggregateContainerStatus to inspect container %s returned null", objItem->valuestring); } } } From 938c2edc0d84917c123c2947c791fa3806fce25c Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Fri, 28 Sep 2018 16:00:29 -0700 Subject: [PATCH 014/160] Rashmi/cjson delete null check (#138) * adding null check for cjson-delete * null chk * removing null check --- source/code/providers/Container_Process_Class_Provider.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/source/code/providers/Container_Process_Class_Provider.cpp b/source/code/providers/Container_Process_Class_Provider.cpp index 9adc4edcd..e27df1788 100644 --- a/source/code/providers/Container_Process_Class_Provider.cpp +++ b/source/code/providers/Container_Process_Class_Provider.cpp @@ -163,7 +163,10 @@ class 
ContainerProcessQuery } } } - cJSON_Delete(dockerPsResponse[0]); + if (!dockerPsResponse.empty() && dockerPsResponse[0]) + { + cJSON_Delete(dockerPsResponse[0]); + } } catch (std::exception &e) { From fbfdf11e98cebbbc623bd845bf3010b46dd3918b Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Tue, 2 Oct 2018 17:33:22 -0700 Subject: [PATCH 015/160] updating log level to debug for some provider workflows (#139) --- installer/conf/container.conf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/installer/conf/container.conf b/installer/conf/container.conf index 9eaed9b47..a41b963a9 100755 --- a/installer/conf/container.conf +++ b/installer/conf/container.conf @@ -111,7 +111,7 @@ type out_oms - log_level info + log_level debug buffer_chunk_limit 20m buffer_type file buffer_path %STATE_DIR_WS%/out_oms_containerinventory*.buffer @@ -124,7 +124,7 @@ type out_oms - log_level info + log_level debug buffer_chunk_limit 20m buffer_type file buffer_path %STATE_DIR_WS%/out_oms_imageinventory*.buffer @@ -137,7 +137,7 @@ type out_oms - log_level info + log_level debug buffer_chunk_limit 20m buffer_type file buffer_path %STATE_DIR_WS%/out_oms_servicelog*.buffer From d4260663ccaeae093911052ab47bb2f644f3e56c Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Thu, 4 Oct 2018 14:01:11 -0700 Subject: [PATCH 016/160] Fixing CPU Utilization and removing Fluent-bit filters (#140) Removing fluent-bit filters, CPU optimizations --- installer/conf/td-agent-bit.conf | 20 ++---------- source/code/go/src/plugins/oms.go | 47 ++++++++++++++++----------- source/code/go/src/plugins/out_oms.go | 2 +- 3 files changed, 32 insertions(+), 37 deletions(-) diff --git a/installer/conf/td-agent-bit.conf b/installer/conf/td-agent-bit.conf index 84a9fcf94..27916eafd 100644 --- a/installer/conf/td-agent-bit.conf +++ b/installer/conf/td-agent-bit.conf @@ -12,23 +12,9 @@ Parser docker Mem_Buf_Limit 30m Path_Key filepath - -[FILTER] - Name record_modifier - Match oms.container.log.* 
- Whitelist_key log - Whitelist_key stream - Whitelist_key time - Whitelist_key filepath - -[FILTER] - Name modify - Match oms.container.log.* - Rename log LogEntry - Rename stream LogEntrySource - Rename time LogEntryTimeStamp - Rename filepath Filepath - Add_if_not_present SourceSystem Containers + Buffer_Chunk_Size 1m + Buffer_Max_Size 1m + Skip_Long_Lines On [OUTPUT] Name oms diff --git a/source/code/go/src/plugins/oms.go b/source/code/go/src/plugins/oms.go index 2e9e2f3d0..c7fe8eb42 100644 --- a/source/code/go/src/plugins/oms.go +++ b/source/code/go/src/plugins/oms.go @@ -12,11 +12,11 @@ import ( "strings" "sync" "time" -) -import ( + "github.com/fluent/fluent-bit-go/output" - "github.com/mitchellh/mapstructure" + lumberjack "gopkg.in/natefinch/lumberjack.v2" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" @@ -81,7 +81,6 @@ type DataItem struct { Name string `json:"Name"` SourceSystem string `json:"SourceSystem"` Computer string `json:"Computer"` - Filepath string `json:"Filepath"` } // ContainerLogBlob represents the object corresponding to the payload that is sent to the ODS end point @@ -199,23 +198,18 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { for _, record := range tailPluginRecords { - filepath := toString(record["Filepath"]) - containerID := getContainerIDFromFilePath(filepath) + containerID := GetContainerIDFromFilePath(toString(record["filepath"])) if containerID == "" || containsKey(IgnoreIDSet, containerID) { continue } - var dataItem DataItem stringMap := make(map[string]string) - // convert map[interface{}]interface{} to map[string]string - for key, value := range record { - strKey := fmt.Sprintf("%v", key) - strValue := toString(value) - stringMap[strKey] = strValue - } - + stringMap["LogEntry"] = toString(record["log"]) + stringMap["LogEntrySource"] = toString(record["stream"]) + stringMap["LogEntryTimeStamp"] = toString(record["time"]) + 
stringMap["SourceSystem"] = "Containers" stringMap["Id"] = containerID if val, ok := ImageIDMap[containerID]; ok { @@ -238,8 +232,17 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { } } - stringMap["Computer"] = Computer - mapstructure.Decode(stringMap, &dataItem) + dataItem := DataItem{ + ID: stringMap["Id"], + LogEntry: stringMap["LogEntry"], + LogEntrySource: stringMap["LogEntrySource"], + LogEntryTimeStamp: stringMap["LogEntryTimeStamp"], + SourceSystem: stringMap["SourceSystem"], + Computer: Computer, + Image: stringMap["Image"], + Name: stringMap["Name"], + } + dataItems = append(dataItems, dataItem) } @@ -281,11 +284,17 @@ func containsKey(currentMap map[string]bool, key string) bool { } func toString(s interface{}) string { - value := s.([]uint8) - return string([]byte(value[:])) + switch t := s.(type) { + case []byte: + // prevent encoding to base64 + return string(t) + default: + return "" + } } -func getContainerIDFromFilePath(filepath string) string { +// GetContainerIDFromFilePath Gets the container ID From the file Path +func GetContainerIDFromFilePath(filepath string) string { start := strings.LastIndex(filepath, "-") end := strings.LastIndex(filepath, ".") if start >= end || start == -1 || end == -1 { diff --git a/source/code/go/src/plugins/out_oms.go b/source/code/go/src/plugins/out_oms.go index ec9a573d1..0efc1242d 100644 --- a/source/code/go/src/plugins/out_oms.go +++ b/source/code/go/src/plugins/out_oms.go @@ -10,7 +10,7 @@ import ( //export FLBPluginRegister func FLBPluginRegister(ctx unsafe.Pointer) int { - return output.FLBPluginRegister(ctx, "oms", "Stdout GO!") + return output.FLBPluginRegister(ctx, "oms", "OMS GO!") } //export FLBPluginInit From c2cabab7199870af23bb90de10bca4d8eb50e847 Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Tue, 9 Oct 2018 14:50:10 -0700 Subject: [PATCH 017/160] Minor tweaks 1. Remove some logging 2. Added more Error Handling 3. 
Continue when there is an error with k8s api (#141) * Removing some logs, added more error checking, continue on kube-api error * Return FLB OK for json Marshall error, instead of RETRY --- source/code/go/src/plugins/oms.go | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/source/code/go/src/plugins/oms.go b/source/code/go/src/plugins/oms.go index c7fe8eb42..d20f11d57 100644 --- a/source/code/go/src/plugins/oms.go +++ b/source/code/go/src/plugins/oms.go @@ -133,6 +133,7 @@ func updateContainerImageNameMaps() { pods, err := ClientSet.CoreV1().Pods("").List(metav1.ListOptions{}) if err != nil { Log("Error getting pods %s\nIt is ok to log here and continue, because the logs will be missing image and Name, but the logs will still have the containerID", err.Error()) + continue } for _, pod := range pods.Items { @@ -216,20 +217,12 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { stringMap["Image"] = val } else { Log("ContainerId %s not present in Map ", containerID) - Log("CurrentMap Snapshot \n") - for k, v := range ImageIDMap { - Log("%s ==> %s", k, v) - } } if val, ok := NameIDMap[containerID]; ok { stringMap["Name"] = val } else { Log("ContainerId %s not present in Map ", containerID) - Log("CurrentMap Snapshot \n") - for k, v := range NameIDMap { - Log("%s ==> %s", k, v) - } } dataItem := DataItem{ @@ -253,6 +246,10 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { DataItems: dataItems} marshalled, err := json.Marshal(logEntry) + if err != nil { + Log("Error while Marshalling log Entry: %s", err.Error()) + return output.FLB_OK + } req, _ := http.NewRequest("POST", OMSEndpoint, bytes.NewBuffer(marshalled)) req.Header.Set("Content-Type", "application/json") From 32567db6965f65154663c0204c1a3e2a599530d0 Mon Sep 17 00:00:00 2001 From: Vishwanath Narasimhan Date: Wed, 10 Oct 2018 14:09:04 -0700 Subject: [PATCH 018/160] * Change FluentBit flush interval to 30 secs (from 5 secs) * Remove 
ContainerPerf, ContainerServiceLog,ContainerProcess (OMI workflows) for Daemonset --- installer/conf/container.conf | 33 -------------------------------- installer/conf/td-agent-bit.conf | 2 +- 2 files changed, 1 insertion(+), 34 deletions(-) diff --git a/installer/conf/container.conf b/installer/conf/container.conf index a41b963a9..1916300cb 100755 --- a/installer/conf/container.conf +++ b/installer/conf/container.conf @@ -7,19 +7,6 @@ bind 127.0.0.1 -# Filter container logs - - type filter_docker_log - log_path "/var/opt/microsoft/omsagent/log/filter_docker_log.txt" - - -# Container perf - - type oms_omi - object_name "Container" - interval 30s - - # Container inventory type omi @@ -40,16 +27,6 @@ ] -# Container service log - - type omi - run_interval 60s - tag oms.container.servicelog - items [ - ["root/cimv2","Container_DaemonEvent"] - ] - - # Container host inventory type omi @@ -60,16 +37,6 @@ ] -# Container processes - - type omi - run_interval 60s - tag oms.api.ContainerProcess - items [ - ["root/cimv2","Container_Process"] - ] - - #cadvisor perf type cadvisorperf diff --git a/installer/conf/td-agent-bit.conf b/installer/conf/td-agent-bit.conf index 27916eafd..b5d2309e1 100644 --- a/installer/conf/td-agent-bit.conf +++ b/installer/conf/td-agent-bit.conf @@ -1,5 +1,5 @@ [SERVICE] - Flush 5 + Flush 30 Log_Level info Parsers_File /etc/td-agent-bit/parsers.conf Log_File /var/opt/microsoft/docker-cimprov/log/fluent-bit.log From afc981d504c3f44fd3232892e4823d5d09503d14 Mon Sep 17 00:00:00 2001 From: r-dilip Date: Thu, 11 Oct 2018 21:37:09 -0700 Subject: [PATCH 019/160] Container Log Telemetry --- .gitignore | 3 + installer/conf/td-agent-bit.conf | 7 +- source/code/go/src/plugins/glide.lock | 10 +- source/code/go/src/plugins/glide.yaml | 8 +- source/code/go/src/plugins/oms.go | 9 +- source/code/go/src/plugins/out_oms.go | 10 ++ source/code/go/src/plugins/telemetry.go | 151 ++++++++++++++++++++++++ 7 files changed, 188 insertions(+), 10 deletions(-) create mode 
100644 source/code/go/src/plugins/telemetry.go diff --git a/.gitignore b/.gitignore index 92c8c0cf2..e58d69f7b 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,6 @@ /test/code/providers/TestScriptPath.h /test/code/providers/providertestutils.cpp +source/code/go/src/plugins/profiling +.vscode/launch.json +source/code/go/src/plugins/vendor/ \ No newline at end of file diff --git a/installer/conf/td-agent-bit.conf b/installer/conf/td-agent-bit.conf index b5d2309e1..5a1c105bf 100644 --- a/installer/conf/td-agent-bit.conf +++ b/installer/conf/td-agent-bit.conf @@ -17,5 +17,8 @@ Skip_Long_Lines On [OUTPUT] - Name oms - Match oms.container.log.* \ No newline at end of file + Name oms + EnableTelemetry true + TelemetryPushInterval 300 + Match oms.container.log.* + AgentVersion internaltest1004-2 \ No newline at end of file diff --git a/source/code/go/src/plugins/glide.lock b/source/code/go/src/plugins/glide.lock index 4597b594a..fc147fe74 100644 --- a/source/code/go/src/plugins/glide.lock +++ b/source/code/go/src/plugins/glide.lock @@ -1,5 +1,5 @@ -hash: bb32415f402ab29751f29b8e394bc974cbc31861453d817aaeb94ef83dacc488 -updated: 2018-09-14T18:14:28.748047598Z +hash: a6a873d09ed9c3d890a70122e61efba992ead9850fe48f6fcb020d86800d4ade +updated: 2018-10-10T13:37:51.9703908-07:00 imports: - name: github.com/fluent/fluent-bit-go version: c4a158a6e3a793166c6ecfa2d5c80d71eada8959 @@ -38,8 +38,10 @@ imports: - diskcache - name: github.com/json-iterator/go version: f2b4162afba35581b6d4a50d3b8f34e33c144682 -- name: github.com/mitchellh/mapstructure - version: fa473d140ef3c6adf42d6b391fe76707f1f243c8 +- name: github.com/Microsoft/ApplicationInsights-Go + version: d2df5d440eda5372f24fcac03839a64d6cb5f7e5 + subpackages: + - appinsights - name: github.com/modern-go/concurrent version: bacd9c7ef1dd9b15be4a9909b8ac7a4e313eec94 - name: github.com/modern-go/reflect2 diff --git a/source/code/go/src/plugins/glide.yaml b/source/code/go/src/plugins/glide.yaml index 403e1efc4..b2829391b 100644 
--- a/source/code/go/src/plugins/glide.yaml +++ b/source/code/go/src/plugins/glide.yaml @@ -1,10 +1,8 @@ -package: plugins +package: . import: - package: github.com/fluent/fluent-bit-go subpackages: - output -- package: github.com/mitchellh/mapstructure - version: ^1.0.0 - package: gopkg.in/natefinch/lumberjack.v2 version: ^2.1.0 - package: k8s.io/apimachinery @@ -15,3 +13,7 @@ import: subpackages: - kubernetes - rest +- package: github.com/Microsoft/ApplicationInsights-Go + version: ^0.4.2 + subpackages: + - appinsights diff --git a/source/code/go/src/plugins/oms.go b/source/code/go/src/plugins/oms.go index d20f11d57..807e00937 100644 --- a/source/code/go/src/plugins/oms.go +++ b/source/code/go/src/plugins/oms.go @@ -42,6 +42,8 @@ var ( OMSEndpoint string // Computer (Hostname) when ingesting into ContainerLog table Computer string + // WorkspaceID log analytics workspace id + WorkspaceID string ) var ( @@ -170,6 +172,7 @@ func updateKubeSystemContainerIDs() { pods, err := ClientSet.CoreV1().Pods("kube-system").List(metav1.ListOptions{}) if err != nil { Log("Error getting pods %s\nIt is ok to log here and continue. 
Kube-system logs will be collected", err.Error()) + continue } _ignoreIDSet := make(map[string]bool) @@ -269,7 +272,10 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { return output.FLB_RETRY } - Log("Successfully flushed %d records in %s", len(dataItems), elapsed) + numRecords := len(dataItems) + Log("Successfully flushed %d records in %s", numRecords, elapsed) + FlushedRecordsCount += float64(numRecords) + FlushedRecordsTimeTaken += float64(elapsed / time.Millisecond) } return output.FLB_OK @@ -322,6 +328,7 @@ func InitializePlugin(pluginConfPath string) { log.Fatalf("Error Reading omsadmin configuration %s\n", err.Error()) } OMSEndpoint = omsadminConf["OMS_ENDPOINT"] + WorkspaceID = omsadminConf["WORKSPACE_ID"] Log("OMSEndpoint %s", OMSEndpoint) // Initialize image,name map refresh ticker diff --git a/source/code/go/src/plugins/out_oms.go b/source/code/go/src/plugins/out_oms.go index 0efc1242d..37c9eb12b 100644 --- a/source/code/go/src/plugins/out_oms.go +++ b/source/code/go/src/plugins/out_oms.go @@ -5,6 +5,7 @@ import ( ) import ( "C" + "strings" "unsafe" ) @@ -19,6 +20,14 @@ func FLBPluginRegister(ctx unsafe.Pointer) int { func FLBPluginInit(ctx unsafe.Pointer) int { Log("Initializing out_oms go plugin for fluentbit") InitializePlugin(ContainerLogPluginConfFilePath) + enablePlugin := output.FLBPluginConfigKey(ctx, "EnableTelemetry") + telemetryPushInterval := output.FLBPluginConfigKey(ctx, "TelemetryPushInterval") + agentVersion := output.FLBPluginConfigKey(ctx, "AgentVersion") + + if strings.Compare(strings.ToLower(enablePlugin), "true") == 0 { + go SendContainerLogFlushRateMetric(telemetryPushInterval, agentVersion) + SendEvent(EventNameContainerLogInit, make(map[string]string)) + } return output.FLB_OK } @@ -48,6 +57,7 @@ func FLBPluginFlush(data unsafe.Pointer, length C.int, tag *C.char) int { // FLBPluginExit exits the plugin func FLBPluginExit() int { + defer TelemetryShutdown() KubeSystemContainersRefreshTicker.Stop() 
ContainerImageNameRefreshTicker.Stop() return output.FLB_OK diff --git a/source/code/go/src/plugins/telemetry.go b/source/code/go/src/plugins/telemetry.go new file mode 100644 index 000000000..4d4ab2371 --- /dev/null +++ b/source/code/go/src/plugins/telemetry.go @@ -0,0 +1,151 @@ +package main + +import ( + "encoding/base64" + "errors" + "os" + "strconv" + "strings" + "time" + + "github.com/Microsoft/ApplicationInsights-Go/appinsights" +) + +var ( + // FlushedRecordsCount indicates the number of flushed records in the current period + FlushedRecordsCount float64 + // FlushedRecordsTimeTaken indicates the cumulative time taken to flush the records for the current period + FlushedRecordsTimeTaken float64 + // CommonProperties indicates the dimensions that are sent with every event/metric + CommonProperties map[string]string + // TelemetryClient is the client used to send the telemetry + TelemetryClient appinsights.TelemetryClient + // ContainerLogTelemetryTicker sends telemetry periodically + ContainerLogTelemetryTicker *time.Ticker +) + +const ( + clusterTypeACS = "ACS" + clusterTypeAKS = "AKS" + controllerTypeDaemonSet = "DaemonSet" + controllerTypeReplicaSet = "ReplicaSet" + envAKSResourceID = "AKS_RESOURCE_ID" + envACSResourceName = "ACS_RESOURCE_NAME" + envAppInsightsInstrumentationKey = "APPLICATIONINSIGHTS_INSTRUMENTATIONKEY" + metricNameAvgFlushRate = "ContainerLogAvgRecordsFlushedPerSec" + defaultTelemetryPushInterval = 300 + + // EventNameContainerLogInit name of the event + EventNameContainerLogInit = "ContainerLogPluginInitialized" +) + +// Initialize initializes the telemetry artifacts +func initialize(telemetryIntervalProperty string, agentVersion string) (int, error) { + + telemetryInterval, err := strconv.Atoi(telemetryIntervalProperty) + if err != nil { + telemetryInterval = defaultTelemetryPushInterval + } + + ContainerLogTelemetryTicker = time.NewTicker(time.Second * time.Duration(telemetryInterval)) + + encodedIkey := 
os.Getenv(envAppInsightsInstrumentationKey) + if encodedIkey == "" { + Log("App Insights IKey missing in Environment Variables \n") + return -1, errors.New("Missing App Insights Instrumentation Key Environment Variable") + } + + decIkey, err := base64.StdEncoding.DecodeString(encodedIkey) + if err != nil { + Log("Error Decoding encoded Instrumentation key %s", err.Error()) + return -1, err + } + + TelemetryClient = appinsights.NewTelemetryClient(string(decIkey)) + + CommonProperties = make(map[string]string) + CommonProperties["Computer"] = Computer + CommonProperties["WorkspaceID"] = WorkspaceID + CommonProperties["ControllerType"] = controllerTypeDaemonSet + CommonProperties["AgentVersion"] = agentVersion + + aksResourceID := os.Getenv(envAKSResourceID) + // if the aks resource id is not defined, it is most likely an ACS Cluster + if aksResourceID == "" { + CommonProperties["ACSResourceName"] = os.Getenv(envACSResourceName) + CommonProperties["ClusterType"] = clusterTypeACS + + CommonProperties["SubscriptionID"] = "" + CommonProperties["ResourceGroupName"] = "" + CommonProperties["ClusterName"] = "" + CommonProperties["Region"] = "" + + } else { + CommonProperties["ACSResourceName"] = "" + splitStrings := strings.Split(aksResourceID, "/") + CommonProperties["SubscriptionID"] = splitStrings[2] + CommonProperties["ResourceGroupName"] = splitStrings[4] + CommonProperties["ClusterName"] = splitStrings[8] + CommonProperties["ClusterType"] = clusterTypeAKS + + region := os.Getenv("AKS_REGION") + if region != "" { + CommonProperties["Region"] = region + } + } + + TelemetryClient.Context().CommonProperties = CommonProperties + return 0, nil +} + +// SendContainerLogFlushRateMetric is a go-routine that flushes the data periodically (every 5 mins to App Insights) +func SendContainerLogFlushRateMetric(telemetryIntervalProperty string, agentVersion string) { + + ret, err := initialize(telemetryIntervalProperty, agentVersion) + if ret != 0 || err != nil { + Log("Error During 
Telemetry Initialization :%s", err.Error()) + return + } + + for ; true; <-ContainerLogTelemetryTicker.C { + flushRate := FlushedRecordsCount / FlushedRecordsTimeTaken * 1000 + metric := appinsights.NewMetricTelemetry(metricNameAvgFlushRate, flushRate) + Log("Flushed Records : %f Time Taken : %f flush Rate : %f", FlushedRecordsCount, FlushedRecordsTimeTaken, flushRate) + TelemetryClient.Track(metric) + FlushedRecordsCount = 0.0 + FlushedRecordsTimeTaken = 0.0 + } +} + +// TelemetryShutdown stops the ticker that sends data to App Insights periodically +func TelemetryShutdown() { + Log("Shutting down ContainerLog Telemetry\n") + ContainerLogTelemetryTicker.Stop() +} + +// SendEvent sends an event to App Insights +func SendEvent(eventName string, dimensions map[string]string) { + // this is because the TelemetryClient is initialized in a different goroutine. A simple wait loop here is just waiting for it to be initialized. This will happen only for the init event. Any subsequent Event should work just fine + for TelemetryClient == nil { + Log("Waiting for Telemetry Client to be initialized") + time.Sleep(1 * time.Second) + } + + // take a copy so the CommonProperties can be restored later + _commonProps := make(map[string]string) + for k, v := range TelemetryClient.Context().CommonProperties { + _commonProps[k] = v + } + + // add any extra dimensions + for k, v := range dimensions { + TelemetryClient.Context().CommonProperties[k] = v + } + + Log("Sending Event : %s\n", eventName) + event := appinsights.NewEventTelemetry(eventName) + TelemetryClient.Track(event) + + // restore original CommonProperties + TelemetryClient.Context().CommonProperties = _commonProps +} From 4b958dde94450e96d6d46351756c83500df7935f Mon Sep 17 00:00:00 2001 From: r-dilip Date: Fri, 12 Oct 2018 09:18:10 -0700 Subject: [PATCH 020/160] Fixing an issue with Send Init Event if Telemetry is not initialized properly, tab to whitespace in conf file --- installer/conf/td-agent-bit.conf | 2 +- 
source/code/go/src/plugins/out_oms.go | 7 ++-- source/code/go/src/plugins/telemetry.go | 44 ++++++++++++++----------- 3 files changed, 29 insertions(+), 24 deletions(-) diff --git a/installer/conf/td-agent-bit.conf b/installer/conf/td-agent-bit.conf index 5a1c105bf..6849a3744 100644 --- a/installer/conf/td-agent-bit.conf +++ b/installer/conf/td-agent-bit.conf @@ -19,6 +19,6 @@ [OUTPUT] Name oms EnableTelemetry true - TelemetryPushInterval 300 + TelemetryPushInterval 300 Match oms.container.log.* AgentVersion internaltest1004-2 \ No newline at end of file diff --git a/source/code/go/src/plugins/out_oms.go b/source/code/go/src/plugins/out_oms.go index 37c9eb12b..2603368ab 100644 --- a/source/code/go/src/plugins/out_oms.go +++ b/source/code/go/src/plugins/out_oms.go @@ -20,13 +20,14 @@ func FLBPluginRegister(ctx unsafe.Pointer) int { func FLBPluginInit(ctx unsafe.Pointer) int { Log("Initializing out_oms go plugin for fluentbit") InitializePlugin(ContainerLogPluginConfFilePath) - enablePlugin := output.FLBPluginConfigKey(ctx, "EnableTelemetry") + enableTelemetry := output.FLBPluginConfigKey(ctx, "EnableTelemetry") telemetryPushInterval := output.FLBPluginConfigKey(ctx, "TelemetryPushInterval") agentVersion := output.FLBPluginConfigKey(ctx, "AgentVersion") - if strings.Compare(strings.ToLower(enablePlugin), "true") == 0 { + if strings.Compare(strings.ToLower(enableTelemetry), "true") == 0 { go SendContainerLogFlushRateMetric(telemetryPushInterval, agentVersion) - SendEvent(EventNameContainerLogInit, make(map[string]string)) + } else { + Log("Telemetry is not enabled for the plugin %s \n", output.FLBPluginConfigKey(ctx, "Name")) } return output.FLB_OK } diff --git a/source/code/go/src/plugins/telemetry.go b/source/code/go/src/plugins/telemetry.go index 4d4ab2371..c2f565a45 100644 --- a/source/code/go/src/plugins/telemetry.go +++ b/source/code/go/src/plugins/telemetry.go @@ -4,6 +4,7 @@ import ( "encoding/base64" "errors" "os" + "runtime" "strconv" "strings" "time" @@ 
-25,39 +26,40 @@ var ( ) const ( - clusterTypeACS = "ACS" - clusterTypeAKS = "AKS" - controllerTypeDaemonSet = "DaemonSet" - controllerTypeReplicaSet = "ReplicaSet" - envAKSResourceID = "AKS_RESOURCE_ID" - envACSResourceName = "ACS_RESOURCE_NAME" - envAppInsightsInstrumentationKey = "APPLICATIONINSIGHTS_INSTRUMENTATIONKEY" - metricNameAvgFlushRate = "ContainerLogAvgRecordsFlushedPerSec" - defaultTelemetryPushInterval = 300 + clusterTypeACS = "ACS" + clusterTypeAKS = "AKS" + controllerTypeDaemonSet = "DaemonSet" + controllerTypeReplicaSet = "ReplicaSet" + envAKSResourceID = "AKS_RESOURCE_ID" + envACSResourceName = "ACS_RESOURCE_NAME" + envAppInsightsAuth = "APPLICATIONINSIGHTS_AUTH" + metricNameAvgFlushRate = "ContainerLogAvgRecordsFlushedPerSec" + defaultTelemetryPushInterval = 300 // EventNameContainerLogInit name of the event EventNameContainerLogInit = "ContainerLogPluginInitialized" ) // Initialize initializes the telemetry artifacts -func initialize(telemetryIntervalProperty string, agentVersion string) (int, error) { +func initialize(telemetryPushIntervalProperty string, agentVersion string) (int, error) { - telemetryInterval, err := strconv.Atoi(telemetryIntervalProperty) + telemetryPushInterval, err := strconv.Atoi(telemetryPushIntervalProperty) if err != nil { - telemetryInterval = defaultTelemetryPushInterval + Log("Error Converting telemetryPushIntervalProperty %s. Using Default Interval... 
%d \n", telemetryPushIntervalProperty, defaultTelemetryPushInterval) + telemetryPushInterval = defaultTelemetryPushInterval } - ContainerLogTelemetryTicker = time.NewTicker(time.Second * time.Duration(telemetryInterval)) + ContainerLogTelemetryTicker = time.NewTicker(time.Second * time.Duration(telemetryPushInterval)) - encodedIkey := os.Getenv(envAppInsightsInstrumentationKey) + encodedIkey := os.Getenv(envAppInsightsAuth) if encodedIkey == "" { - Log("App Insights IKey missing in Environment Variables \n") - return -1, errors.New("Missing App Insights Instrumentation Key Environment Variable") + Log("Environment Variable Missing \n") + return -1, errors.New("Missing Environment Variable") } decIkey, err := base64.StdEncoding.DecodeString(encodedIkey) if err != nil { - Log("Error Decoding encoded Instrumentation key %s", err.Error()) + Log("Decoding Error %s", err.Error()) return -1, err } @@ -99,14 +101,16 @@ func initialize(telemetryIntervalProperty string, agentVersion string) (int, err } // SendContainerLogFlushRateMetric is a go-routine that flushes the data periodically (every 5 mins to App Insights) -func SendContainerLogFlushRateMetric(telemetryIntervalProperty string, agentVersion string) { +func SendContainerLogFlushRateMetric(telemetryPushIntervalProperty string, agentVersion string) { - ret, err := initialize(telemetryIntervalProperty, agentVersion) + ret, err := initialize(telemetryPushIntervalProperty, agentVersion) if ret != 0 || err != nil { Log("Error During Telemetry Initialization :%s", err.Error()) - return + runtime.Goexit() } + SendEvent(EventNameContainerLogInit, make(map[string]string)) + for ; true; <-ContainerLogTelemetryTicker.C { flushRate := FlushedRecordsCount / FlushedRecordsTimeTaken * 1000 metric := appinsights.NewMetricTelemetry(metricNameAvgFlushRate, flushRate) From 510ef9f95b8e5de04e7b5952e24458374d6cbf6b Mon Sep 17 00:00:00 2001 From: r-dilip Date: Fri, 12 Oct 2018 10:45:14 -0700 Subject: [PATCH 021/160] PR feedback --- 
installer/conf/td-agent-bit.conf | 10 ++++----- source/code/go/src/plugins/out_oms.go | 8 +++---- source/code/go/src/plugins/telemetry.go | 30 +++++++++++-------------- 3 files changed, 22 insertions(+), 26 deletions(-) diff --git a/installer/conf/td-agent-bit.conf b/installer/conf/td-agent-bit.conf index 6849a3744..b01b3a352 100644 --- a/installer/conf/td-agent-bit.conf +++ b/installer/conf/td-agent-bit.conf @@ -17,8 +17,8 @@ Skip_Long_Lines On [OUTPUT] - Name oms - EnableTelemetry true - TelemetryPushInterval 300 - Match oms.container.log.* - AgentVersion internaltest1004-2 \ No newline at end of file + Name oms + EnableTelemetry true + TelemetryPushIntervalSeconds 300 + Match oms.container.log.* + AgentVersion internaltest1004-2 \ No newline at end of file diff --git a/source/code/go/src/plugins/out_oms.go b/source/code/go/src/plugins/out_oms.go index 2603368ab..732ae5216 100644 --- a/source/code/go/src/plugins/out_oms.go +++ b/source/code/go/src/plugins/out_oms.go @@ -21,13 +21,13 @@ func FLBPluginInit(ctx unsafe.Pointer) int { Log("Initializing out_oms go plugin for fluentbit") InitializePlugin(ContainerLogPluginConfFilePath) enableTelemetry := output.FLBPluginConfigKey(ctx, "EnableTelemetry") - telemetryPushInterval := output.FLBPluginConfigKey(ctx, "TelemetryPushInterval") - agentVersion := output.FLBPluginConfigKey(ctx, "AgentVersion") - if strings.Compare(strings.ToLower(enableTelemetry), "true") == 0 { + telemetryPushInterval := output.FLBPluginConfigKey(ctx, "TelemetryPushIntervalSeconds") + agentVersion := output.FLBPluginConfigKey(ctx, "AgentVersion") go SendContainerLogFlushRateMetric(telemetryPushInterval, agentVersion) } else { Log("Telemetry is not enabled for the plugin %s \n", output.FLBPluginConfigKey(ctx, "Name")) + return output.FLB_OK } return output.FLB_OK } @@ -58,7 +58,7 @@ func FLBPluginFlush(data unsafe.Pointer, length C.int, tag *C.char) int { // FLBPluginExit exits the plugin func FLBPluginExit() int { - defer TelemetryShutdown() + 
ContainerLogTelemetryTicker.Stop() KubeSystemContainersRefreshTicker.Stop() ContainerImageNameRefreshTicker.Stop() return output.FLB_OK diff --git a/source/code/go/src/plugins/telemetry.go b/source/code/go/src/plugins/telemetry.go index c2f565a45..4396ea655 100644 --- a/source/code/go/src/plugins/telemetry.go +++ b/source/code/go/src/plugins/telemetry.go @@ -26,15 +26,15 @@ var ( ) const ( - clusterTypeACS = "ACS" - clusterTypeAKS = "AKS" - controllerTypeDaemonSet = "DaemonSet" - controllerTypeReplicaSet = "ReplicaSet" - envAKSResourceID = "AKS_RESOURCE_ID" - envACSResourceName = "ACS_RESOURCE_NAME" - envAppInsightsAuth = "APPLICATIONINSIGHTS_AUTH" - metricNameAvgFlushRate = "ContainerLogAvgRecordsFlushedPerSec" - defaultTelemetryPushInterval = 300 + clusterTypeACS = "ACS" + clusterTypeAKS = "AKS" + controllerTypeDaemonSet = "DaemonSet" + controllerTypeReplicaSet = "ReplicaSet" + envAKSResourceID = "AKS_RESOURCE_ID" + envACSResourceName = "ACS_RESOURCE_NAME" + envAppInsightsAuth = "APPLICATIONINSIGHTS_AUTH" + metricNameAvgFlushRate = "ContainerLogAvgRecordsFlushedPerSec" + defaultTelemetryPushIntervalSeconds = 300 // EventNameContainerLogInit name of the event EventNameContainerLogInit = "ContainerLogPluginInitialized" @@ -45,8 +45,8 @@ func initialize(telemetryPushIntervalProperty string, agentVersion string) (int, telemetryPushInterval, err := strconv.Atoi(telemetryPushIntervalProperty) if err != nil { - Log("Error Converting telemetryPushIntervalProperty %s. Using Default Interval... %d \n", telemetryPushIntervalProperty, defaultTelemetryPushInterval) - telemetryPushInterval = defaultTelemetryPushInterval + Log("Error Converting telemetryPushIntervalProperty %s. Using Default Interval... 
%d \n", telemetryPushIntervalProperty, defaultTelemetryPushIntervalSeconds) + telemetryPushInterval = defaultTelemetryPushIntervalSeconds } ContainerLogTelemetryTicker = time.NewTicker(time.Second * time.Duration(telemetryPushInterval)) @@ -116,17 +116,13 @@ func SendContainerLogFlushRateMetric(telemetryPushIntervalProperty string, agent metric := appinsights.NewMetricTelemetry(metricNameAvgFlushRate, flushRate) Log("Flushed Records : %f Time Taken : %f flush Rate : %f", FlushedRecordsCount, FlushedRecordsTimeTaken, flushRate) TelemetryClient.Track(metric) + DataUpdateMutex.Lock() FlushedRecordsCount = 0.0 FlushedRecordsTimeTaken = 0.0 + DataUpdateMutex.Unlock() } } -// TelemetryShutdown stops the ticker that sends data to App Insights periodically -func TelemetryShutdown() { - Log("Shutting down ContainerLog Telemetry\n") - ContainerLogTelemetryTicker.Stop() -} - // SendEvent sends an event to App Insights func SendEvent(eventName string, dimensions map[string]string) { // this is because the TelemetryClient is initialized in a different goroutine. A simple wait loop here is just waiting for it to be initialized. This will happen only for the init event. 
Any subsequent Event should work just fine From 684c39b63581fab69595885ec2c98942098be4f6 Mon Sep 17 00:00:00 2001 From: r-dilip Date: Fri, 12 Oct 2018 15:44:25 -0700 Subject: [PATCH 022/160] PR feedback --- source/code/go/src/plugins/telemetry.go | 42 +++++++++---------------- 1 file changed, 15 insertions(+), 27 deletions(-) diff --git a/source/code/go/src/plugins/telemetry.go b/source/code/go/src/plugins/telemetry.go index 4396ea655..621d88eec 100644 --- a/source/code/go/src/plugins/telemetry.go +++ b/source/code/go/src/plugins/telemetry.go @@ -81,19 +81,21 @@ func initialize(telemetryPushIntervalProperty string, agentVersion string) (int, CommonProperties["ResourceGroupName"] = "" CommonProperties["ClusterName"] = "" CommonProperties["Region"] = "" + CommonProperties["AKS_RESOURCE_ID"] = "" } else { CommonProperties["ACSResourceName"] = "" + CommonProperties["AKS_RESOURCE_ID"] = aksResourceID splitStrings := strings.Split(aksResourceID, "/") - CommonProperties["SubscriptionID"] = splitStrings[2] - CommonProperties["ResourceGroupName"] = splitStrings[4] - CommonProperties["ClusterName"] = splitStrings[8] + if len(aksResourceID) > 0 && len(aksResourceID) < 10 { + CommonProperties["SubscriptionID"] = splitStrings[2] + CommonProperties["ResourceGroupName"] = splitStrings[4] + CommonProperties["ClusterName"] = splitStrings[8] + } CommonProperties["ClusterType"] = clusterTypeAKS region := os.Getenv("AKS_REGION") - if region != "" { - CommonProperties["Region"] = region - } + CommonProperties["Region"] = region } TelemetryClient.Context().CommonProperties = CommonProperties @@ -112,40 +114,26 @@ func SendContainerLogFlushRateMetric(telemetryPushIntervalProperty string, agent SendEvent(EventNameContainerLogInit, make(map[string]string)) for ; true; <-ContainerLogTelemetryTicker.C { + DataUpdateMutex.Lock() flushRate := FlushedRecordsCount / FlushedRecordsTimeTaken * 1000 - metric := appinsights.NewMetricTelemetry(metricNameAvgFlushRate, flushRate) Log("Flushed Records : 
%f Time Taken : %f flush Rate : %f", FlushedRecordsCount, FlushedRecordsTimeTaken, flushRate) - TelemetryClient.Track(metric) - DataUpdateMutex.Lock() FlushedRecordsCount = 0.0 FlushedRecordsTimeTaken = 0.0 DataUpdateMutex.Unlock() + metric := appinsights.NewMetricTelemetry(metricNameAvgFlushRate, flushRate) + TelemetryClient.Track(metric) } } // SendEvent sends an event to App Insights func SendEvent(eventName string, dimensions map[string]string) { - // this is because the TelemetryClient is initialized in a different goroutine. A simple wait loop here is just waiting for it to be initialized. This will happen only for the init event. Any subsequent Event should work just fine - for TelemetryClient == nil { - Log("Waiting for Telemetry Client to be initialized") - time.Sleep(1 * time.Second) - } - - // take a copy so the CommonProperties can be restored later - _commonProps := make(map[string]string) - for k, v := range TelemetryClient.Context().CommonProperties { - _commonProps[k] = v - } + Log("Sending Event : %s\n", eventName) + event := appinsights.NewEventTelemetry(eventName) - // add any extra dimensions + // add any extra Properties for k, v := range dimensions { - TelemetryClient.Context().CommonProperties[k] = v + event.Properties[k] = v } - Log("Sending Event : %s\n", eventName) - event := appinsights.NewEventTelemetry(eventName) TelemetryClient.Track(event) - - // restore original CommonProperties - TelemetryClient.Context().CommonProperties = _commonProps } From e165275bb8c346051cf851fb36dbb91ad7cf8afc Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Mon, 15 Oct 2018 15:14:41 -0700 Subject: [PATCH 023/160] Sending an event every 5 mins(Heartbeat) (#146) --- installer/conf/td-agent-bit.conf | 2 -- source/code/go/src/plugins/telemetry.go | 7 ++++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/installer/conf/td-agent-bit.conf b/installer/conf/td-agent-bit.conf index b01b3a352..2553f405f 100644 --- 
a/installer/conf/td-agent-bit.conf +++ b/installer/conf/td-agent-bit.conf @@ -12,8 +12,6 @@ Parser docker Mem_Buf_Limit 30m Path_Key filepath - Buffer_Chunk_Size 1m - Buffer_Max_Size 1m Skip_Long_Lines On [OUTPUT] diff --git a/source/code/go/src/plugins/telemetry.go b/source/code/go/src/plugins/telemetry.go index 621d88eec..b1bc4439b 100644 --- a/source/code/go/src/plugins/telemetry.go +++ b/source/code/go/src/plugins/telemetry.go @@ -36,8 +36,8 @@ const ( metricNameAvgFlushRate = "ContainerLogAvgRecordsFlushedPerSec" defaultTelemetryPushIntervalSeconds = 300 - // EventNameContainerLogInit name of the event - EventNameContainerLogInit = "ContainerLogPluginInitialized" + eventNameContainerLogInit = "ContainerLogPluginInitialized" + eventNameDaemonSetHeartbeat = "ContainerLogDaemonSetHeartbeatEvent" ) // Initialize initializes the telemetry artifacts @@ -111,9 +111,10 @@ func SendContainerLogFlushRateMetric(telemetryPushIntervalProperty string, agent runtime.Goexit() } - SendEvent(EventNameContainerLogInit, make(map[string]string)) + SendEvent(eventNameContainerLogInit, make(map[string]string)) for ; true; <-ContainerLogTelemetryTicker.C { + SendEvent(eventNameDaemonSetHeartbeat, make(map[string]string)) DataUpdateMutex.Lock() flushRate := FlushedRecordsCount / FlushedRecordsTimeTaken * 1000 Log("Flushed Records : %f Time Taken : %f flush Rate : %f", FlushedRecordsCount, FlushedRecordsTimeTaken, flushRate) From cfe1ca94c259c533a938834a54f1279e703d7e4b Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Tue, 16 Oct 2018 13:03:30 -0700 Subject: [PATCH 024/160] PR feedback to cleanup removed workflows --- installer/conf/container.conf | 28 +--------------------------- 1 file changed, 1 insertion(+), 27 deletions(-) diff --git a/installer/conf/container.conf b/installer/conf/container.conf index 1916300cb..17317871c 100755 --- a/installer/conf/container.conf +++ b/installer/conf/container.conf @@ -46,7 +46,7 @@ # Filter for correct format to endpoint - + type 
filter_container @@ -63,19 +63,6 @@ max_retry_wait 9m - - type out_oms_api - log_level debug - buffer_chunk_limit 20m - buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_containerprocess*.buffer - buffer_queue_limit 20 - flush_interval 20s - retry_limit 10 - retry_wait 15s - max_retry_wait 9m - - type out_oms log_level debug @@ -102,19 +89,6 @@ max_retry_wait 9m - - type out_oms - log_level debug - buffer_chunk_limit 20m - buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_servicelog*.buffer - buffer_queue_limit 20 - flush_interval 20s - retry_limit 10 - retry_wait 15s - max_retry_wait 9m - - type out_oms log_level debug From 892b51c6b166cf10424bf5b6768633f44aa4cfa7 Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Tue, 16 Oct 2018 13:04:55 -0700 Subject: [PATCH 025/160] updating agent version for telemetry --- installer/conf/td-agent-bit.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/installer/conf/td-agent-bit.conf b/installer/conf/td-agent-bit.conf index 2553f405f..667f2edc2 100644 --- a/installer/conf/td-agent-bit.conf +++ b/installer/conf/td-agent-bit.conf @@ -19,4 +19,4 @@ EnableTelemetry true TelemetryPushIntervalSeconds 300 Match oms.container.log.* - AgentVersion internaltest1004-2 \ No newline at end of file + AgentVersion ciprod10162018 From 9c83160dfa92a4f9ae1ab2b010678148aab4fc4d Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Tue, 16 Oct 2018 19:33:43 -0700 Subject: [PATCH 026/160] updating agent version --- installer/conf/td-agent-bit.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/installer/conf/td-agent-bit.conf b/installer/conf/td-agent-bit.conf index 667f2edc2..b39587a97 100644 --- a/installer/conf/td-agent-bit.conf +++ b/installer/conf/td-agent-bit.conf @@ -19,4 +19,4 @@ EnableTelemetry true TelemetryPushIntervalSeconds 300 Match oms.container.log.* - AgentVersion ciprod10162018 + AgentVersion ciprod10162018-2 From f0b5a61ea7597d8044f0ef3347f3258996c97c39 Mon Sep 17 00:00:00 2001 From: Dilip 
Raghunathan Date: Thu, 25 Oct 2018 11:17:39 -0700 Subject: [PATCH 027/160] Telemetry Updates (#149) * Telemetry Fixes 1. Added Log Generation Rate 2. Fixed parsing bugs 3. Added code to send Exceptions/errors * PR Feedback --- source/code/go/src/plugins/oms.go | 78 +++++++++++++++++++------ source/code/go/src/plugins/out_oms.go | 3 - source/code/go/src/plugins/telemetry.go | 29 ++++++--- source/code/go/src/plugins/utils.go | 8 ++- 4 files changed, 88 insertions(+), 30 deletions(-) diff --git a/source/code/go/src/plugins/oms.go b/source/code/go/src/plugins/oms.go index 807e00937..665c3f9f2 100644 --- a/source/code/go/src/plugins/oms.go +++ b/source/code/go/src/plugins/oms.go @@ -55,15 +55,18 @@ var ( IgnoreIDSet map[string]bool // DataUpdateMutex read and write mutex access to the container id set DataUpdateMutex = &sync.Mutex{} + // ContainerLogTelemetryMutex read and write mutex access to the Container Log Telemetry + ContainerLogTelemetryMutex = &sync.Mutex{} + // ClientSet for querying KubeAPIs ClientSet *kubernetes.Clientset ) var ( // KubeSystemContainersRefreshTicker updates the kube-system containers - KubeSystemContainersRefreshTicker = time.NewTicker(time.Second * 300) + KubeSystemContainersRefreshTicker *time.Ticker // ContainerImageNameRefreshTicker updates the container image and names periodically - ContainerImageNameRefreshTicker = time.NewTicker(time.Second * 60) + ContainerImageNameRefreshTicker *time.Ticker ) var ( @@ -99,6 +102,7 @@ func createLogger() *log.Logger { fmt.Printf("File Exists. Opening file in append mode...\n") logfile, err = os.OpenFile(path, os.O_APPEND|os.O_WRONLY, 0600) if err != nil { + SendException(err.Error()) fmt.Printf(err.Error()) } } @@ -107,6 +111,7 @@ func createLogger() *log.Logger { fmt.Printf("File Doesnt Exist. 
Creating file...\n") logfile, err = os.Create(path) if err != nil { + SendException(err.Error()) fmt.Printf(err.Error()) } } @@ -134,7 +139,9 @@ func updateContainerImageNameMaps() { pods, err := ClientSet.CoreV1().Pods("").List(metav1.ListOptions{}) if err != nil { - Log("Error getting pods %s\nIt is ok to log here and continue, because the logs will be missing image and Name, but the logs will still have the containerID", err.Error()) + message := fmt.Sprintf("Error getting pods %s\nIt is ok to log here and continue, because the logs will be missing image and Name, but the logs will still have the containerID", err.Error()) + Log(message) + SendException(message) continue } @@ -171,7 +178,9 @@ func updateKubeSystemContainerIDs() { pods, err := ClientSet.CoreV1().Pods("kube-system").List(metav1.ListOptions{}) if err != nil { - Log("Error getting pods %s\nIt is ok to log here and continue. Kube-system logs will be collected", err.Error()) + message := fmt.Sprintf("Error getting pods %s\nIt is ok to log here and continue. 
Kube-system logs will be collected", err.Error()) + SendException(message) + Log(message) continue } @@ -194,17 +203,29 @@ func updateKubeSystemContainerIDs() { // PostDataHelper sends data to the OMS endpoint func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { - defer DataUpdateMutex.Unlock() - start := time.Now() var dataItems []DataItem + ignoreIDSet := make(map[string]bool) + imageIDMap := make(map[string]string) + nameIDMap := make(map[string]string) + DataUpdateMutex.Lock() + for k, v := range IgnoreIDSet { + ignoreIDSet[k] = v + } + for k, v := range ImageIDMap { + imageIDMap[k] = v + } + for k, v := range NameIDMap { + nameIDMap[k] = v + } + DataUpdateMutex.Unlock() for _, record := range tailPluginRecords { containerID := GetContainerIDFromFilePath(toString(record["filepath"])) - if containerID == "" || containsKey(IgnoreIDSet, containerID) { + if containerID == "" || containsKey(ignoreIDSet, containerID) { continue } @@ -216,13 +237,13 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { stringMap["SourceSystem"] = "Containers" stringMap["Id"] = containerID - if val, ok := ImageIDMap[containerID]; ok { + if val, ok := imageIDMap[containerID]; ok { stringMap["Image"] = val } else { Log("ContainerId %s not present in Map ", containerID) } - if val, ok := NameIDMap[containerID]; ok { + if val, ok := nameIDMap[containerID]; ok { stringMap["Name"] = val } else { Log("ContainerId %s not present in Map ", containerID) @@ -250,7 +271,9 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { marshalled, err := json.Marshal(logEntry) if err != nil { - Log("Error while Marshalling log Entry: %s", err.Error()) + message := fmt.Sprintf("Error while Marshalling log Entry: %s", err.Error()) + Log(message) + SendException(message) return output.FLB_OK } req, _ := http.NewRequest("POST", OMSEndpoint, bytes.NewBuffer(marshalled)) @@ -260,8 +283,11 @@ func PostDataHelper(tailPluginRecords 
[]map[interface{}]interface{}) int { elapsed := time.Since(start) if err != nil { - Log("Error when sending request %s \n", err.Error()) + message := fmt.Sprintf("Error when sending request %s \n", err.Error()) + Log(message) + SendException(message) Log("Failed to flush %d records after %s", len(dataItems), elapsed) + return output.FLB_RETRY } @@ -274,8 +300,10 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { numRecords := len(dataItems) Log("Successfully flushed %d records in %s", numRecords, elapsed) + ContainerLogTelemetryMutex.Lock() FlushedRecordsCount += float64(numRecords) FlushedRecordsTimeTaken += float64(elapsed / time.Millisecond) + ContainerLogTelemetryMutex.Unlock() } return output.FLB_OK @@ -318,13 +346,17 @@ func InitializePlugin(pluginConfPath string) { pluginConfig, err := ReadConfiguration(pluginConfPath) if err != nil { - Log("Error Reading plugin config path : %s \n", err.Error()) - log.Fatalf("Error Reading plugin config path : %s \n", err.Error()) + message := fmt.Sprintf("Error Reading plugin config path : %s \n", err.Error()) + Log(message) + SendException(message) + time.Sleep(30 * time.Second) + log.Fatalln(message) } omsadminConf, err := ReadConfiguration(pluginConfig["omsadmin_conf_path"]) if err != nil { Log(err.Error()) + SendException(err.Error()) log.Fatalf("Error Reading omsadmin configuration %s\n", err.Error()) } OMSEndpoint = omsadminConf["OMS_ENDPOINT"] @@ -334,7 +366,9 @@ func InitializePlugin(pluginConfPath string) { // Initialize image,name map refresh ticker containerInventoryRefreshInterval, err := strconv.Atoi(pluginConfig["container_inventory_refresh_interval"]) if err != nil { - Log("Error Reading Container Inventory Refresh Interval %s", err.Error()) + message := fmt.Sprintf("Error Reading Container Inventory Refresh Interval %s", err.Error()) + Log(message) + SendException(message) Log("Using Default Refresh Interval of %d s\n", defaultContainerInventoryRefreshInterval) 
containerInventoryRefreshInterval = defaultContainerInventoryRefreshInterval } @@ -344,7 +378,9 @@ func InitializePlugin(pluginConfPath string) { // Initialize Kube System Refresh Ticker kubeSystemContainersRefreshInterval, err := strconv.Atoi(pluginConfig["kube_system_containers_refresh_interval"]) if err != nil { - Log("Error Reading Kube System Container Ids Refresh Interval %s", err.Error()) + message := fmt.Sprintf("Error Reading Kube System Container Ids Refresh Interval %s", err.Error()) + Log(message) + SendException(message) Log("Using Default Refresh Interval of %d s\n", defaultKubeSystemContainersRefreshInterval) kubeSystemContainersRefreshInterval = defaultKubeSystemContainersRefreshInterval } @@ -356,7 +392,9 @@ func InitializePlugin(pluginConfPath string) { if err != nil { // It is ok to log here and continue, because only the Computer column will be missing, // which can be deduced from a combination of containerId, and docker logs on the node - Log("Error when reading containerHostName file %s.\n It is ok to log here and continue, because only the Computer column will be missing, which can be deduced from a combination of containerId, and docker logs on the nodes\n", err.Error()) + message := fmt.Sprintf("Error when reading containerHostName file %s.\n It is ok to log here and continue, because only the Computer column will be missing, which can be deduced from a combination of containerId, and docker logs on the nodes\n", err.Error()) + Log(message) + SendException(message) } Computer = strings.TrimSuffix(toString(containerHostName), "\n") Log("Computer == %s \n", Computer) @@ -364,12 +402,16 @@ func InitializePlugin(pluginConfPath string) { // Initialize KubeAPI Client config, err := rest.InClusterConfig() if err != nil { - Log("Error getting config %s.\nIt is ok to log here and continue, because the logs will be missing image and Name, but the logs will still have the containerID", err.Error()) + message := fmt.Sprintf("Error getting config 
%s.\nIt is ok to log here and continue, because the logs will be missing image and Name, but the logs will still have the containerID", err.Error()) + Log(message) + SendException(message) } ClientSet, err = kubernetes.NewForConfig(config) if err != nil { - Log("Error getting clientset %s.\nIt is ok to log here and continue, because the logs will be missing image and Name, but the logs will still have the containerID", err.Error()) + message := fmt.Sprintf("Error getting clientset %s.\nIt is ok to log here and continue, because the logs will be missing image and Name, but the logs will still have the containerID", err.Error()) + SendException(message) + Log(message) } PluginConfiguration = pluginConfig diff --git a/source/code/go/src/plugins/out_oms.go b/source/code/go/src/plugins/out_oms.go index 732ae5216..e2ee324e7 100644 --- a/source/code/go/src/plugins/out_oms.go +++ b/source/code/go/src/plugins/out_oms.go @@ -34,7 +34,6 @@ func FLBPluginInit(ctx unsafe.Pointer) int { //export FLBPluginFlush func FLBPluginFlush(data unsafe.Pointer, length C.int, tag *C.char) int { - var count int var ret int var record map[interface{}]interface{} var records []map[interface{}]interface{} @@ -43,7 +42,6 @@ func FLBPluginFlush(data unsafe.Pointer, length C.int, tag *C.char) int { dec := output.NewDecoder(data, int(length)) // Iterate Records - count = 0 for { // Extract Record ret, _, record = output.GetRecord(dec) @@ -51,7 +49,6 @@ func FLBPluginFlush(data unsafe.Pointer, length C.int, tag *C.char) int { break } records = append(records, record) - count++ } return PostDataHelper(records) } diff --git a/source/code/go/src/plugins/telemetry.go b/source/code/go/src/plugins/telemetry.go index b1bc4439b..72454948d 100644 --- a/source/code/go/src/plugins/telemetry.go +++ b/source/code/go/src/plugins/telemetry.go @@ -34,13 +34,14 @@ const ( envACSResourceName = "ACS_RESOURCE_NAME" envAppInsightsAuth = "APPLICATIONINSIGHTS_AUTH" metricNameAvgFlushRate = 
"ContainerLogAvgRecordsFlushedPerSec" + metricNameAvgLogGenerationRate = "ContainerLogsGeneratedPerSec" defaultTelemetryPushIntervalSeconds = 300 eventNameContainerLogInit = "ContainerLogPluginInitialized" eventNameDaemonSetHeartbeat = "ContainerLogDaemonSetHeartbeatEvent" ) -// Initialize initializes the telemetry artifacts +// initialize initializes the telemetry artifacts func initialize(telemetryPushIntervalProperty string, agentVersion string) (int, error) { telemetryPushInterval, err := strconv.Atoi(telemetryPushIntervalProperty) @@ -87,7 +88,7 @@ func initialize(telemetryPushIntervalProperty string, agentVersion string) (int, CommonProperties["ACSResourceName"] = "" CommonProperties["AKS_RESOURCE_ID"] = aksResourceID splitStrings := strings.Split(aksResourceID, "/") - if len(aksResourceID) > 0 && len(aksResourceID) < 10 { + if len(splitStrings) > 0 && len(splitStrings) < 10 { CommonProperties["SubscriptionID"] = splitStrings[2] CommonProperties["ResourceGroupName"] = splitStrings[4] CommonProperties["ClusterName"] = splitStrings[8] @@ -110,19 +111,24 @@ func SendContainerLogFlushRateMetric(telemetryPushIntervalProperty string, agent Log("Error During Telemetry Initialization :%s", err.Error()) runtime.Goexit() } - + start := time.Now() SendEvent(eventNameContainerLogInit, make(map[string]string)) for ; true; <-ContainerLogTelemetryTicker.C { SendEvent(eventNameDaemonSetHeartbeat, make(map[string]string)) - DataUpdateMutex.Lock() + elapsed := time.Since(start) + ContainerLogTelemetryMutex.Lock() flushRate := FlushedRecordsCount / FlushedRecordsTimeTaken * 1000 - Log("Flushed Records : %f Time Taken : %f flush Rate : %f", FlushedRecordsCount, FlushedRecordsTimeTaken, flushRate) + logRate := FlushedRecordsCount / float64(elapsed/time.Second) FlushedRecordsCount = 0.0 FlushedRecordsTimeTaken = 0.0 - DataUpdateMutex.Unlock() - metric := appinsights.NewMetricTelemetry(metricNameAvgFlushRate, flushRate) - TelemetryClient.Track(metric) + 
ContainerLogTelemetryMutex.Unlock() + + flushRateMetric := appinsights.NewMetricTelemetry(metricNameAvgFlushRate, flushRate) + TelemetryClient.Track(flushRateMetric) + logRateMetric := appinsights.NewMetricTelemetry(metricNameAvgLogGenerationRate, logRate) + TelemetryClient.Track(logRateMetric) + start = time.Now() } } @@ -138,3 +144,10 @@ func SendEvent(eventName string, dimensions map[string]string) { TelemetryClient.Track(event) } + +// SendException send an event to the configured app insights instance +func SendException(err interface{}) { + if TelemetryClient != nil { + TelemetryClient.TrackException(err) + } +} diff --git a/source/code/go/src/plugins/utils.go b/source/code/go/src/plugins/utils.go index 1ac9b05a9..94db033bd 100644 --- a/source/code/go/src/plugins/utils.go +++ b/source/code/go/src/plugins/utils.go @@ -3,6 +3,7 @@ package main import ( "bufio" "crypto/tls" + "fmt" "log" "net/http" "os" @@ -19,7 +20,9 @@ func ReadConfiguration(filename string) (map[string]string, error) { file, err := os.Open(filename) if err != nil { + SendException(err) log.Fatal(err) + return nil, err } defer file.Close() @@ -39,6 +42,7 @@ func ReadConfiguration(filename string) (map[string]string, error) { } if err := scanner.Err(); err != nil { + SendException(err) log.Fatal(err) return nil, err } @@ -51,7 +55,9 @@ func CreateHTTPClient() { cert, err := tls.LoadX509KeyPair(PluginConfiguration["cert_file_path"], PluginConfiguration["key_file_path"]) if err != nil { - Log("Error when loading cert %s", err.Error()) + message := fmt.Sprintf("Error when loading cert %s", err.Error()) + SendException(message) + Log(message) log.Fatalf("Error when loading cert %s", err.Error()) } From a58998ec5a03b3a4bd502a9fb7be5e0bdfd3eee2 Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Tue, 30 Oct 2018 09:52:36 -0700 Subject: [PATCH 028/160] Changes to send omsagent/omsagent-rs kubectl logs to App Insights (#159) * Changes to send omsagent/omsagent-rs kubectl logs to App Insights * PR 
Feedback --- installer/conf/td-agent-bit.conf | 9 +++ source/code/go/src/plugins/oms.go | 37 +++++---- source/code/go/src/plugins/out_oms.go | 12 ++- source/code/go/src/plugins/telemetry.go | 102 +++++++++++++----------- source/code/go/src/plugins/utils.go | 21 ++++- 5 files changed, 107 insertions(+), 74 deletions(-) diff --git a/installer/conf/td-agent-bit.conf b/installer/conf/td-agent-bit.conf index b39587a97..2a6199987 100644 --- a/installer/conf/td-agent-bit.conf +++ b/installer/conf/td-agent-bit.conf @@ -14,6 +14,15 @@ Path_Key filepath Skip_Long_Lines On +[INPUT] + Name tail + Tag oms.container.log.flbplugin.* + Path /var/log/containers/omsagent*.log + DB /var/opt/microsoft/docker-cimprov/state/omsagent-ai.db + Mem_Buf_Limit 30m + Path_Key filepath + Skip_Long_Lines On + [OUTPUT] Name oms EnableTelemetry true diff --git a/source/code/go/src/plugins/oms.go b/source/code/go/src/plugins/oms.go index 665c3f9f2..e0abaea1f 100644 --- a/source/code/go/src/plugins/oms.go +++ b/source/code/go/src/plugins/oms.go @@ -223,7 +223,7 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { for _, record := range tailPluginRecords { - containerID := GetContainerIDFromFilePath(toString(record["filepath"])) + containerID := GetContainerIDFromFilePath(ToString(record["filepath"])) if containerID == "" || containsKey(ignoreIDSet, containerID) { continue @@ -231,9 +231,9 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { stringMap := make(map[string]string) - stringMap["LogEntry"] = toString(record["log"]) - stringMap["LogEntrySource"] = toString(record["stream"]) - stringMap["LogEntryTimeStamp"] = toString(record["time"]) + stringMap["LogEntry"] = ToString(record["log"]) + stringMap["LogEntrySource"] = ToString(record["stream"]) + stringMap["LogEntryTimeStamp"] = ToString(record["time"]) stringMap["SourceSystem"] = "Containers" stringMap["Id"] = containerID @@ -314,16 +314,6 @@ func containsKey(currentMap map[string]bool, key 
string) bool { return c } -func toString(s interface{}) string { - switch t := s.(type) { - case []byte: - // prevent encoding to base64 - return string(t) - default: - return "" - } -} - // GetContainerIDFromFilePath Gets the container ID From the file Path func GetContainerIDFromFilePath(filepath string) string { start := strings.LastIndex(filepath, "-") @@ -338,12 +328,19 @@ func GetContainerIDFromFilePath(filepath string) string { } // InitializePlugin reads and populates plugin configuration -func InitializePlugin(pluginConfPath string) { +func InitializePlugin(pluginConfPath string, agentVersion string) { IgnoreIDSet = make(map[string]bool) ImageIDMap = make(map[string]string) NameIDMap = make(map[string]string) + ret, err := InitializeTelemetryClient(agentVersion) + if ret != 0 || err != nil { + message := fmt.Sprintf("Error During Telemetry Initialization :%s", err.Error()) + fmt.Printf(message) + Log(message) + } + pluginConfig, err := ReadConfiguration(pluginConfPath) if err != nil { message := fmt.Sprintf("Error Reading plugin config path : %s \n", err.Error()) @@ -355,9 +352,11 @@ func InitializePlugin(pluginConfPath string) { omsadminConf, err := ReadConfiguration(pluginConfig["omsadmin_conf_path"]) if err != nil { - Log(err.Error()) - SendException(err.Error()) - log.Fatalf("Error Reading omsadmin configuration %s\n", err.Error()) + message := fmt.Sprintf("Error Reading omsadmin configuration %s\n", err.Error()) + Log(message) + SendException(message) + time.Sleep(30 * time.Second) + log.Fatalln(message) } OMSEndpoint = omsadminConf["OMS_ENDPOINT"] WorkspaceID = omsadminConf["WORKSPACE_ID"] @@ -396,7 +395,7 @@ func InitializePlugin(pluginConfPath string) { Log(message) SendException(message) } - Computer = strings.TrimSuffix(toString(containerHostName), "\n") + Computer = strings.TrimSuffix(ToString(containerHostName), "\n") Log("Computer == %s \n", Computer) // Initialize KubeAPI Client diff --git a/source/code/go/src/plugins/out_oms.go 
b/source/code/go/src/plugins/out_oms.go index e2ee324e7..133e0f039 100644 --- a/source/code/go/src/plugins/out_oms.go +++ b/source/code/go/src/plugins/out_oms.go @@ -19,12 +19,12 @@ func FLBPluginRegister(ctx unsafe.Pointer) int { // ctx (context) pointer to fluentbit context (state/ c code) func FLBPluginInit(ctx unsafe.Pointer) int { Log("Initializing out_oms go plugin for fluentbit") - InitializePlugin(ContainerLogPluginConfFilePath) + agentVersion := output.FLBPluginConfigKey(ctx, "AgentVersion") + InitializePlugin(ContainerLogPluginConfFilePath, agentVersion) enableTelemetry := output.FLBPluginConfigKey(ctx, "EnableTelemetry") if strings.Compare(strings.ToLower(enableTelemetry), "true") == 0 { telemetryPushInterval := output.FLBPluginConfigKey(ctx, "TelemetryPushIntervalSeconds") - agentVersion := output.FLBPluginConfigKey(ctx, "AgentVersion") - go SendContainerLogFlushRateMetric(telemetryPushInterval, agentVersion) + go SendContainerLogPluginMetrics(telemetryPushInterval) } else { Log("Telemetry is not enabled for the plugin %s \n", output.FLBPluginConfigKey(ctx, "Name")) return output.FLB_OK @@ -50,6 +50,12 @@ func FLBPluginFlush(data unsafe.Pointer, length C.int, tag *C.char) int { } records = append(records, record) } + + incomingTag := C.GoString(tag) + if strings.Contains(strings.ToLower(incomingTag), "oms.container.log.flbplugin") { + return PushToAppInsightsTraces(records) + } + return PostDataHelper(records) } diff --git a/source/code/go/src/plugins/telemetry.go b/source/code/go/src/plugins/telemetry.go index 72454948d..d943c8eda 100644 --- a/source/code/go/src/plugins/telemetry.go +++ b/source/code/go/src/plugins/telemetry.go @@ -4,12 +4,12 @@ import ( "encoding/base64" "errors" "os" - "runtime" "strconv" "strings" "time" "github.com/Microsoft/ApplicationInsights-Go/appinsights" + "github.com/fluent/fluent-bit-go/output" ) var ( @@ -41,8 +41,8 @@ const ( eventNameDaemonSetHeartbeat = "ContainerLogDaemonSetHeartbeatEvent" ) -// initialize initializes 
the telemetry artifacts -func initialize(telemetryPushIntervalProperty string, agentVersion string) (int, error) { +// SendContainerLogPluginMetrics is a go-routine that flushes the data periodically (every 5 mins to App Insights) +func SendContainerLogPluginMetrics(telemetryPushIntervalProperty string) { telemetryPushInterval, err := strconv.Atoi(telemetryPushIntervalProperty) if err != nil { @@ -52,6 +52,49 @@ func initialize(telemetryPushIntervalProperty string, agentVersion string) (int, ContainerLogTelemetryTicker = time.NewTicker(time.Second * time.Duration(telemetryPushInterval)) + start := time.Now() + SendEvent(eventNameContainerLogInit, make(map[string]string)) + + for ; true; <-ContainerLogTelemetryTicker.C { + SendEvent(eventNameDaemonSetHeartbeat, make(map[string]string)) + elapsed := time.Since(start) + ContainerLogTelemetryMutex.Lock() + flushRate := FlushedRecordsCount / FlushedRecordsTimeTaken * 1000 + logRate := FlushedRecordsCount / float64(elapsed/time.Second) + FlushedRecordsCount = 0.0 + FlushedRecordsTimeTaken = 0.0 + ContainerLogTelemetryMutex.Unlock() + + flushRateMetric := appinsights.NewMetricTelemetry(metricNameAvgFlushRate, flushRate) + TelemetryClient.Track(flushRateMetric) + logRateMetric := appinsights.NewMetricTelemetry(metricNameAvgLogGenerationRate, logRate) + TelemetryClient.Track(logRateMetric) + start = time.Now() + } +} + +// SendEvent sends an event to App Insights +func SendEvent(eventName string, dimensions map[string]string) { + Log("Sending Event : %s\n", eventName) + event := appinsights.NewEventTelemetry(eventName) + + // add any extra Properties + for k, v := range dimensions { + event.Properties[k] = v + } + + TelemetryClient.Track(event) +} + +// SendException send an event to the configured app insights instance +func SendException(err interface{}) { + if TelemetryClient != nil { + TelemetryClient.TrackException(err) + } +} + +// InitializeTelemetryClient sets up the telemetry client to send telemetry to the App 
Insights instance +func InitializeTelemetryClient(agentVersion string) (int, error) { encodedIkey := os.Getenv(envAppInsightsAuth) if encodedIkey == "" { Log("Environment Variable Missing \n") @@ -103,51 +146,14 @@ func initialize(telemetryPushIntervalProperty string, agentVersion string) (int, return 0, nil } -// SendContainerLogFlushRateMetric is a go-routine that flushes the data periodically (every 5 mins to App Insights) -func SendContainerLogFlushRateMetric(telemetryPushIntervalProperty string, agentVersion string) { - - ret, err := initialize(telemetryPushIntervalProperty, agentVersion) - if ret != 0 || err != nil { - Log("Error During Telemetry Initialization :%s", err.Error()) - runtime.Goexit() - } - start := time.Now() - SendEvent(eventNameContainerLogInit, make(map[string]string)) - - for ; true; <-ContainerLogTelemetryTicker.C { - SendEvent(eventNameDaemonSetHeartbeat, make(map[string]string)) - elapsed := time.Since(start) - ContainerLogTelemetryMutex.Lock() - flushRate := FlushedRecordsCount / FlushedRecordsTimeTaken * 1000 - logRate := FlushedRecordsCount / float64(elapsed/time.Second) - FlushedRecordsCount = 0.0 - FlushedRecordsTimeTaken = 0.0 - ContainerLogTelemetryMutex.Unlock() - - flushRateMetric := appinsights.NewMetricTelemetry(metricNameAvgFlushRate, flushRate) - TelemetryClient.Track(flushRateMetric) - logRateMetric := appinsights.NewMetricTelemetry(metricNameAvgLogGenerationRate, logRate) - TelemetryClient.Track(logRateMetric) - start = time.Now() - } -} - -// SendEvent sends an event to App Insights -func SendEvent(eventName string, dimensions map[string]string) { - Log("Sending Event : %s\n", eventName) - event := appinsights.NewEventTelemetry(eventName) - - // add any extra Properties - for k, v := range dimensions { - event.Properties[k] = v +// PushToAppInsightsTraces sends the log lines as trace messages to the configured App Insights Instance +func PushToAppInsightsTraces(records []map[interface{}]interface{}) int { + var logLines 
[]string + for _, record := range records { + logLines = append(logLines, ToString(record["log"])) } - TelemetryClient.Track(event) -} - -// SendException send an event to the configured app insights instance -func SendException(err interface{}) { - if TelemetryClient != nil { - TelemetryClient.TrackException(err) - } + traceEntry := strings.Join(logLines, "\n") + TelemetryClient.TrackTrace(traceEntry, 1) + return output.FLB_OK } diff --git a/source/code/go/src/plugins/utils.go b/source/code/go/src/plugins/utils.go index 94db033bd..91e433a0f 100644 --- a/source/code/go/src/plugins/utils.go +++ b/source/code/go/src/plugins/utils.go @@ -8,6 +8,7 @@ import ( "net/http" "os" "strings" + "time" ) // ReadConfiguration reads a property file @@ -21,8 +22,8 @@ func ReadConfiguration(filename string) (map[string]string, error) { file, err := os.Open(filename) if err != nil { SendException(err) - log.Fatal(err) - + time.Sleep(30 * time.Second) + fmt.Printf("%s", err.Error()) return nil, err } defer file.Close() @@ -43,7 +44,8 @@ func ReadConfiguration(filename string) (map[string]string, error) { if err := scanner.Err(); err != nil { SendException(err) - log.Fatal(err) + time.Sleep(30 * time.Second) + log.Fatalf("%s", err.Error()) return nil, err } @@ -52,11 +54,11 @@ func ReadConfiguration(filename string) (map[string]string, error) { // CreateHTTPClient used to create the client for sending post requests to OMSEndpoint func CreateHTTPClient() { - cert, err := tls.LoadX509KeyPair(PluginConfiguration["cert_file_path"], PluginConfiguration["key_file_path"]) if err != nil { message := fmt.Sprintf("Error when loading cert %s", err.Error()) SendException(message) + time.Sleep(30 * time.Second) Log(message) log.Fatalf("Error when loading cert %s", err.Error()) } @@ -72,3 +74,14 @@ func CreateHTTPClient() { Log("Successfully created HTTP Client") } + +// ToString converts an interface into a string +func ToString(s interface{}) string { + switch t := s.(type) { + case []byte: + // 
prevent encoding to base64 + return string(t) + default: + return "" + } +} From 4c2da9f831d5aa39edc3c0096ad639f3c01243a1 Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Mon, 5 Nov 2018 15:46:02 -0800 Subject: [PATCH 029/160] Rashmi/fluentd docker inventory (#160) * first stab * changes * changes * docker util changes * working tested util * input plugin and conf * changes * changes * changes * changes * changes * working containerinventory * fixing omi removal from container.conf * removing comments * file write and read * deleted containers working * changes * changes * socket timeout * deleting test files * adding log * fixing comment * appinsights changes * changes * tel changes * changes * changes * changes * changes * lib changes * changes * changes * fixes * PR comments * changes * updating the ownership * changes * changes * changes to container data * removing comment * changes * adding collection time * bug fix * env string truncation * changes for acs-engine test --- installer/conf/container.conf | 46 +-- installer/datafiles/base_container.data | 61 +++- .../code/plugin/ApplicationInsightsUtility.rb | 142 ++++++++++ source/code/plugin/ContainerInventoryState.rb | 65 +++++ source/code/plugin/DockerApiClient.rb | 162 +++++++++++ source/code/plugin/DockerApiRestHelper.rb | 55 ++++ source/code/plugin/in_containerinventory.rb | 266 ++++++++++++++++++ .../code/plugin/lib/application_insights.rb | 9 + .../channel/asynchronous_queue.rb | 58 ++++ .../channel/asynchronous_sender.rb | 133 +++++++++ .../channel/contracts/application.rb | 13 + .../channel/contracts/availability_data.rb | 34 +++ .../channel/contracts/base.rb | 13 + .../channel/contracts/cloud.rb | 14 + .../channel/contracts/data.rb | 14 + .../channel/contracts/data_point.rb | 25 ++ .../channel/contracts/data_point_type.rb | 7 + .../channel/contracts/dependency_kind.rb | 9 + .../contracts/dependency_source_type.rb | 9 + .../channel/contracts/device.rb | 18 ++ .../channel/contracts/domain.rb | 
10 + .../channel/contracts/envelope.rb | 32 +++ .../channel/contracts/event_data.rb | 28 ++ .../channel/contracts/exception_data.rb | 35 +++ .../channel/contracts/exception_details.rb | 28 ++ .../channel/contracts/internal.rb | 15 + .../channel/contracts/json_serializable.rb | 59 ++++ .../channel/contracts/location.rb | 13 + .../channel/contracts/message_data.rb | 24 ++ .../channel/contracts/metric_data.rb | 27 ++ .../channel/contracts/operation.rb | 17 ++ .../channel/contracts/page_view_data.rb | 33 +++ .../channel/contracts/page_view_perf_data.rb | 39 +++ .../contracts/remote_dependency_data.rb | 40 +++ .../channel/contracts/reopenings.rb | 27 ++ .../channel/contracts/request_data.rb | 35 +++ .../channel/contracts/session.rb | 14 + .../channel/contracts/severity_level.rb | 13 + .../channel/contracts/stack_frame.rb | 17 ++ .../channel/contracts/user.rb | 15 + .../lib/application_insights/channel/event.rb | 68 +++++ .../channel/queue_base.rb | 73 +++++ .../channel/sender_base.rb | 88 ++++++ .../channel/synchronous_queue.rb | 45 +++ .../channel/synchronous_sender.rb | 17 ++ .../channel/telemetry_channel.rb | 131 +++++++++ .../channel/telemetry_context.rb | 85 ++++++ .../rack/track_request.rb | 154 ++++++++++ .../application_insights/telemetry_client.rb | 232 +++++++++++++++ .../unhandled_exception.rb | 49 ++++ .../lib/application_insights/version.rb | 3 + 51 files changed, 2581 insertions(+), 38 deletions(-) create mode 100644 source/code/plugin/ApplicationInsightsUtility.rb create mode 100644 source/code/plugin/ContainerInventoryState.rb create mode 100644 source/code/plugin/DockerApiClient.rb create mode 100644 source/code/plugin/DockerApiRestHelper.rb create mode 100644 source/code/plugin/in_containerinventory.rb create mode 100644 source/code/plugin/lib/application_insights.rb create mode 100644 source/code/plugin/lib/application_insights/channel/asynchronous_queue.rb create mode 100644 source/code/plugin/lib/application_insights/channel/asynchronous_sender.rb 
create mode 100644 source/code/plugin/lib/application_insights/channel/contracts/application.rb create mode 100644 source/code/plugin/lib/application_insights/channel/contracts/availability_data.rb create mode 100644 source/code/plugin/lib/application_insights/channel/contracts/base.rb create mode 100644 source/code/plugin/lib/application_insights/channel/contracts/cloud.rb create mode 100644 source/code/plugin/lib/application_insights/channel/contracts/data.rb create mode 100644 source/code/plugin/lib/application_insights/channel/contracts/data_point.rb create mode 100644 source/code/plugin/lib/application_insights/channel/contracts/data_point_type.rb create mode 100644 source/code/plugin/lib/application_insights/channel/contracts/dependency_kind.rb create mode 100644 source/code/plugin/lib/application_insights/channel/contracts/dependency_source_type.rb create mode 100644 source/code/plugin/lib/application_insights/channel/contracts/device.rb create mode 100644 source/code/plugin/lib/application_insights/channel/contracts/domain.rb create mode 100644 source/code/plugin/lib/application_insights/channel/contracts/envelope.rb create mode 100644 source/code/plugin/lib/application_insights/channel/contracts/event_data.rb create mode 100644 source/code/plugin/lib/application_insights/channel/contracts/exception_data.rb create mode 100644 source/code/plugin/lib/application_insights/channel/contracts/exception_details.rb create mode 100644 source/code/plugin/lib/application_insights/channel/contracts/internal.rb create mode 100644 source/code/plugin/lib/application_insights/channel/contracts/json_serializable.rb create mode 100644 source/code/plugin/lib/application_insights/channel/contracts/location.rb create mode 100644 source/code/plugin/lib/application_insights/channel/contracts/message_data.rb create mode 100644 source/code/plugin/lib/application_insights/channel/contracts/metric_data.rb create mode 100644 
source/code/plugin/lib/application_insights/channel/contracts/operation.rb create mode 100644 source/code/plugin/lib/application_insights/channel/contracts/page_view_data.rb create mode 100644 source/code/plugin/lib/application_insights/channel/contracts/page_view_perf_data.rb create mode 100644 source/code/plugin/lib/application_insights/channel/contracts/remote_dependency_data.rb create mode 100644 source/code/plugin/lib/application_insights/channel/contracts/reopenings.rb create mode 100644 source/code/plugin/lib/application_insights/channel/contracts/request_data.rb create mode 100644 source/code/plugin/lib/application_insights/channel/contracts/session.rb create mode 100644 source/code/plugin/lib/application_insights/channel/contracts/severity_level.rb create mode 100644 source/code/plugin/lib/application_insights/channel/contracts/stack_frame.rb create mode 100644 source/code/plugin/lib/application_insights/channel/contracts/user.rb create mode 100644 source/code/plugin/lib/application_insights/channel/event.rb create mode 100644 source/code/plugin/lib/application_insights/channel/queue_base.rb create mode 100644 source/code/plugin/lib/application_insights/channel/sender_base.rb create mode 100644 source/code/plugin/lib/application_insights/channel/synchronous_queue.rb create mode 100644 source/code/plugin/lib/application_insights/channel/synchronous_sender.rb create mode 100644 source/code/plugin/lib/application_insights/channel/telemetry_channel.rb create mode 100644 source/code/plugin/lib/application_insights/channel/telemetry_context.rb create mode 100644 source/code/plugin/lib/application_insights/rack/track_request.rb create mode 100644 source/code/plugin/lib/application_insights/telemetry_client.rb create mode 100644 source/code/plugin/lib/application_insights/unhandled_exception.rb create mode 100644 source/code/plugin/lib/application_insights/version.rb diff --git a/installer/conf/container.conf b/installer/conf/container.conf index 
17317871c..798bd8eb6 100755 --- a/installer/conf/container.conf +++ b/installer/conf/container.conf @@ -9,22 +9,10 @@ # Container inventory - type omi - run_interval 60s - tag oms.container.containerinventory - items [ - ["root/cimv2","Container_ContainerInventory"] - ] - - -# Image inventory - - type omi - run_interval 60s - tag oms.container.imageinventory - items [ - ["root/cimv2","Container_ImageInventory"] - ] + type containerinventory + tag oms.containerinsights.containerinventory + run_interval 60s + log_level debug # Container host inventory @@ -45,11 +33,6 @@ log_level debug -# Filter for correct format to endpoint - - type filter_container - - type out_oms_api log_level debug @@ -63,33 +46,22 @@ max_retry_wait 9m - + type out_oms log_level debug + num_threads 5 buffer_chunk_limit 20m buffer_type file buffer_path %STATE_DIR_WS%/out_oms_containerinventory*.buffer buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk flush_interval 20s retry_limit 10 - retry_wait 15s - max_retry_wait 9m - - - - type out_oms - log_level debug - buffer_chunk_limit 20m - buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_imageinventory*.buffer - buffer_queue_limit 20 - flush_interval 20s - retry_limit 10 - retry_wait 15s + retry_wait 30s max_retry_wait 9m - + type out_oms log_level debug num_threads 5 diff --git a/installer/datafiles/base_container.data b/installer/datafiles/base_container.data index 85a128b2a..7181929e2 100644 --- a/installer/datafiles/base_container.data +++ b/installer/datafiles/base_container.data @@ -37,6 +37,57 @@ MAINTAINER: 'Microsoft Corporation' /opt/microsoft/omsagent/plugin/in_kube_services.rb; source/code/plugin/in_kube_services.rb; 644; root; root /opt/microsoft/omsagent/plugin/in_kube_nodes.rb; source/code/plugin/in_kube_nodes.rb; 644; root; root +/opt/microsoft/omsagent/plugin/ApplicationInsightsUtility.rb; source/code/plugin/ApplicationInsightsUtility.rb; 644; root; root 
+/opt/microsoft/omsagent/plugin/ContainerInventoryState.rb; source/code/plugin/ContainerInventoryState.rb; 644; root; root +/opt/microsoft/omsagent/plugin/DockerApiClient.rb; source/code/plugin/DockerApiClient.rb; 644; root; root +/opt/microsoft/omsagent/plugin/DockerApiRestHelper.rb; source/code/plugin/DockerApiRestHelper.rb; 644; root; root +/opt/microsoft/omsagent/plugin/in_containerinventory.rb; source/code/plugin/in_containerinventory.rb; 644; root; root + +/opt/microsoft/omsagent/plugin/lib/application_insights/version.rb; source/code/plugin/lib/application_insights/version.rb; 644; root; root +/opt/microsoft/omsagent/plugin/lib/application_insights/rack/track_request.rb; source/code/plugin/lib/application_insights/rack/track_request.rb; 644; root; root +/opt/microsoft/omsagent/plugin/lib/application_insights/unhandled_exception.rb; source/code/plugin/lib/application_insights/unhandled_exception.rb; 644; root; root +/opt/microsoft/omsagent/plugin/lib/application_insights/telemetry_client.rb; source/code/plugin/lib/application_insights/telemetry_client.rb; 644; root; root +/opt/microsoft/omsagent/plugin/lib/application_insights/channel/queue_base.rb; source/code/plugin/lib/application_insights/channel/queue_base.rb; 644; root; root +/opt/microsoft/omsagent/plugin/lib/application_insights/channel/asynchronous_queue.rb; source/code/plugin/lib/application_insights/channel/asynchronous_queue.rb; 644; root; root +/opt/microsoft/omsagent/plugin/lib/application_insights/channel/synchronous_sender.rb; source/code/plugin/lib/application_insights/channel/synchronous_sender.rb; 644; root; root +/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/data_point_type.rb; source/code/plugin/lib/application_insights/channel/contracts/data_point_type.rb; 644; root; root +/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/data_point.rb; source/code/plugin/lib/application_insights/channel/contracts/data_point.rb; 644; root; root 
+/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/stack_frame.rb; source/code/plugin/lib/application_insights/channel/contracts/stack_frame.rb; 644; root; root +/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/request_data.rb; source/code/plugin/lib/application_insights/channel/contracts/request_data.rb; 644; root; root +/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/session.rb; source/code/plugin/lib/application_insights/channel/contracts/session.rb; 644; root; root +/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/page_view_data.rb; source/code/plugin/lib/application_insights/channel/contracts/page_view_data.rb; 644; root; root +/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/remote_dependency_data.rb; source/code/plugin/lib/application_insights/channel/contracts/remote_dependency_data.rb; 644; root; root +/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/exception_data.rb; source/code/plugin/lib/application_insights/channel/contracts/exception_data.rb; 644; root; root +/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/location.rb; source/code/plugin/lib/application_insights/channel/contracts/location.rb; 644; root; root +/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/operation.rb; source/code/plugin/lib/application_insights/channel/contracts/operation.rb; 644; root; root +/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/data.rb; source/code/plugin/lib/application_insights/channel/contracts/data.rb; 644; root; root +/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/event_data.rb; source/code/plugin/lib/application_insights/channel/contracts/event_data.rb; 644; root; root +/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/metric_data.rb; 
source/code/plugin/lib/application_insights/channel/contracts/metric_data.rb; 644; root; root +/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/device.rb; source/code/plugin/lib/application_insights/channel/contracts/device.rb; 644; root; root +/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/message_data.rb; source/code/plugin/lib/application_insights/channel/contracts/message_data.rb; 644; root; root +/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/dependency_source_type.rb; source/code/plugin/lib/application_insights/channel/contracts/dependency_source_type.rb; 644; root; root +/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/user.rb; source/code/plugin/lib/application_insights/channel/contracts/user.rb; 644; root; root +/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/severity_level.rb; source/code/plugin/lib/application_insights/channel/contracts/severity_level.rb; 644; root; root +/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/application.rb; source/code/plugin/lib/application_insights/channel/contracts/application.rb; 644; root; root +/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/dependency_kind.rb; source/code/plugin/lib/application_insights/channel/contracts/dependency_kind.rb; 644; root; root +/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/cloud.rb; source/code/plugin/lib/application_insights/channel/contracts/cloud.rb; 644; root; root +/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/envelope.rb; source/code/plugin/lib/application_insights/channel/contracts/envelope.rb; 644; root; root +/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/json_serializable.rb; source/code/plugin/lib/application_insights/channel/contracts/json_serializable.rb; 644; root; root 
+/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/domain.rb; source/code/plugin/lib/application_insights/channel/contracts/domain.rb; 644; root; root +/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/base.rb; source/code/plugin/lib/application_insights/channel/contracts/base.rb; 644; root; root +/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/reopenings.rb; source/code/plugin/lib/application_insights/channel/contracts/reopenings.rb; 644; root; root +/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/page_view_perf_data.rb; source/code/plugin/lib/application_insights/channel/contracts/page_view_perf_data.rb; 644; root; root +/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/internal.rb; source/code/plugin/lib/application_insights/channel/contracts/internal.rb; 644; root; root +/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/availability_data.rb; source/code/plugin/lib/application_insights/channel/contracts/availability_data.rb; 644; root; root +/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/exception_details.rb; source/code/plugin/lib/application_insights/channel/contracts/exception_details.rb; 644; root; root +/opt/microsoft/omsagent/plugin/lib/application_insights/channel/synchronous_queue.rb; source/code/plugin/lib/application_insights/channel/synchronous_queue.rb; 644; root; root +/opt/microsoft/omsagent/plugin/lib/application_insights/channel/sender_base.rb; source/code/plugin/lib/application_insights/channel/sender_base.rb; 644; root; root +/opt/microsoft/omsagent/plugin/lib/application_insights/channel/telemetry_context.rb; source/code/plugin/lib/application_insights/channel/telemetry_context.rb; 644; root; root +/opt/microsoft/omsagent/plugin/lib/application_insights/channel/asynchronous_sender.rb; source/code/plugin/lib/application_insights/channel/asynchronous_sender.rb; 644; root; 
root +/opt/microsoft/omsagent/plugin/lib/application_insights/channel/telemetry_channel.rb; source/code/plugin/lib/application_insights/channel/telemetry_channel.rb; 644; root; root +/opt/microsoft/omsagent/plugin/lib/application_insights/channel/event.rb; source/code/plugin/lib/application_insights/channel/event.rb; 644; root; root +/opt/microsoft/omsagent/plugin/lib/application_insights.rb; source/code/plugin/lib/application_insights.rb; 644; root; root + /opt/td-agent-bit/bin/out_oms.so; intermediate/${{BUILD_CONFIGURATION}}/out_oms.so; 755; root; root /etc/opt/microsoft/docker-cimprov/td-agent-bit.conf; installer/conf/td-agent-bit.conf; 644; root; root /etc/opt/microsoft/docker-cimprov/out_oms.conf; installer/conf/out_oms.conf; 644; root; root @@ -75,12 +126,17 @@ MAINTAINER: 'Microsoft Corporation' /var/opt/microsoft/docker-cimprov; 755; root; root /var/opt/microsoft/docker-cimprov/state; 755; root; root /var/opt/microsoft/docker-cimprov/state/ContainerInventory; 755; root; root -/var/opt/microsoft/docker-cimprov/state/ImageInventory; 755; root; root /var/opt/microsoft/docker-cimprov/log; 755; root; root /opt/td-agent-bit; 755; root; root;sysdir /opt/td-agent-bit/bin; 755; root; root;sysdir +/opt/microsoft/omsagent/plugin/lib; 755; root; root; sysdir +/opt/microsoft/omsagent/plugin/lib/application_insights; 755; root; root; sysdir +/opt/microsoft/omsagent/plugin/lib/application_insights/channel; 755; root; root; sysdir +/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts; 755; root; root; sysdir +/opt/microsoft/omsagent/plugin/lib/application_insights/rack; 755; root; root; sysdir + %Dependencies %Postinstall_10 @@ -90,6 +146,9 @@ WriteInstallInfo() { } WriteInstallInfo +#Make omsagent owner for ContainerInventory directory. 
This is needed for ruby plugin to have access +chown omsagent:omsagent /var/opt/microsoft/docker-cimprov/state/ContainerInventory + # Get the state file in place with proper permissions touch /var/opt/microsoft/docker-cimprov/state/LastEventQueryTime.txt chmod 644 /var/opt/microsoft/docker-cimprov/state/LastEventQueryTime.txt diff --git a/source/code/plugin/ApplicationInsightsUtility.rb b/source/code/plugin/ApplicationInsightsUtility.rb new file mode 100644 index 000000000..14fc9f2f8 --- /dev/null +++ b/source/code/plugin/ApplicationInsightsUtility.rb @@ -0,0 +1,142 @@ +#!/usr/local/bin/ruby +# frozen_string_literal: true + +class ApplicationInsightsUtility + require_relative 'lib/application_insights' + require_relative 'omslog' + require_relative 'DockerApiClient' + require 'json' + require 'base64' + + @@HeartBeat = 'HeartBeatEvent' + @@Exception = 'ExceptionEvent' + @@AcsClusterType = 'ACS' + @@AksClusterType = 'AKS' + @@DaemonsetControllerType = 'DaemonSet' + @OmsAdminFilePath = '/etc/opt/microsoft/omsagent/conf/omsadmin.conf' + @@EnvAcsResourceName = 'ACS_RESOURCE_NAME' + @@EnvAksRegion = 'AKS_REGION' + @@EnvAgentVersion = 'AGENT_VERSION' + @@EnvApplicationInsightsKey = 'APPLICATIONINSIGHTS_AUTH' + @@CustomProperties = {} + @@Tc = nil + + def initialize + end + + class << self + #Set default properties for telemetry event + def initializeUtility() + begin + resourceInfo = ENV['AKS_RESOURCE_ID'] + if resourceInfo.nil? || resourceInfo.empty? 
+ @@CustomProperties["ACSResourceName"] = ENV[@@EnvAcsResourceName] + @@CustomProperties["ClusterType"] = @@AcsClusterType + @@CustomProperties["SubscriptionID"] = "" + @@CustomProperties["ResourceGroupName"] = "" + @@CustomProperties["ClusterName"] = "" + @@CustomProperties["Region"] = "" + else + @@CustomProperties["AKS_RESOURCE_ID"] = resourceInfo + begin + splitStrings = resourceInfo.split('/') + subscriptionId = splitStrings[2] + resourceGroupName = splitStrings[4] + clusterName = splitStrings[8] + rescue => errorStr + $log.warn("Exception in AppInsightsUtility: parsing AKS resourceId: #{resourceInfo}, error: #{errorStr}") + end + @@CustomProperties["ClusterType"] = @@AksClusterType + @@CustomProperties["SubscriptionID"] = subscriptionId + @@CustomProperties["ResourceGroupName"] = resourceGroupName + @@CustomProperties["ClusterName"] = clusterName + @@CustomProperties["Region"] = ENV[@@EnvAksRegion] + end + @@CustomProperties['ControllerType'] = @@DaemonsetControllerType + dockerInfo = DockerApiClient.dockerInfo + @@CustomProperties['DockerVersion'] = dockerInfo['Version'] + @@CustomProperties['DockerApiVersion'] = dockerInfo['ApiVersion'] + @@CustomProperties['WorkspaceID'] = getWorkspaceId + @@CustomProperties['AgentVersion'] = ENV[@@EnvAgentVersion] + encodedAppInsightsKey = ENV[@@EnvApplicationInsightsKey] + if !encodedAppInsightsKey.nil? + decodedAppInsightsKey = Base64.decode64(encodedAppInsightsKey) + @@Tc = ApplicationInsights::TelemetryClient.new decodedAppInsightsKey + end + rescue => errorStr + $log.warn("Exception in AppInsightsUtility: initilizeUtility - error: #{errorStr}") + end + end + + def sendHeartBeatEvent(pluginName) + begin + eventName = pluginName + @@HeartBeat + if !(@@Tc.nil?) 
+ @@Tc.track_event eventName , :properties => @@CustomProperties + @@Tc.flush + $log.info("AppInsights Heartbeat Telemetry sent successfully") + end + rescue =>errorStr + $log.warn("Exception in AppInsightsUtility: sendHeartBeatEvent - error: #{errorStr}") + end + end + + def sendCustomEvent(pluginName, properties) + begin + if !(@@Tc.nil?) + @@Tc.track_metric 'LastProcessedContainerInventoryCount', properties['ContainerCount'], + :kind => ApplicationInsights::Channel::Contracts::DataPointType::MEASUREMENT, + :properties => @@CustomProperties + @@Tc.flush + $log.info("AppInsights Container Count Telemetry sent successfully") + end + rescue => errorStr + $log.warn("Exception in AppInsightsUtility: sendCustomEvent - error: #{errorStr}") + end + end + + def sendExceptionTelemetry(errorStr) + begin + if @@CustomProperties.empty? || @@CustomProperties.nil? + initializeUtility + end + if !(@@Tc.nil?) + @@Tc.track_exception errorStr , :properties => @@CustomProperties + @@Tc.flush + $log.info("AppInsights Exception Telemetry sent successfully") + end + rescue => errorStr + $log.warn("Exception in AppInsightsUtility: sendExceptionTelemetry - error: #{errorStr}") + end + end + + #Method to send heartbeat and container inventory count + def sendTelemetry(pluginName, properties) + begin + if @@CustomProperties.empty? || @@CustomProperties.nil? 
+ initializeUtility + end + @@CustomProperties['Computer'] = properties['Computer'] + sendHeartBeatEvent(pluginName) + sendCustomEvent(pluginName, properties) + rescue => errorStr + $log.warn("Exception in AppInsightsUtility: sendTelemetry - error: #{errorStr}") + end + end + + def getWorkspaceId() + begin + adminConf = {} + confFile = File.open(@OmsAdminFilePath, "r") + confFile.each_line do |line| + splitStrings = line.split('=') + adminConf[splitStrings[0]] = splitStrings[1] + end + workspaceId = adminConf['WORKSPACE_ID'] + return workspaceId + rescue => errorStr + $log.warn("Exception in AppInsightsUtility: getWorkspaceId - error: #{errorStr}") + end + end + end +end \ No newline at end of file diff --git a/source/code/plugin/ContainerInventoryState.rb b/source/code/plugin/ContainerInventoryState.rb new file mode 100644 index 000000000..7e5ca18e8 --- /dev/null +++ b/source/code/plugin/ContainerInventoryState.rb @@ -0,0 +1,65 @@ +#!/usr/local/bin/ruby +# frozen_string_literal: true + +class ContainerInventoryState + require 'json' + require_relative 'omslog' + @@InventoryDirectory = "/var/opt/microsoft/docker-cimprov/state/ContainerInventory/" + + def initialize + end + + class << self + # Write the container information to disk with the data that is obtained from the current plugin execution + def writeContainerState(container) + containerId = container['InstanceID'] + if !containerId.nil? && !containerId.empty? + begin + file = File.open(@@InventoryDirectory + containerId, "w") + if !file.nil? + file.write(container.to_json) + file.close + else + $log.warn("Exception while opening file with id: #{containerId}") + end + rescue => errorStr + $log.warn("Exception in writeContainerState: #{errorStr}") + end + end + end + + # Reads the container state for the deleted container + def readContainerState(containerId) + begin + containerObject = nil + filepath = @@InventoryDirectory + containerId + file = File.open(filepath, "r") + if !file.nil? 
+ fileContents = file.read + containerObject = JSON.parse(fileContents) + file.close + # Delete the file since the state is update to deleted + File.delete(filepath) if File.exist?(filepath) + else + $log.warn("Open file for container with id returned nil: #{containerId}") + end + rescue => errorStr + $log.warn("Exception in readContainerState: #{errorStr}") + end + return containerObject + end + + # Gets the containers that were written to the disk with the previous plugin invocation but do not exist in the current container list + # Doing this because we need to update the container state to deleted. Else this will stay running forever. + def getDeletedContainers(containerIds) + deletedContainers = nil + begin + previousContainerList = Dir.entries(@@InventoryDirectory) - [".", ".."] + deletedContainers = previousContainerList - containerIds + rescue => errorStr + $log.warn("Exception in getDeletedContainers: #{errorStr}") + end + return deletedContainers + end + end +end \ No newline at end of file diff --git a/source/code/plugin/DockerApiClient.rb b/source/code/plugin/DockerApiClient.rb new file mode 100644 index 000000000..b93411980 --- /dev/null +++ b/source/code/plugin/DockerApiClient.rb @@ -0,0 +1,162 @@ +#!/usr/local/bin/ruby +# frozen_string_literal: true + +class DockerApiClient + + require 'socket' + require 'json' + require 'timeout' + require_relative 'omslog' + require_relative 'DockerApiRestHelper' + require_relative 'ApplicationInsightsUtility' + + @@SocketPath = "/var/run/docker.sock" + @@ChunkSize = 4096 + @@TimeoutInSeconds = 5 + @@PluginName = 'ContainerInventory' + def initialize + end + + class << self + # Make docker socket call for requests + def getResponse(request, isMultiJson) + begin + socket = UNIXSocket.new(@@SocketPath) + dockerResponse = "" + isTimeOut = false + socket.write(request) + # iterate through the response until the last chunk is less than the chunk size so that we can read all data in socket. 
+ loop do + begin + responseChunk = "" + timeout(@@TimeoutInSeconds) do + responseChunk = socket.recv(@@ChunkSize) + end + dockerResponse += responseChunk + rescue Timeout::Error + $log.warn("Socket read timedout for request: #{request} @ #{Time.now.utc.iso8601}") + isTimeOut = true + end + break if responseChunk.length < @@ChunkSize + end + socket.close + return (isTimeOut)? nil : parseResponse(dockerResponse, isMultiJson) + rescue => errorStr + $log.warn("Socket call failed for request: #{request} error: #{errorStr} , isMultiJson: #{isMultiJson} @ #{Time.now.utc.iso8601}") + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + end + + def parseResponse(dockerResponse, isMultiJson) + # Doing this because the response is in the raw format and includes headers. + # Need to do a regex match to extract the json part of the response - Anything between [{}] in response + parsedJsonResponse = nil + begin + jsonResponse = isMultiJson ? dockerResponse[/\[{.+}\]/] : dockerResponse[/{.+}/] + rescue => errorStr + $log.warn("Regex match for docker response failed: #{errorStr} , isMultiJson: #{isMultiJson} @ #{Time.now.utc.iso8601}") + end + begin + if jsonResponse != nil + parsedJsonResponse = JSON.parse(jsonResponse) + end + rescue => errorStr + $log.warn("Json parsing for docker response failed: #{errorStr} , isMultiJson: #{isMultiJson} @ #{Time.now.utc.iso8601}") + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + return parsedJsonResponse + end + + + def getDockerHostName() + dockerHostName = "" + request = DockerApiRestHelper.restDockerInfo + response = getResponse(request, false) + if (response != nil) + dockerHostName = response['Name'] + end + return dockerHostName + end + + def listContainers() + ids = [] + request = DockerApiRestHelper.restDockerPs + containers = getResponse(request, true) + if !containers.nil? && !containers.empty? 
+ containers.each do |container| + ids.push(container['Id']) + end + end + return ids + end + + # This method splits the tag value into an array - repository, image and tag + def getImageRepositoryImageTag(tagValue) + result = ["", "", ""] + begin + if !tagValue.empty? + # Find delimiters in the string of format repository/image:imagetag + slashLocation = tagValue.index('/') + colonLocation = tagValue.index(':') + if !colonLocation.nil? + if slashLocation.nil? + # image:imagetag + result[1] = tagValue[0..(colonLocation-1)] + else + # repository/image:imagetag + result[0] = tagValue[0..(slashLocation-1)] + result[1] = tagValue[(slashLocation + 1)..(colonLocation - 1)] + end + result[2] = tagValue[(colonLocation + 1)..-1] + end + end + rescue => errorStr + $log.warn("Exception at getImageRepositoryImageTag: #{errorStr} @ #{Time.now.utc.iso8601}") + end + return result + end + + # Image is in the format repository/image:imagetag - This method creates a hash of image id and repository, image and tag + def getImageIdMap() + result = nil + begin + request = DockerApiRestHelper.restDockerImages + images = getResponse(request, true) + if !images.nil? && !images.empty? + result = {} + images.each do |image| + tagValue = "" + tags = image['RepoTags'] + if !tags.nil? && tags.kind_of?(Array) && tags.length > 0 + tagValue = tags[0] + end + idValue = image['Id'] + if !idValue.nil? 
+ result[idValue] = getImageRepositoryImageTag(tagValue) + end + end + end + rescue => errorStr + $log.warn("Exception at getImageIdMap: #{errorStr} @ #{Time.now.utc.iso8601}") + end + return result + end + + def dockerInspectContainer(id) + request = DockerApiRestHelper.restDockerInspect(id) + return getResponse(request, false) + end + + # This method returns docker version and docker api version for telemetry + def dockerInfo() + request = DockerApiRestHelper.restDockerVersion + response = getResponse(request, false) + dockerInfo = {} + if (response != nil) + dockerInfo['Version'] = response['Version'] + dockerInfo['ApiVersion'] = response['ApiVersion'] + end + return dockerInfo + end + end +end \ No newline at end of file diff --git a/source/code/plugin/DockerApiRestHelper.rb b/source/code/plugin/DockerApiRestHelper.rb new file mode 100644 index 000000000..76361b122 --- /dev/null +++ b/source/code/plugin/DockerApiRestHelper.rb @@ -0,0 +1,55 @@ +#!/usr/local/bin/ruby +# frozen_string_literal: true + +class DockerApiRestHelper + def initialize + end + + class << self + # Create the REST request to list images + # https://docs.docker.com/engine/reference/api/docker_remote_api_v1.21/#list-images + # returns Request in string format + def restDockerImages() + begin + return "GET /images/json?all=0 HTTP/1.1\r\nHost: localhost\r\n\r\n"; + end + end + + # Create the REST request to list containers + # https://docs.docker.com/engine/reference/api/docker_remote_api_v1.21/#list-containers + # returns Request in string format + def restDockerPs() + begin + return "GET /containers/json?all=1 HTTP/1.1\r\nHost: localhost\r\n\r\n"; + end + end + + # Create the REST request to inspect a container + # https://docs.docker.com/engine/reference/api/docker_remote_api_v1.21/#inspect-a-container + # parameter - ID of the container to be inspected + # returns Request in string format + def restDockerInspect(id) + begin + return "GET /containers/" + id + "/json HTTP/1.1\r\nHost: 
localhost\r\n\r\n"; + end + end + + # Create the REST request to get docker info + # https://docs.docker.com/engine/reference/api/docker_remote_api_v1.21/#get-container-stats-based-on-resource-usage + # returns Request in string format + def restDockerInfo() + begin + return "GET /info HTTP/1.1\r\nHost: localhost\r\n\r\n"; + end + end + + # Create the REST request to get docker info + # https://docs.docker.com/engine/api/v1.21/#21-containers + # returns Request in string format + def restDockerVersion() + begin + return "GET /version HTTP/1.1\r\nHost: localhost\r\n\r\n"; + end + end + end +end \ No newline at end of file diff --git a/source/code/plugin/in_containerinventory.rb b/source/code/plugin/in_containerinventory.rb new file mode 100644 index 000000000..43811e1e1 --- /dev/null +++ b/source/code/plugin/in_containerinventory.rb @@ -0,0 +1,266 @@ +#!/usr/local/bin/ruby +# frozen_string_literal: true + +module Fluent + + class Container_Inventory_Input < Input + Plugin.register_input('containerinventory', self) + + @@PluginName = 'ContainerInventory' + @@RunningState = 'Running' + @@FailedState = 'Failed' + @@StoppedState = 'Stopped' + @@PausedState = 'Paused' + + def initialize + super + require 'json' + require_relative 'DockerApiClient' + require_relative 'ContainerInventoryState' + require_relative 'ApplicationInsightsUtility' + require_relative 'omslog' + end + + config_param :run_interval, :time, :default => '1m' + config_param :tag, :string, :default => "oms.containerinsights.containerinventory" + + def configure (conf) + super + end + + def start + if @run_interval + @finished = false + @condition = ConditionVariable.new + @mutex = Mutex.new + @thread = Thread.new(&method(:run_periodic)) + @@telemetryTimeTracker = DateTime.now.to_time.to_i + end + end + + def shutdown + if @run_interval + @mutex.synchronize { + @finished = true + @condition.signal + } + @thread.join + end + end + + def obtainContainerConfig(instance, container) + begin + configValue = 
container['Config'] + if !configValue.nil? + instance['ContainerHostname'] = configValue['Hostname'] + + envValue = configValue['Env'] + envValueString = (envValue.nil?) ? "" : envValue.to_s + # Restricting the ENV string value to 200kb since the size of this string can go very high + if envValueString.length > 200000 + envValueStringTruncated = envValueString.slice(0..200000) + lastIndex = envValueStringTruncated.rindex("\", ") + if !lastIndex.nil? + envValueStringTruncated = envValueStringTruncated.slice(0..lastIndex) + "]" + end + instance['EnvironmentVar'] = envValueStringTruncated + else + instance['EnvironmentVar'] = envValueString + end + + cmdValue = configValue['Cmd'] + cmdValueString = (cmdValue.nil?) ? "" : cmdValue.to_s + instance['Command'] = cmdValueString + + instance['ComposeGroup'] = "" + labelsValue = configValue['Labels'] + if !labelsValue.nil? && !labelsValue.empty? + instance['ComposeGroup'] = labelsValue['com.docker.compose.project'] + end + else + $log.warn("Attempt in ObtainContainerConfig to get container: #{container['Id']} config information returned null") + end + rescue => errorStr + $log.warn("Exception in obtainContainerConfig: #{errorStr}") + end + end + + def obtainContainerState(instance, container) + begin + stateValue = container['State'] + if !stateValue.nil? 
+ exitCodeValue = stateValue['ExitCode'] + # Exit codes less than 0 are not supported by the engine + if exitCodeValue < 0 + exitCodeValue = 128 + $log.info("obtainContainerState::Container: #{container['Id']} returned negative exit code") + end + instance['ExitCode'] = exitCodeValue + if exitCodeValue > 0 + instance['State'] = @@FailedState + else + # Set the Container status : Running/Paused/Stopped + runningValue = stateValue['Running'] + if runningValue + pausedValue = stateValue['Paused'] + # Checking for paused within running is true state because docker returns true for both Running and Paused fields when the container is paused + if pausedValue + instance['State'] = @@PausedState + else + instance['State'] = @@RunningState + end + else + instance['State'] = @@StoppedState + end + end + instance['StartedTime'] = stateValue['StartedAt'] + instance['FinishedTime'] = stateValue['FinishedAt'] + else + $log.info("Attempt in ObtainContainerState to get container: #{container['Id']} state information returned null") + end + rescue => errorStr + $log.warn("Exception in obtainContainerState: #{errorStr}") + end + end + + def obtainContainerHostConfig(instance, container) + begin + hostConfig = container['HostConfig'] + if !hostConfig.nil? + links = hostConfig['Links'] + instance['Links'] = "" + if !links.nil? + linksString = links.to_s + instance['Links'] = (linksString == "null")? "" : linksString + end + portBindings = hostConfig['PortBindings'] + instance['Ports'] = "" + if !portBindings.nil? + portBindingsString = portBindings.to_s + instance['Ports'] = (portBindingsString == "null")? 
"" : portBindingsString + end + else + $log.info("Attempt in ObtainContainerHostConfig to get container: #{container['Id']} host config information returned null") + end + rescue => errorStr + $log.warn("Exception in obtainContainerHostConfig: #{errorStr}") + end + end + + def inspectContainer(id, nameMap) + containerInstance = {} + begin + container = DockerApiClient.dockerInspectContainer(id) + if !container.nil? && !container.empty? + containerInstance['InstanceID'] = container['Id'] + containerInstance['CreatedTime'] = container['Created'] + containerName = container['Name'] + if !containerName.nil? && !containerName.empty? + # Remove the leading / from the name if it exists (this is an API issue) + containerInstance['ElementName'] = (containerName[0] == '/') ? containerName[1..-1] : containerName + end + imageValue = container['Image'] + if !imageValue.nil? && !imageValue.empty? + containerInstance['ImageId'] = imageValue + repoImageTagArray = nameMap[imageValue] + if nameMap.has_key? imageValue + containerInstance['Repository'] = repoImageTagArray[0] + containerInstance['Image'] = repoImageTagArray[1] + containerInstance['ImageTag'] = repoImageTagArray[2] + end + end + obtainContainerConfig(containerInstance, container); + obtainContainerState(containerInstance, container); + obtainContainerHostConfig(containerInstance, container); + end + rescue => errorStr + $log.warn("Exception in inspectContainer: #{errorStr} for container: #{id}") + end + return containerInstance + end + + def enumerate + currentTime = Time.now + emitTime = currentTime.to_f + batchTime = currentTime.utc.iso8601 + containerInventory = Array.new + $log.info("in_container_inventory::enumerate : Begin processing @ #{Time.now.utc.iso8601}") + hostname = DockerApiClient.getDockerHostName + begin + containerIds = DockerApiClient.listContainers + if !containerIds.empty? 
+ eventStream = MultiEventStream.new + nameMap = DockerApiClient.getImageIdMap + containerIds.each do |containerId| + inspectedContainer = {} + inspectedContainer = inspectContainer(containerId, nameMap) + inspectedContainer['Computer'] = hostname + inspectedContainer['CollectionTime'] = batchTime #This is the time that is mapped to become TimeGenerated + containerInventory.push inspectedContainer + ContainerInventoryState.writeContainerState(inspectedContainer) + end + # Update the state for deleted containers + deletedContainers = ContainerInventoryState.getDeletedContainers(containerIds) + if !deletedContainers.nil? && !deletedContainers.empty? + deletedContainers.each do |deletedContainer| + container = ContainerInventoryState.readContainerState(deletedContainer) + if !container.nil? + container.each{|k,v| container[k]=v} + container['State'] = "Deleted" + containerInventory.push container + end + end + end + + containerInventory.each do |record| + wrapper = { + "DataType"=>"CONTAINER_INVENTORY_BLOB", + "IPName"=>"ContainerInsights", + "DataItems"=>[record.each{|k,v| record[k]=v}] + } + eventStream.add(emitTime, wrapper) if wrapper + end + router.emit_stream(@tag, eventStream) if eventStream + @@istestvar = ENV['ISTEST'] + if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp('true') == 0 && eventStream.count > 0) + $log.info("containerInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end + timeDifference = (DateTime.now.to_time.to_i - @@telemetryTimeTracker).abs + timeDifferenceInMinutes = timeDifference/60 + if (timeDifferenceInMinutes >= 5) + @@telemetryTimeTracker = DateTime.now.to_time.to_i + telemetryProperties = {} + telemetryProperties['Computer'] = hostname + telemetryProperties['ContainerCount'] = containerInventory.length + ApplicationInsightsUtility.sendTelemetry(@@PluginName, telemetryProperties) + end + $log.info("in_container_inventory::enumerate : Processing complete - emitted stream @ #{Time.now.utc.iso8601}") + end + rescue => errorStr + $log.warn("Exception in enumerate container inventory: #{errorStr}") + end + end + + def run_periodic + @mutex.lock + done = @finished + until done + @condition.wait(@mutex, @run_interval) + done = @finished + @mutex.unlock + if !done + begin + $log.info("in_container_inventory::run_periodic @ #{Time.now.utc.iso8601}") + enumerate + rescue => errorStr + $log.warn "in_container_inventory::run_periodic: Failed in enumerate container inventory: #{errorStr}" + end + end + @mutex.lock + end + @mutex.unlock + end + + end # Container_Inventory_Input + +end # module \ No newline at end of file diff --git a/source/code/plugin/lib/application_insights.rb b/source/code/plugin/lib/application_insights.rb new file mode 100644 index 000000000..0a683d484 --- /dev/null +++ b/source/code/plugin/lib/application_insights.rb @@ -0,0 +1,9 @@ +require_relative 'application_insights/telemetry_client' +require_relative 'application_insights/unhandled_exception' +require_relative 'application_insights/version' + +module ApplicationInsights + module Rack + autoload :TrackRequest, "application_insights/rack/track_request" + end +end diff --git a/source/code/plugin/lib/application_insights/channel/asynchronous_queue.rb 
b/source/code/plugin/lib/application_insights/channel/asynchronous_queue.rb new file mode 100644 index 000000000..333f6968b --- /dev/null +++ b/source/code/plugin/lib/application_insights/channel/asynchronous_queue.rb @@ -0,0 +1,58 @@ +require_relative 'event' +require_relative 'queue_base' + +module ApplicationInsights + module Channel + # An asynchronous queue for use in conjunction with the {AsynchronousSender}. + # The queue will notify the sender that it needs to pick up items when it + # reaches {#max_queue_length}, or when the consumer calls {#flush} via the + # {#flush_notification} event. + # + # @example + # require 'application_insights' + # require 'thread' + # queue = ApplicationInsights::Channel::AsynchronousQueue.new nil + # Thread.new do + # sleep 1 + # queue.push 1 + # queue.flush + # end + # queue.flush_notification.wait + # queue.flush_notification.clear + # result = queue.pop + class AsynchronousQueue < QueueBase + # Initializes a new instance of the class. + # @param [SenderBase] sender the sender object that will be used in + # conjunction with this queue. In addition to the sender object must + # support a {AsynchronousSender#start} method which is invoked each time + # an item is pushed to the queue as well as use the {#flush_notification} + # event. + def initialize(sender) + @flush_notification = Event.new + super sender + end + + # The flush notification {ApplicationInsights::Channel::Event} that the {#sender} + # will use to get notified that a flush is needed. + # @return [Event] object that the {#sender} can wait on. + attr_reader :flush_notification + + # Adds the passed in item object to the queue and notifies the {#sender} + # to start an asynchronous send operation + # by calling {AsynchronousSender#start}. + # @param [Contracts::Envelope] item the telemetry envelope object to send + # to the service. 
+ def push(item) + super item + @sender.start if @sender + end + + # Flushes the current queue by notifying the {#sender} via the + # {#flush_notification} event. + def flush + @flush_notification.set + @sender.start if @sender + end + end + end +end diff --git a/source/code/plugin/lib/application_insights/channel/asynchronous_sender.rb b/source/code/plugin/lib/application_insights/channel/asynchronous_sender.rb new file mode 100644 index 000000000..da573f08c --- /dev/null +++ b/source/code/plugin/lib/application_insights/channel/asynchronous_sender.rb @@ -0,0 +1,133 @@ +require_relative 'sender_base' +require 'thread' + +module ApplicationInsights + module Channel + # An asynchronous sender that works in conjunction with the {AsynchronousQueue}. + # The sender object will start a worker thread that will pull items from the + # {#queue}. The thread will be created when the client calls {#start} and + # will check for queue items every {#send_interval} seconds. The worker thread + # can also be forced to check the queue by setting the + # {AsynchronousQueue#flush_notification} event. + # + # - If no items are found, the thread will go back to sleep. + # - If items are found, the worker thread will send items to the specified + # service in batches of {#send_buffer_size}. + # + # If no queue items are found for {#send_time} seconds, the worker thread + # will shut down (and {#start} will need to be called again). + class AsynchronousSender < SenderBase + SERVICE_ENDPOINT_URI = 'https://dc.services.visualstudio.com/v2/track' + # Initializes a new instance of the class. + # @param [String] service_endpoint_uri the address of the service to send + # telemetry data to. 
+ def initialize(service_endpoint_uri = SERVICE_ENDPOINT_URI) + @send_interval = 1.0 + @send_remaining_time = 0 + @send_time = 3.0 + @lock_work_thread = Mutex.new + @work_thread = nil + @start_notification_processed = true + super service_endpoint_uri + end + + # The time span in seconds at which the the worker thread will check the + # {#queue} for items (defaults to: 1.0). + # @return [Fixnum] the interval in seconds. + attr_accessor :send_interval + + # The time span in seconds for which the worker thread will stay alive if + # no items are found in the {#queue} (defaults to 3.0). + # @return [Fixnum] the interval in seconds. + attr_accessor :send_time + + # The worker thread which checks queue items and send data every + # (#send_interval) seconds or upon flush. + # @return [Thread] the work thread + attr_reader :work_thread + + # Calling this method will create a worker thread that checks the {#queue} + # every {#send_interval} seconds for a total duration of {#send_time} + # seconds for new items. If a worker thread has already been created, + # calling this method does nothing. + def start + @start_notification_processed = false + # Maintain one working thread at one time + unless @work_thread + @lock_work_thread.synchronize do + unless @work_thread + local_send_interval = [@send_interval, 0.1].max + @send_remaining_time = [@send_time, local_send_interval].max + @work_thread = Thread.new { run } + @work_thread.abort_on_exception = false + end + end + end + end + + private + + def run + # save the queue locally + local_queue = @queue + if local_queue.nil? 
+ @work_thread = nil + return + end + + begin + # fix up the send interval (can't be lower than 100ms) + local_send_interval = [@send_interval, 0.1].max + + while true + @start_notification_processed = true + while true + # get at most @send_buffer_size items from the queue + data = [] + @send_buffer_size.downto(1) do + item = local_queue.pop + break if not item + data.push item + end + + # if we didn't get any items from the queue, we're done here + break if data.length == 0 + + # reset the send time + @send_remaining_time = @send_time + + # finally send the data + send data + end + + # wait at most @send_interval ms (or until we get signalled) + result = local_queue.flush_notification.wait local_send_interval + if result + local_queue.flush_notification.clear + next + end + + # decrement the remaining time + @send_remaining_time -= local_send_interval + # If remaining time <=0 and there is no start notification unprocessed, + # then stop the working thread + if @send_remaining_time <= 0 && @start_notification_processed + # Note: there is still a chance some start notification could be + # missed, e.g., the start method got triggered between the above and + # following line. However the data is not lost as it would be + # processed later when next start notification comes after the worker + # thread stops. The cost to ensure no notification miss is high where + # a lock is required each time the start method calls. 
+ @work_thread = nil + break + end + end + rescue Exception => e + # Make sure work_thread sets to nil when it terminates abnormally + @work_thread = nil + @logger.error('application_insights') { "Asynchronous sender work thread terminated abnormally: #{e.to_s}" } + end + end + end + end +end diff --git a/source/code/plugin/lib/application_insights/channel/contracts/application.rb b/source/code/plugin/lib/application_insights/channel/contracts/application.rb new file mode 100644 index 000000000..071c37385 --- /dev/null +++ b/source/code/plugin/lib/application_insights/channel/contracts/application.rb @@ -0,0 +1,13 @@ +require_relative 'json_serializable' + +module ApplicationInsights::Channel::Contracts + class Application + include JsonSerializable + + attr_accessor :ver + + attribute_mapping( + ver: 'ai.application.ver' + ) + end +end diff --git a/source/code/plugin/lib/application_insights/channel/contracts/availability_data.rb b/source/code/plugin/lib/application_insights/channel/contracts/availability_data.rb new file mode 100644 index 000000000..d560dd15b --- /dev/null +++ b/source/code/plugin/lib/application_insights/channel/contracts/availability_data.rb @@ -0,0 +1,34 @@ +require_relative 'json_serializable' + +module ApplicationInsights::Channel::Contracts + class AvailabilityData + include JsonSerializable + + attr_accessor :ver, :id, :name, :duration, :success, :run_location, :message, + :properties, :measurements + + attribute_mapping( + ver: 'ver', + id: 'id', + name: 'name', + duration: 'duration', + success: 'success', + run_location: 'runLocation', + message: 'message', + properties: 'properties', + measurements: 'measurements' + ) + + def ver + @ver ||= 2 + end + + def properties + @properties ||= {} + end + + def measurements + @measurements ||= {} + end + end +end diff --git a/source/code/plugin/lib/application_insights/channel/contracts/base.rb b/source/code/plugin/lib/application_insights/channel/contracts/base.rb new file mode 100644 index 
000000000..bb88a4625 --- /dev/null +++ b/source/code/plugin/lib/application_insights/channel/contracts/base.rb @@ -0,0 +1,13 @@ +require_relative 'json_serializable' + +module ApplicationInsights::Channel::Contracts + class Base + include JsonSerializable + + attr_accessor :base_type + + attribute_mapping( + base_type: 'baseType' + ) + end +end diff --git a/source/code/plugin/lib/application_insights/channel/contracts/cloud.rb b/source/code/plugin/lib/application_insights/channel/contracts/cloud.rb new file mode 100644 index 000000000..5aaeeee04 --- /dev/null +++ b/source/code/plugin/lib/application_insights/channel/contracts/cloud.rb @@ -0,0 +1,14 @@ +require_relative 'json_serializable' + +module ApplicationInsights::Channel::Contracts + class Cloud + include JsonSerializable + + attr_accessor :role, :role_instance + + attribute_mapping( + role: 'ai.cloud.role', + role_instance: 'ai.cloud.roleInstance' + ) + end +end diff --git a/source/code/plugin/lib/application_insights/channel/contracts/data.rb b/source/code/plugin/lib/application_insights/channel/contracts/data.rb new file mode 100644 index 000000000..c7184edfd --- /dev/null +++ b/source/code/plugin/lib/application_insights/channel/contracts/data.rb @@ -0,0 +1,14 @@ +require_relative 'json_serializable' + +module ApplicationInsights::Channel::Contracts + class Data + include JsonSerializable + + attr_accessor :base_type, :base_data + + attribute_mapping( + base_type: 'baseType', + base_data: 'baseData' + ) + end +end diff --git a/source/code/plugin/lib/application_insights/channel/contracts/data_point.rb b/source/code/plugin/lib/application_insights/channel/contracts/data_point.rb new file mode 100644 index 000000000..6556b351b --- /dev/null +++ b/source/code/plugin/lib/application_insights/channel/contracts/data_point.rb @@ -0,0 +1,25 @@ +require_relative 'json_serializable' +require_relative 'data_point_type' + +module ApplicationInsights::Channel::Contracts + class DataPoint + include JsonSerializable + + 
attr_accessor :ns, :name, :kind, :value, :count, :min, :max, :std_dev + + attribute_mapping( + ns: 'ns', + name: 'name', + kind: 'kind', + value: 'value', + count: 'count', + min: 'min', + max: 'max', + std_dev: 'stdDev' + ) + + def kind + @kind ||= DataPointType::MEASUREMENT + end + end +end diff --git a/source/code/plugin/lib/application_insights/channel/contracts/data_point_type.rb b/source/code/plugin/lib/application_insights/channel/contracts/data_point_type.rb new file mode 100644 index 000000000..f9816e4a9 --- /dev/null +++ b/source/code/plugin/lib/application_insights/channel/contracts/data_point_type.rb @@ -0,0 +1,7 @@ +module ApplicationInsights::Channel::Contracts + class DataPointType + MEASUREMENT = 0 + + AGGREGATION = 1 + end +end diff --git a/source/code/plugin/lib/application_insights/channel/contracts/dependency_kind.rb b/source/code/plugin/lib/application_insights/channel/contracts/dependency_kind.rb new file mode 100644 index 000000000..38a441499 --- /dev/null +++ b/source/code/plugin/lib/application_insights/channel/contracts/dependency_kind.rb @@ -0,0 +1,9 @@ +module ApplicationInsights::Channel::Contracts + class DependencyKind + SQL = 0 + + HTTP = 1 + + OTHER = 2 + end +end diff --git a/source/code/plugin/lib/application_insights/channel/contracts/dependency_source_type.rb b/source/code/plugin/lib/application_insights/channel/contracts/dependency_source_type.rb new file mode 100644 index 000000000..a68dad72b --- /dev/null +++ b/source/code/plugin/lib/application_insights/channel/contracts/dependency_source_type.rb @@ -0,0 +1,9 @@ +module ApplicationInsights::Channel::Contracts + class DependencySourceType + UNDEFINED = 0 + + AIC = 1 + + APMC = 2 + end +end diff --git a/source/code/plugin/lib/application_insights/channel/contracts/device.rb b/source/code/plugin/lib/application_insights/channel/contracts/device.rb new file mode 100644 index 000000000..af6855102 --- /dev/null +++ 
b/source/code/plugin/lib/application_insights/channel/contracts/device.rb @@ -0,0 +1,18 @@ +require_relative 'json_serializable' + +module ApplicationInsights::Channel::Contracts + class Device + include JsonSerializable + + attr_accessor :id, :locale, :model, :oem_name, :os_version, :type + + attribute_mapping( + id: 'ai.device.id', + locale: 'ai.device.locale', + model: 'ai.device.model', + oem_name: 'ai.device.oemName', + os_version: 'ai.device.osVersion', + type: 'ai.device.type' + ) + end +end diff --git a/source/code/plugin/lib/application_insights/channel/contracts/domain.rb b/source/code/plugin/lib/application_insights/channel/contracts/domain.rb new file mode 100644 index 000000000..8a7ba880d --- /dev/null +++ b/source/code/plugin/lib/application_insights/channel/contracts/domain.rb @@ -0,0 +1,10 @@ +require_relative 'json_serializable' + +module ApplicationInsights::Channel::Contracts + class Domain + include JsonSerializable + + attribute_mapping( + ) + end +end diff --git a/source/code/plugin/lib/application_insights/channel/contracts/envelope.rb b/source/code/plugin/lib/application_insights/channel/contracts/envelope.rb new file mode 100644 index 000000000..b8608e388 --- /dev/null +++ b/source/code/plugin/lib/application_insights/channel/contracts/envelope.rb @@ -0,0 +1,32 @@ +require_relative 'json_serializable' + +module ApplicationInsights::Channel::Contracts + class Envelope + include JsonSerializable + + attr_accessor :ver, :name, :time, :sample_rate, :seq, :i_key, :tags, :data + + attribute_mapping( + ver: 'ver', + name: 'name', + time: 'time', + sample_rate: 'sampleRate', + seq: 'seq', + i_key: 'iKey', + tags: 'tags', + data: 'data' + ) + + def ver + @ver ||= 1 + end + + def sample_rate + @sample_rate ||= 100.0 + end + + def tags + @tags ||= {} + end + end +end diff --git a/source/code/plugin/lib/application_insights/channel/contracts/event_data.rb b/source/code/plugin/lib/application_insights/channel/contracts/event_data.rb new file mode 100644 
index 000000000..4bfb16124 --- /dev/null +++ b/source/code/plugin/lib/application_insights/channel/contracts/event_data.rb @@ -0,0 +1,28 @@ +require_relative 'json_serializable' + +module ApplicationInsights::Channel::Contracts + class EventData + include JsonSerializable + + attr_accessor :ver, :name, :properties, :measurements + + attribute_mapping( + ver: 'ver', + name: 'name', + properties: 'properties', + measurements: 'measurements' + ) + + def ver + @ver ||= 2 + end + + def properties + @properties ||= {} + end + + def measurements + @measurements ||= {} + end + end +end diff --git a/source/code/plugin/lib/application_insights/channel/contracts/exception_data.rb b/source/code/plugin/lib/application_insights/channel/contracts/exception_data.rb new file mode 100644 index 000000000..5cffd1253 --- /dev/null +++ b/source/code/plugin/lib/application_insights/channel/contracts/exception_data.rb @@ -0,0 +1,35 @@ +require_relative 'json_serializable' + +module ApplicationInsights::Channel::Contracts + class ExceptionData + include JsonSerializable + + attr_accessor :ver, :exceptions, :severity_level, :problem_id, :properties, + :measurements + + attribute_mapping( + ver: 'ver', + exceptions: 'exceptions', + severity_level: 'severityLevel', + problem_id: 'problemId', + properties: 'properties', + measurements: 'measurements' + ) + + def ver + @ver ||= 2 + end + + def exceptions + @exceptions ||= [] + end + + def properties + @properties ||= {} + end + + def measurements + @measurements ||= {} + end + end +end diff --git a/source/code/plugin/lib/application_insights/channel/contracts/exception_details.rb b/source/code/plugin/lib/application_insights/channel/contracts/exception_details.rb new file mode 100644 index 000000000..85bfc6282 --- /dev/null +++ b/source/code/plugin/lib/application_insights/channel/contracts/exception_details.rb @@ -0,0 +1,28 @@ +require_relative 'json_serializable' + +module ApplicationInsights::Channel::Contracts + class ExceptionDetails + 
include JsonSerializable + + attr_accessor :id, :outer_id, :type_name, :message, :has_full_stack, :stack, + :parsed_stack + + attribute_mapping( + id: 'id', + outer_id: 'outerId', + type_name: 'typeName', + message: 'message', + has_full_stack: 'hasFullStack', + stack: 'stack', + parsed_stack: 'parsedStack' + ) + + def has_full_stack + @has_full_stack.nil? ? true : @has_full_stack + end + + def parsed_stack + @parsed_stack ||= [] + end + end +end diff --git a/source/code/plugin/lib/application_insights/channel/contracts/internal.rb b/source/code/plugin/lib/application_insights/channel/contracts/internal.rb new file mode 100644 index 000000000..6e8f3d300 --- /dev/null +++ b/source/code/plugin/lib/application_insights/channel/contracts/internal.rb @@ -0,0 +1,15 @@ +require_relative 'json_serializable' + +module ApplicationInsights::Channel::Contracts + class Internal + include JsonSerializable + + attr_accessor :sdk_version, :agent_version, :node_name + + attribute_mapping( + sdk_version: 'ai.internal.sdkVersion', + agent_version: 'ai.internal.agentVersion', + node_name: 'ai.internal.nodeName' + ) + end +end diff --git a/source/code/plugin/lib/application_insights/channel/contracts/json_serializable.rb b/source/code/plugin/lib/application_insights/channel/contracts/json_serializable.rb new file mode 100644 index 000000000..8f4677044 --- /dev/null +++ b/source/code/plugin/lib/application_insights/channel/contracts/json_serializable.rb @@ -0,0 +1,59 @@ +require 'json' + +module ApplicationInsights + module Channel + module Contracts + module JsonSerializable + module ClassMethods + attr_reader :json_mappings + + def attribute_mapping(mappings = {}) + @json_mappings = mappings + end + end + + def self.included(klass) + klass.extend JsonSerializable::ClassMethods + end + + def initialize(attributes = {}) + attributes.each { |k, v| send(:"#{k}=", v) } + end + + def to_h + output = {} + klass = self.class + + klass.json_mappings.each do |attr, name| + value = visit 
self.send(attr) + is_empty = value.respond_to?(:empty?) && value.empty? + + output[name] = value unless value.nil? || is_empty + end + + output + end + + def to_json(args = {}) + JSON.generate self.to_h, args + end + + private + + def visit(object) + return if object.nil? + + if object.is_a? Array + object.map { |e| visit e } + elsif object.is_a? Hash + Hash[object.map { |k, v| [k, visit(v)] }] + elsif object.respond_to? :to_h + object.to_h + else + object + end + end + end + end + end +end diff --git a/source/code/plugin/lib/application_insights/channel/contracts/location.rb b/source/code/plugin/lib/application_insights/channel/contracts/location.rb new file mode 100644 index 000000000..4136c869b --- /dev/null +++ b/source/code/plugin/lib/application_insights/channel/contracts/location.rb @@ -0,0 +1,13 @@ +require_relative 'json_serializable' + +module ApplicationInsights::Channel::Contracts + class Location + include JsonSerializable + + attr_accessor :ip + + attribute_mapping( + ip: 'ai.location.ip' + ) + end +end diff --git a/source/code/plugin/lib/application_insights/channel/contracts/message_data.rb b/source/code/plugin/lib/application_insights/channel/contracts/message_data.rb new file mode 100644 index 000000000..1340f5ba7 --- /dev/null +++ b/source/code/plugin/lib/application_insights/channel/contracts/message_data.rb @@ -0,0 +1,24 @@ +require_relative 'json_serializable' + +module ApplicationInsights::Channel::Contracts + class MessageData + include JsonSerializable + + attr_accessor :ver, :message, :severity_level, :properties + + attribute_mapping( + ver: 'ver', + message: 'message', + severity_level: 'severityLevel', + properties: 'properties' + ) + + def ver + @ver ||= 2 + end + + def properties + @properties ||= {} + end + end +end diff --git a/source/code/plugin/lib/application_insights/channel/contracts/metric_data.rb b/source/code/plugin/lib/application_insights/channel/contracts/metric_data.rb new file mode 100644 index 000000000..bcb5739d6 --- 
/dev/null +++ b/source/code/plugin/lib/application_insights/channel/contracts/metric_data.rb @@ -0,0 +1,27 @@ +require_relative 'json_serializable' + +module ApplicationInsights::Channel::Contracts + class MetricData + include JsonSerializable + + attr_accessor :ver, :metrics, :properties + + attribute_mapping( + ver: 'ver', + metrics: 'metrics', + properties: 'properties' + ) + + def ver + @ver ||= 2 + end + + def metrics + @metrics ||= [] + end + + def properties + @properties ||= {} + end + end +end diff --git a/source/code/plugin/lib/application_insights/channel/contracts/operation.rb b/source/code/plugin/lib/application_insights/channel/contracts/operation.rb new file mode 100644 index 000000000..c86dd111b --- /dev/null +++ b/source/code/plugin/lib/application_insights/channel/contracts/operation.rb @@ -0,0 +1,17 @@ +require_relative 'json_serializable' + +module ApplicationInsights::Channel::Contracts + class Operation + include JsonSerializable + + attr_accessor :id, :name, :parent_id, :synthetic_source, :correlation_vector + + attribute_mapping( + id: 'ai.operation.id', + name: 'ai.operation.name', + parent_id: 'ai.operation.parentId', + synthetic_source: 'ai.operation.syntheticSource', + correlation_vector: 'ai.operation.correlationVector' + ) + end +end diff --git a/source/code/plugin/lib/application_insights/channel/contracts/page_view_data.rb b/source/code/plugin/lib/application_insights/channel/contracts/page_view_data.rb new file mode 100644 index 000000000..d17dd2f79 --- /dev/null +++ b/source/code/plugin/lib/application_insights/channel/contracts/page_view_data.rb @@ -0,0 +1,33 @@ +require_relative 'json_serializable' + +module ApplicationInsights::Channel::Contracts + class PageViewData + include JsonSerializable + + attr_accessor :ver, :url, :name, :duration, :id, :referrer_uri, :properties, + :measurements + + attribute_mapping( + ver: 'ver', + url: 'url', + name: 'name', + duration: 'duration', + id: 'id', + referrer_uri: 'referrerUri', + 
properties: 'properties', + measurements: 'measurements' + ) + + def ver + @ver ||= 2 + end + + def properties + @properties ||= {} + end + + def measurements + @measurements ||= {} + end + end +end diff --git a/source/code/plugin/lib/application_insights/channel/contracts/page_view_perf_data.rb b/source/code/plugin/lib/application_insights/channel/contracts/page_view_perf_data.rb new file mode 100644 index 000000000..adde3f3ad --- /dev/null +++ b/source/code/plugin/lib/application_insights/channel/contracts/page_view_perf_data.rb @@ -0,0 +1,39 @@ +require_relative 'json_serializable' + +module ApplicationInsights::Channel::Contracts + class PageViewPerfData + include JsonSerializable + + attr_accessor :ver, :url, :perf_total, :name, :duration, :network_connect, + :sent_request, :received_response, :id, :dom_processing, :referrer_uri, + :properties, :measurements + + attribute_mapping( + ver: 'ver', + url: 'url', + perf_total: 'perfTotal', + name: 'name', + duration: 'duration', + network_connect: 'networkConnect', + sent_request: 'sentRequest', + received_response: 'receivedResponse', + id: 'id', + dom_processing: 'domProcessing', + referrer_uri: 'referrerUri', + properties: 'properties', + measurements: 'measurements' + ) + + def ver + @ver ||= 2 + end + + def properties + @properties ||= {} + end + + def measurements + @measurements ||= {} + end + end +end diff --git a/source/code/plugin/lib/application_insights/channel/contracts/remote_dependency_data.rb b/source/code/plugin/lib/application_insights/channel/contracts/remote_dependency_data.rb new file mode 100644 index 000000000..a238841f6 --- /dev/null +++ b/source/code/plugin/lib/application_insights/channel/contracts/remote_dependency_data.rb @@ -0,0 +1,40 @@ +require_relative 'json_serializable' + +module ApplicationInsights::Channel::Contracts + class RemoteDependencyData + include JsonSerializable + + attr_accessor :ver, :name, :id, :result_code, :duration, :success, :data, + :target, :type, :properties, 
:measurements + + attribute_mapping( + ver: 'ver', + name: 'name', + id: 'id', + result_code: 'resultCode', + duration: 'duration', + success: 'success', + data: 'data', + target: 'target', + type: 'type', + properties: 'properties', + measurements: 'measurements' + ) + + def ver + @ver ||= 2 + end + + def success + @success.nil? ? true : @success + end + + def properties + @properties ||= {} + end + + def measurements + @measurements ||= {} + end + end +end diff --git a/source/code/plugin/lib/application_insights/channel/contracts/reopenings.rb b/source/code/plugin/lib/application_insights/channel/contracts/reopenings.rb new file mode 100644 index 000000000..394bf8afb --- /dev/null +++ b/source/code/plugin/lib/application_insights/channel/contracts/reopenings.rb @@ -0,0 +1,27 @@ +module ApplicationInsights::Channel::Contracts + class ExceptionData + def handled_at + @properties["handledAt"] if @properties + end + + def handled_at=(handled_at) + if handled_at + @properties ||= {} + @properties["handledAt"] = handled_at + end + end + end + + class RequestData + def http_method + @properties["httpMethod"] if @properties + end + + def http_method=(http_method) + if http_method + @properties ||= {} + @properties["httpMethod"] = http_method + end + end + end +end \ No newline at end of file diff --git a/source/code/plugin/lib/application_insights/channel/contracts/request_data.rb b/source/code/plugin/lib/application_insights/channel/contracts/request_data.rb new file mode 100644 index 000000000..af2581c2b --- /dev/null +++ b/source/code/plugin/lib/application_insights/channel/contracts/request_data.rb @@ -0,0 +1,35 @@ +require_relative 'json_serializable' + +module ApplicationInsights::Channel::Contracts + class RequestData + include JsonSerializable + + attr_accessor :ver, :id, :source, :name, :duration, :response_code, :success, + :url, :properties, :measurements + + attribute_mapping( + ver: 'ver', + id: 'id', + source: 'source', + name: 'name', + duration: 
'duration', + response_code: 'responseCode', + success: 'success', + url: 'url', + properties: 'properties', + measurements: 'measurements' + ) + + def ver + @ver ||= 2 + end + + def properties + @properties ||= {} + end + + def measurements + @measurements ||= {} + end + end +end diff --git a/source/code/plugin/lib/application_insights/channel/contracts/session.rb b/source/code/plugin/lib/application_insights/channel/contracts/session.rb new file mode 100644 index 000000000..a761c51c5 --- /dev/null +++ b/source/code/plugin/lib/application_insights/channel/contracts/session.rb @@ -0,0 +1,14 @@ +require_relative 'json_serializable' + +module ApplicationInsights::Channel::Contracts + class Session + include JsonSerializable + + attr_accessor :id, :is_first + + attribute_mapping( + id: 'ai.session.id', + is_first: 'ai.session.isFirst' + ) + end +end diff --git a/source/code/plugin/lib/application_insights/channel/contracts/severity_level.rb b/source/code/plugin/lib/application_insights/channel/contracts/severity_level.rb new file mode 100644 index 000000000..322a00ec3 --- /dev/null +++ b/source/code/plugin/lib/application_insights/channel/contracts/severity_level.rb @@ -0,0 +1,13 @@ +module ApplicationInsights::Channel::Contracts + class SeverityLevel + VERBOSE = 0 + + INFORMATION = 1 + + WARNING = 2 + + ERROR = 3 + + CRITICAL = 4 + end +end diff --git a/source/code/plugin/lib/application_insights/channel/contracts/stack_frame.rb b/source/code/plugin/lib/application_insights/channel/contracts/stack_frame.rb new file mode 100644 index 000000000..b4f4b9844 --- /dev/null +++ b/source/code/plugin/lib/application_insights/channel/contracts/stack_frame.rb @@ -0,0 +1,17 @@ +require_relative 'json_serializable' + +module ApplicationInsights::Channel::Contracts + class StackFrame + include JsonSerializable + + attr_accessor :level, :method, :assembly, :file_name, :line + + attribute_mapping( + level: 'level', + method: 'method', + assembly: 'assembly', + file_name: 'fileName', 
+ line: 'line' + ) + end +end diff --git a/source/code/plugin/lib/application_insights/channel/contracts/user.rb b/source/code/plugin/lib/application_insights/channel/contracts/user.rb new file mode 100644 index 000000000..a7ff8a7cf --- /dev/null +++ b/source/code/plugin/lib/application_insights/channel/contracts/user.rb @@ -0,0 +1,15 @@ +require_relative 'json_serializable' + +module ApplicationInsights::Channel::Contracts + class User + include JsonSerializable + + attr_accessor :account_id, :id, :auth_user_id + + attribute_mapping( + account_id: 'ai.user.accountId', + id: 'ai.user.id', + auth_user_id: 'ai.user.authUserId' + ) + end +end diff --git a/source/code/plugin/lib/application_insights/channel/event.rb b/source/code/plugin/lib/application_insights/channel/event.rb new file mode 100644 index 000000000..ae61064f8 --- /dev/null +++ b/source/code/plugin/lib/application_insights/channel/event.rb @@ -0,0 +1,68 @@ +require_relative 'queue_base' +require 'thread' + +module ApplicationInsights + module Channel + # An event class that allows simple cross-thread signalling. + # + # An object of this type managers an internal flag that can be set to true + # via the {#set} method and reset via the {#clear} method. Calling the + # {#wait} method will block until the flag is set to true. + # + # @example + # require 'application_insights' + # require 'thread' + # event = ApplicationInsights::Channel::Event.new + # Thread.new do + # sleep 1 + # event.set + # end + # puts 'Main screen turn on.' + # result = event.wait + # puts 'All your base are belong to us.' + class Event + # Initializes a new instance of the class. + def initialize + @mutex = Mutex.new + @condition_variable = ConditionVariable.new + @signal = false + end + + # The signal value for this object. Note that the value of this property is + # not synchronized with respect to {#set} and {#clear} meaning that it + # could return false positives or negatives. + # @return [Boolean] the signal value. 
+ attr_reader :signal + + # Sets the internal flag to true. Calling this method will also cause all + # waiting threads to awaken. + def set + @mutex.synchronize do + @signal = true + @condition_variable.broadcast + end + end + + # Sets the internal flag to false. + def clear + @mutex.synchronize do + @signal = false + end + end + + # Calling this method will block until the internal flag is set to true. + # If the flag is set to true before calling this method, we will return + # immediately. If the timeout parameter is specified, the method will + # unblock after the specified number of seconds. + # @param [Fixnum] timeout the timeout for the operation in seconds. + # @return [Boolean] the value of the internal flag on exit. + def wait(timeout=nil) + @mutex.synchronize do + @condition_variable.wait(@mutex, timeout) unless @signal + end + + @signal + end + end + end +end diff --git a/source/code/plugin/lib/application_insights/channel/queue_base.rb b/source/code/plugin/lib/application_insights/channel/queue_base.rb new file mode 100644 index 000000000..91226b17f --- /dev/null +++ b/source/code/plugin/lib/application_insights/channel/queue_base.rb @@ -0,0 +1,73 @@ +require 'thread' + +module ApplicationInsights + module Channel + # The base class for all types of queues for use in conjunction with an + # implementation of {SenderBase}. The queue will notify the sender that it + # needs to pick up items when it reaches {#max_queue_length}, or when the + # consumer calls {#flush}. + class QueueBase + # Initializes a new instance of the class. + # @param [SenderBase] sender the sender object that will be used in + # conjunction with this queue. + def initialize(sender) + @queue = Queue.new + @max_queue_length = 500 + self.sender = sender + end + + # The maximum number of items that will be held by the queue before the + # queue will call the {#flush} method. + # @return [Fixnum] the maximum queue size. 
(defaults to: 500) + attr_accessor :max_queue_length + + # The sender that is associated with this queue that this queue will use to + # send data to the service. + # @return [SenderBase] the sender object. + attr_reader :sender + + # Change the sender that is associated with this queue. + # @param [SenderBase] sender the sender object. + # @return [SenderBase] the sender object. + def sender=(sender) + @sender = sender + @sender.queue = self if sender + @sender + end + + # Adds the passed in item object to the queue and calls {#flush} if the + # size of the queue is larger than {#max_queue_length}. This method does + # nothing if the passed in item is nil. + # @param [Contracts::Envelope] item the telemetry envelope object to send + # to the service. + def push(item) + return unless item + + @queue.push(item) + + flush if @queue.length >= @max_queue_length + end + + # Pops a single item from the queue and returns it. If the queue is empty, + # this method will return nil. + # @return [Contracts::Envelope] a telemetry envelope object or nil if the + # queue is empty. + def pop + return @queue.pop(true) + rescue ThreadError + return nil + end + + # Flushes the current queue by notifying the {#sender}. This method needs + # to be overridden by a concrete implementations of the queue class. + def flush + end + + # Indicates whether the queue is empty. + # @return [Boolean] true if the queue is empty + def empty? + @queue.empty? 
+ end + end + end +end diff --git a/source/code/plugin/lib/application_insights/channel/sender_base.rb b/source/code/plugin/lib/application_insights/channel/sender_base.rb new file mode 100644 index 000000000..2431bf748 --- /dev/null +++ b/source/code/plugin/lib/application_insights/channel/sender_base.rb @@ -0,0 +1,88 @@ +require 'json' +require 'net/http' +require 'openssl' +require 'stringio' +require 'zlib' +require 'logger' + +module ApplicationInsights + module Channel + # The base class for all types of senders for use in conjunction with an + # implementation of {QueueBase}. The queue will notify the sender that it + # needs to pick up items. The concrete sender implementation will listen to + # these notifications and will pull items from the queue using + # {QueueBase#pop} getting at most {#send_buffer_size} items. + # It will then call {#send} using the list of items pulled from the queue. + class SenderBase + # Initializes a new instance of the class. + # @param [String] service_endpoint_uri the address of the service to send + # telemetry data to. + def initialize(service_endpoint_uri) + @service_endpoint_uri = service_endpoint_uri + @queue = nil + @send_buffer_size = 100 + @logger = Logger.new(STDOUT) + end + + # The service endpoint URI where this sender will send data to. + # @return [String] the service endpoint URI. + attr_accessor :service_endpoint_uri + + # The queue that this sender is draining. While {SenderBase} doesn't + # implement any means of doing so, derivations of this class do. + # @return [QueueBase] the queue instance that this sender is draining. + attr_accessor :queue + + # The buffer size for a single batch of telemetry. This is the maximum number + # of items in a single service request that this sender is going to send. + # @return [Fixnum] the maximum number of items in a telemetry batch. + attr_accessor :send_buffer_size + + # The logger for the sender. 
+ attr_accessor :logger + + # Immediately sends the data passed in to {#service_endpoint_uri}. If the + # service request fails, the passed in items are pushed back to the {#queue}. + # @param [Array] data_to_send an array of + # {Contracts::Envelope} objects to send to the service. + def send(data_to_send) + uri = URI(@service_endpoint_uri) + headers = { + 'Accept' => 'application/json', + 'Content-Type' => 'application/json; charset=utf-8', + 'Content-Encoding' => 'gzip' + } + request = Net::HTTP::Post.new(uri.path, headers) + + # Use JSON.generate instead of to_json, otherwise it will + # default to ActiveSupport::JSON.encode for Rails app + json = JSON.generate(data_to_send) + compressed_data = compress(json) + request.body = compressed_data + + http = Net::HTTP.new uri.hostname, uri.port + if uri.scheme.downcase == 'https' + http.use_ssl = true + http.verify_mode = OpenSSL::SSL::VERIFY_NONE + end + + response = http.request(request) + http.finish if http.started? + + if !response.kind_of? Net::HTTPSuccess + @logger.warn('application_insights') { "Failed to send data: #{response.message}" } + end + end + + private + + def compress(string) + wio = StringIO.new("w") + w_gz = Zlib::GzipWriter.new wio, nil, nil + w_gz.write(string) + w_gz.close + wio.string + end + end + end +end diff --git a/source/code/plugin/lib/application_insights/channel/synchronous_queue.rb b/source/code/plugin/lib/application_insights/channel/synchronous_queue.rb new file mode 100644 index 000000000..13c2281ac --- /dev/null +++ b/source/code/plugin/lib/application_insights/channel/synchronous_queue.rb @@ -0,0 +1,45 @@ +require_relative 'queue_base' + +module ApplicationInsights + module Channel + # A synchronous queue for use in conjunction with the {SynchronousSender}. + # The queue will call {SenderBase#send} when it reaches {#max_queue_length}, + # or when the consumer calls {#flush}. 
+ # + # @example + # require 'application_insights' + # require 'thread' + # queue = ApplicationInsights::Channel::SynchronousQueue.new nil + # queue.max_queue_length = 1 + # queue.push 1 + class SynchronousQueue < QueueBase + # Initializes a new instance of the class. + # @param [SenderBase] sender the sender object that will be used in + # conjunction with this queue. + def initialize(sender) + super sender + end + + # Flushes the current queue by by calling {#sender}'s + # {SenderBase#send} method. + def flush + local_sender = @sender + return unless local_sender + + while true + # get at most send_buffer_size items and send them + data = [] + while data.length < local_sender.send_buffer_size + item = pop() + break if not item + data.push item + end + + break if data.length == 0 + + local_sender.send(data) + end + end + end + end +end diff --git a/source/code/plugin/lib/application_insights/channel/synchronous_sender.rb b/source/code/plugin/lib/application_insights/channel/synchronous_sender.rb new file mode 100644 index 000000000..ade2f086c --- /dev/null +++ b/source/code/plugin/lib/application_insights/channel/synchronous_sender.rb @@ -0,0 +1,17 @@ +require_relative 'sender_base' + +module ApplicationInsights + module Channel + # A synchronous sender that works in conjunction with the {SynchronousQueue}. + # The queue will call {#send} on the current instance with the data to send. + class SynchronousSender < SenderBase + SERVICE_ENDPOINT_URI = 'https://dc.services.visualstudio.com/v2/track' + # Initializes a new instance of the class. + # @param [String] service_endpoint_uri the address of the service to send + # telemetry data to. 
+ def initialize(service_endpoint_uri = SERVICE_ENDPOINT_URI) + super service_endpoint_uri + end + end + end +end diff --git a/source/code/plugin/lib/application_insights/channel/telemetry_channel.rb b/source/code/plugin/lib/application_insights/channel/telemetry_channel.rb new file mode 100644 index 000000000..e026ebf7d --- /dev/null +++ b/source/code/plugin/lib/application_insights/channel/telemetry_channel.rb @@ -0,0 +1,131 @@ +require 'time' +require_relative 'asynchronous_queue' +require_relative 'asynchronous_sender' +require_relative 'telemetry_context' +require_relative 'synchronous_queue' +require_relative 'synchronous_sender' +require_relative 'contracts/envelope' +require_relative 'contracts/data' +require_relative 'contracts/internal' +require_relative '../../application_insights/version' + +module ApplicationInsights + module Channel + # The telemetry channel is responsible for constructing a + # {Contracts::Envelope} object from the passed in data and specified + # telemetry context. + # + # @example + # require 'application_insights' + # channel = ApplicationInsights::Channel::TelemetryChannel.new + # event = ApplicationInsights::Channel::Contracts::EventData.new name: 'My event' + # channel.write event + class TelemetryChannel + # Initializes a new instance of the class. + # @param [TelemetryContext] context the telemetry context to use when + # sending telemetry data. + # @param [QueueBase] queue the queue to enqueue the resulting + # {Contracts::Envelope} to. + def initialize(context=nil, queue=nil) + @context = context || TelemetryContext.new + @queue = queue || SynchronousQueue.new(SynchronousSender.new) + end + + # The context associated with this channel. All {Contracts::Envelope} + # objects created by this channel will use this value if it's present or if + # none is specified as part of the {#write} call. 
+ # @return [TelemetryContext] the context instance + # (defaults to: TelemetryContext.new) + attr_reader :context + + # The queue associated with this channel. All {Contracts::Envelope} objects + # created by this channel will be pushed to this queue. + # @return [QueueBase] the queue instance (defaults to: SynchronousQueue.new) + attr_reader :queue + + # The sender associated with this channel. This instance will be used to + # transmit telemetry to the service. + # @return [SenderBase] the sender instance (defaults to: SynchronousSender.new) + def sender + @queue.sender + end + + # Flushes the enqueued data by calling {QueueBase#flush}. + def flush + @queue.flush + end + + # Enqueues the passed in data to the {#queue}. If the caller specifies a + # context as well, it will take precedence over the instance in {#context}. + # @param [Object] data the telemetry data to send. This will be wrapped in + # an {Contracts::Envelope} before being enqueued to the {#queue}. + # @param [TelemetryContext] context the override context to use when + # constructing the {Contracts::Envelope}. + # @param [Time|String] time the timestamp of the telemetry used to construct the + # {Contracts::Envelope}. + def write(data, context=nil, time=nil) + local_context = context || @context + raise ArgumentError, 'Context was required but not provided' unless local_context + + if time && time.is_a?(String) + local_time = time + elsif time && time.is_a?(Time) + local_time = time.iso8601(7) + else + local_time = Time.now.iso8601(7) + end + + data_type = data.class.name.gsub(/^.*::/, '') + set_properties data, local_context + data_attributes = { + :base_type => data_type, + :base_data => data + } + envelope_attributes = { + :name => 'Microsoft.ApplicationInsights.' 
+ data_type[0..-5], + :time => local_time, + :i_key => local_context.instrumentation_key, + :tags => get_tags(local_context), + :data => Contracts::Data.new(data_attributes) + } + envelope = Contracts::Envelope.new envelope_attributes + @queue.push(envelope) + end + + private + + def get_tags(context) + hash = {} + internal_context_attributes = { + :sdk_version => 'rb:' + ApplicationInsights::VERSION + } + internal_context = Contracts::Internal.new internal_context_attributes + + [internal_context, + context.application, + context.cloud, + context.device, + context.user, + context.session, + context.location, + context.operation].each { |c| hash.merge!(c.to_h) if c } + + hash.delete_if { |k, v| v.nil? } + + hash + end + + def set_properties(data, context) + if context.properties + properties = data.properties || {} + context.properties.each do |key, value| + unless properties.key?(key) + properties[key] = value + end + end + data.properties = properties + end + end + end + end +end diff --git a/source/code/plugin/lib/application_insights/channel/telemetry_context.rb b/source/code/plugin/lib/application_insights/channel/telemetry_context.rb new file mode 100644 index 000000000..bb24af24e --- /dev/null +++ b/source/code/plugin/lib/application_insights/channel/telemetry_context.rb @@ -0,0 +1,85 @@ +require_relative 'contracts/application' +require_relative 'contracts/cloud' +require_relative 'contracts/device' +require_relative 'contracts/user' +require_relative 'contracts/session' +require_relative 'contracts/operation' +require_relative 'contracts/location' + +module ApplicationInsights + module Channel + # Represents the context for sending telemetry to the + # Application Insights service. 
+ # + # @example + # require 'application_insights' + # context = ApplicationInsights::Channel::TelemetryContext.new + # context.instrumentation_key = '' + # context.application.id = 'My application' + # context.application.ver = '1.2.3' + # context.device.id = 'My current device' + # context.device.oem_name = 'Asus' + # context.device.model = 'X31A' + # context.device.type = "Other" + # context.user.id = 'santa@northpole.net' + class TelemetryContext + # Initializes a new instance of the class. + def initialize + @instrumentation_key = nil + @application = Contracts::Application.new + @cloud = Contracts::Cloud.new + @device = Contracts::Device.new + @user = Contracts::User.new + @session = Contracts::Session.new + @operation = Contracts::Operation.new + @location = Contracts::Location.new + @properties = {} + end + + # The instrumentation key that is used to identify which + # Application Insights application this data is for. + # @return [String] the instrumentation key. + attr_accessor :instrumentation_key + + # The application context. This contains properties of the + # application you are running. + # @return [Contracts::Application] the context object. + attr_accessor :application + + # The cloud context. This contains properties of the + # cloud role you are generating telemetry for. + # @return [Contracts::Cloud] the context object. + attr_accessor :cloud + + # The device context. This contains properties of the + # device you are running on. + # @return [Contracts::Device] the context object. + attr_accessor :device + + # The user context. This contains properties of the + # user you are generating telemetry for. + # @return [Contracts::User] the context object. + attr_accessor :user + + # The session context. This contains properties of the + # session you are generating telemetry for. + # @return [Contracts::Session] the context object. + attr_accessor :session + + # The operation context. 
This contains properties of the + # operation you are generating telemetry for. + # @return [Contracts::Operation] the context object. + attr_accessor :operation + + # The location context. This contains properties of the + # location you are generating telemetry from. + # @return [Contracts::Location] the context object. + attr_accessor :location + + # The property context. This contains free-form properties + # that you can add to your telemetry. + # @return [Hash] the context object. + attr_accessor :properties + end + end +end diff --git a/source/code/plugin/lib/application_insights/rack/track_request.rb b/source/code/plugin/lib/application_insights/rack/track_request.rb new file mode 100644 index 000000000..62c2b0844 --- /dev/null +++ b/source/code/plugin/lib/application_insights/rack/track_request.rb @@ -0,0 +1,154 @@ +require 'rack' +require 'securerandom' +require_relative '../channel/contracts/request_data' +require_relative '../telemetry_client' + +module ApplicationInsights + module Rack + # Track every request and sends the request data to Application Insights. + class TrackRequest + # Initializes a new instance of the class. + # @param [Object] app the inner rack application. + # @param [String] instrumentation_key to identify which Application Insights + # application this data is for. + # @param [Fixnum] buffer_size the buffer size and the buffered requests would + # send to Application Insights when buffer is full. + # @param [Fixnum] send_interval the frequency (in seconds) to check buffer + # and send buffered requests to Application Insights if any. 
+ def initialize(app, instrumentation_key, buffer_size = 500, send_interval = 60) + @app = app + @instrumentation_key = instrumentation_key + @buffer_size = buffer_size + @send_interval = send_interval + + @sender = Channel::AsynchronousSender.new + @sender.send_interval = @send_interval + queue = Channel::AsynchronousQueue.new @sender + queue.max_queue_length = @buffer_size + @channel = Channel::TelemetryChannel.new nil, queue + + @client = TelemetryClient.new @instrumentation_key, @channel + end + + # Track requests and send data to Application Insights asynchronously. + # @param [Hash] env the rack environment. + def call(env) + # Build a request ID, incorporating one from our request if one exists. + request_id = request_id_header(env['HTTP_REQUEST_ID']) + env['ApplicationInsights.request.id'] = request_id + + start = Time.now + begin + status, headers, response = @app.call(env) + rescue Exception => ex + status = 500 + exception = ex + end + stop = Time.now + + start_time = start.iso8601(7) + duration = format_request_duration(stop - start) + success = status.to_i < 400 + + request = ::Rack::Request.new env + options = options_hash(request) + + data = request_data(request_id, start_time, duration, status, success, options) + context = telemetry_context(request_id, env['HTTP_REQUEST_ID']) + + @client.channel.write data, context, start_time + + if exception + @client.track_exception exception, handled_at: 'Unhandled' + raise exception + end + + [status, headers, response] + end + + private + + def sender=(sender) + if sender.is_a? Channel::AsynchronousSender + @sender = sender + @client.channel.queue.sender = @sender + end + end + + def client + @client + end + + def format_request_duration(duration_seconds) + if duration_seconds >= 86400 + # just return 1 day when it takes more than 1 day which should not happen for requests. 
+ return "%02d.%02d:%02d:%02d.%07d" % [1, 0, 0, 0, 0] + end + + Time.at(duration_seconds).gmtime.strftime("00.%H:%M:%S.%7N") + end + + def request_id_header(request_id) + valid_request_id_header = valid_request_id(request_id) + + length = valid_request_id_header ? 5 : 10 + id = SecureRandom.base64(length) + + if valid_request_id_header + request_id_has_end = %w[. _].include?(request_id[-1]) + request_id << '.' unless request_id_has_end + + return "#{request_id}#{id}_" + end + + "|#{id}." + end + + def valid_request_id(request_id) + request_id && request_id[0] == '|' + end + + def operation_id(id) + # Returns the root ID from the '|' to the first '.' if any. + root_start = id[0] == '|' ? 1 : 0 + + root_end = id.index('.') + root_end = root_end ? root_end - 1 : id.length - root_start + + id[root_start..root_end] + end + + def options_hash(request) + { + name: "#{request.request_method} #{request.path}", + http_method: request.request_method, + url: request.url + } + end + + def request_data(request_id, start_time, duration, status, success, options) + Channel::Contracts::RequestData.new( + :id => request_id || 'Null', + :duration => duration || '0:00:00:00.0000000', + :response_code => status || 200, + :success => success == nil ? 
true : success, + :name => options[:name], + :url => options[:url], + :properties => options[:properties] || {}, + :measurements => options[:measurements] || {}, + # Must initialize http_method after properties because it's actually stored in properties + :http_method => options[:http_method] + ) + end + + def telemetry_context(request_id, request_id_header) + context = Channel::TelemetryContext.new + context.instrumentation_key = @instrumentation_key + context.operation.id = operation_id(request_id) + context.operation.parent_id = request_id_header + + context + end + end + end +end diff --git a/source/code/plugin/lib/application_insights/telemetry_client.rb b/source/code/plugin/lib/application_insights/telemetry_client.rb new file mode 100644 index 000000000..bd066ae70 --- /dev/null +++ b/source/code/plugin/lib/application_insights/telemetry_client.rb @@ -0,0 +1,232 @@ +require_relative 'channel/telemetry_context' +require_relative 'channel/telemetry_channel' +require_relative 'channel/contracts/page_view_data' +require_relative 'channel/contracts/remote_dependency_data' +require_relative 'channel/contracts/exception_data' +require_relative 'channel/contracts/exception_details' +require_relative 'channel/contracts/event_data' +require_relative 'channel/contracts/data_point' +require_relative 'channel/contracts/data_point_type' +require_relative 'channel/contracts/metric_data' +require_relative 'channel/contracts/message_data' +require_relative 'channel/contracts/stack_frame' +require_relative 'channel/contracts/request_data' +require_relative 'channel/contracts/severity_level' +require_relative 'channel/contracts/reopenings' + +module ApplicationInsights + # The telemetry client used for sending all types of telemetry. It serves as + # the main entry point for interacting with the Application Insights service. + class TelemetryClient + # Initializes a new instance of the class. 
+ # @param [String] instrumentation_key to identify which Application Insights + # application this data is for. + # @param [Channel::TelemetryChannel] telemetry_channel the optional telemetry + # channel to be used instead of constructing a default one. + def initialize(instrumentation_key = nil, telemetry_channel = nil) + @context = Channel::TelemetryContext.new + @context.instrumentation_key = instrumentation_key + @channel = telemetry_channel || Channel::TelemetryChannel.new + end + + # The context associated with this client. All data objects created by this + # client will be accompanied by this value. + # @return [Channel::TelemetryContext] the context instance. + attr_reader :context + + # The channel associated with this telemetry client. All data created by this + # client will be passed along with the {#context} object to + # {Channel::TelemetryChannel#write} + # @return [Channel::TelemetryChannel] the channel instance. + attr_reader :channel + + # Send information about the page viewed in the application (a web page for + # instance). + # @param [String] name the name of the page that was viewed. + # @param [String] url the URL of the page that was viewed. + # @param [Hash] options the options to create the + # {Channel::Contracts::PageViewData} object. + # @option options [Fixnum] :duration the duration of the page view in + # milliseconds. (defaults to: 0) + # @option options [Hash] :properties the set of custom properties the client + # wants attached to this data item. 
(defaults to: {}) + # @option options [Hash] :measurements the set of custom measurements the + # client wants to attach to this data item (defaults to: {}) + def track_page_view(name, url, options={}) + data_attributes = { + :name => name || 'Null', + :url => url, + :duration => options[:duration], + :properties => options[:properties] || {}, + :measurements => options[:measurements] || {} + } + data = Channel::Contracts::PageViewData.new data_attributes + self.channel.write(data, self.context) + end + + # Send information about a single exception that occurred in the application. + # @param [Exception] exception the exception that the client wants to send. + # @param [Hash] options the options to create the + # {Channel::Contracts::ExceptionData} object. + # @option options [String] :handled_at the type of exception + # (defaults to: 'UserCode') + # @option options [Hash] :properties the set of custom properties the client + # wants attached to this data item. (defaults to: {}) + # @option options [Hash] :measurements the set of custom measurements the + # client wants to attach to this data item (defaults to: {}) + def track_exception(exception, options={}) + return unless exception.is_a? 
Exception + + parsed_stack = [] + if exception.backtrace + frame_pattern = /^(?.*):(?\d+)(\.|:in `((?.*)'$))/ + + exception.backtrace.each_with_index do |frame, counter| + match = frame_pattern.match frame + stack_frame = Channel::Contracts::StackFrame.new( + :assembly => 'Unknown', + :file_name => match['file'], + :level => counter, + :line => match['line'], + :method => match['method'] + ) + + parsed_stack << stack_frame + end + end + + details = Channel::Contracts::ExceptionDetails.new( + :id => 1, + :outer_id => 0, + :type_name => exception.class.name, + :message => exception.message, + :has_full_stack => exception.backtrace != nil, + :stack => (exception.backtrace.join("\n") if exception.backtrace), + :parsed_stack => parsed_stack + ) + + data = Channel::Contracts::ExceptionData.new( + :exceptions => [details], + :properties => options[:properties] || {}, + :measurements => options[:measurements] || {}, + # Must initialize handled_at after properties because it's actually stored in properties + :handled_at => options.fetch(:handled_at, 'UserCode') + ) + + self.channel.write(data, self.context) + end + + # Send information about a single event that has occurred in the context of + # the application. + # @param [String] name the data to associate to this event. + # @param [Hash] options the options to create the + # {Channel::Contracts::EventData} object. + # @option options [Hash] :properties the set of custom properties the client + # wants attached to this data item. 
(defaults to: {}) + # @option options [Hash] :measurements the set of custom measurements the + # client wants to attach to this data item (defaults to: {}) + def track_event(name, options={}) + data = Channel::Contracts::EventData.new( + :name => name || 'Null', + :properties => options[:properties] || {}, + :measurements => options[:measurements] || {} + ) + + self.channel.write(data, self.context) + end + + # Send information about a single metric data point that was captured for + # the application. + # @param [String] name the name of the metric that was captured. + # @param [Fixnum] value the value of the metric that was captured. + # @param [Hash] options the options to create the + # {Channel::Contracts::MetricData} object. + # @option options [Channel::Contracts::DataPointType] :type the type of the + # metric (defaults to: {Channel::Contracts::DataPointType::AGGREGATION}) + # @option options [Fixnum] :count the number of metrics that were aggregated + # into this data point (defaults to: 0) + # @option options [Fixnum] :min the minimum of all metrics collected that + # were aggregated into this data point (defaults to: 0) + # @option options [Fixnum] :max the maximum of all metrics collected that + # were aggregated into this data point (defaults to: 0) + # @option options [Fixnum] :std_dev the standard deviation of all metrics + # collected that were aggregated into this data point (defaults to: 0) + # @option options [Hash] :properties the set of custom properties the client + # wants attached to this data item. 
(defaults to: {}) + # @option options [Hash] :measurements the set of custom measurements the + # client wants to attach to this data item (defaults to: {}) + def track_metric(name, value, options={}) + data_point = Channel::Contracts::DataPoint.new( + :name => name || 'Null', + :value => value || 0, + :kind => options[:type] || Channel::Contracts::DataPointType::AGGREGATION, + :count => options[:count], + :min => options[:min], + :max => options[:max], + :std_dev => options[:std_dev] + ) + + data = Channel::Contracts::MetricData.new( + :metrics => [data_point], + :properties => options[:properties] || {} + ) + + self.channel.write(data, self.context) + end + + # Sends a single trace statement. + # @param [String] name the trace statement. + # @param [Channel::Contracts::SeverityLevel] severity_level the severity level. + # @param [Hash] options the options to create the + # {Channel::Contracts::EventData} object. + # @option options [Hash] :properties the set of custom properties the client + # wants attached to this data item. (defaults to: {}) + def track_trace(name, severity_level = nil, options={}) + data = Channel::Contracts::MessageData.new( + :message => name || 'Null', + :severity_level => severity_level || Channel::Contracts::SeverityLevel::INFORMATION, + :properties => options[:properties] || {} + ) + + self.channel.write(data, self.context) + end + + # Sends a single request. + # @param [String] id the unique identifier of the request. + # @param (String) start_time the start time of the request. + # @param [String] duration the duration to process the request. + # @param [String] response_code the response code of the request. + # @param [Boolean] success indicates whether the request succeeds or not. + # @param [Hash] options the options to create the + # {Channel::Contracts::RequestData} object. + # @option options [String] :name the name of the request. + # @option options [String] :http_method the http method used for the request. 
+ # @option options [String] :url the url of the request. + # @option options [Hash] :properties the set of custom properties the client + # wants attached to this data item. (defaults to: {}) + # @option options [Hash] :measurements the set of custom measurements the + # client wants to attach to this data item (defaults to: {}) + def track_request(id, start_time, duration, response_code, success, options={}) + data = Channel::Contracts::RequestData.new( + :id => id || 'Null', + :duration => duration || '0:00:00:00.0000000', + :response_code => response_code || 200, + :success => success = nil ? true : success, + :name => options[:name], + :url => options[:url], + :properties => options[:properties] || {}, + :measurements => options[:measurements] || {}, + # Must initialize http_method after properties because it's actually stored in properties + :http_method => options[:http_method] + ) + + self.channel.write(data, self.context, start_time) + end + + # Flushes data in the queue. Data in the queue will be sent either immediately + # irrespective of what sender is being used. + def flush + self.channel.flush + end + end +end diff --git a/source/code/plugin/lib/application_insights/unhandled_exception.rb b/source/code/plugin/lib/application_insights/unhandled_exception.rb new file mode 100644 index 000000000..aa87b6f85 --- /dev/null +++ b/source/code/plugin/lib/application_insights/unhandled_exception.rb @@ -0,0 +1,49 @@ +require_relative 'telemetry_client' +require_relative 'channel/telemetry_channel' +require_relative 'channel/synchronous_queue' +require_relative 'channel/synchronous_sender' + +include ApplicationInsights + +module ApplicationInsights + module UnhandledException + @sender = nil + + # Auto collects unhandled exception and send to the Application Insights service. + # @param (string) instrumentation_key used to identify which Application + # Insights application this data is for. 
+ # @example + # require 'application_insights' + # ApplicationInsights::UnhandledException.collect('') + # raise Exception, 'Boom!' + def self.collect(instrumentation_key) + at_exit do + # Avoid sending exception more than once if this method got invoked multiple times + send(instrumentation_key) unless @sender + end + end + + # @api private + # Send the last raised exception to the Application Insights service if + # telemetry_sender is not customized. + # @param (string) instrumentation_key used to identify which Application + # Insights application this data is for. + # @param (SenderBase) telemetry_sender used to send the last raised exception. + def self.send(instrumentation_key, telemetry_sender = nil) + if $! && !$!.is_a?(SystemExit) && !$!.is_a?(SignalException) + if telemetry_sender + @sender = telemetry_sender + elsif !@sender + # Use a synchronized sender to guarantee the data would be sent out once flush + @sender = Channel::SynchronousSender.new + end + + queue = Channel::SynchronousQueue.new @sender + channel = Channel::TelemetryChannel.new nil, queue + client = TelemetryClient.new instrumentation_key, channel + client.track_exception($!, handled_at: 'Unhandled') + client.flush + end + end + end +end diff --git a/source/code/plugin/lib/application_insights/version.rb b/source/code/plugin/lib/application_insights/version.rb new file mode 100644 index 000000000..d2d56e833 --- /dev/null +++ b/source/code/plugin/lib/application_insights/version.rb @@ -0,0 +1,3 @@ +module ApplicationInsights + VERSION = '0.5.7'.freeze +end From 6698fcd365328f31b7cbda6fec205cec1ef7933c Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Wed, 7 Nov 2018 16:21:53 -0800 Subject: [PATCH 030/160] Fix Telemetry Bug -- Initialize Telemetry Client after Initializing all required properties (#162) --- source/code/go/src/plugins/oms.go | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/source/code/go/src/plugins/oms.go 
b/source/code/go/src/plugins/oms.go index e0abaea1f..51a2bd47e 100644 --- a/source/code/go/src/plugins/oms.go +++ b/source/code/go/src/plugins/oms.go @@ -334,13 +334,6 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { ImageIDMap = make(map[string]string) NameIDMap = make(map[string]string) - ret, err := InitializeTelemetryClient(agentVersion) - if ret != 0 || err != nil { - message := fmt.Sprintf("Error During Telemetry Initialization :%s", err.Error()) - fmt.Printf(message) - Log(message) - } - pluginConfig, err := ReadConfiguration(pluginConfPath) if err != nil { message := fmt.Sprintf("Error Reading plugin config path : %s \n", err.Error()) @@ -398,6 +391,13 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { Computer = strings.TrimSuffix(ToString(containerHostName), "\n") Log("Computer == %s \n", Computer) + ret, err := InitializeTelemetryClient(agentVersion) + if ret != 0 || err != nil { + message := fmt.Sprintf("Error During Telemetry Initialization :%s", err.Error()) + fmt.Printf(message) + Log(message) + } + // Initialize KubeAPI Client config, err := rest.InClusterConfig() if err != nil { From ad6bb933f64c7d32c3eb779d031327c76e12d2e4 Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Mon, 12 Nov 2018 11:45:57 -0800 Subject: [PATCH 031/160] Fix kube events memory leak due to yaml serialization for > 5k events (#163) --- source/code/plugin/in_kube_events.rb | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/source/code/plugin/in_kube_events.rb b/source/code/plugin/in_kube_events.rb index 6a6ae9296..5df31df95 100644 --- a/source/code/plugin/in_kube_events.rb +++ b/source/code/plugin/in_kube_events.rb @@ -10,7 +10,6 @@ class Kube_Event_Input < Input def initialize super - require 'yaml' require 'json' require_relative 'KubernetesApiClient' @@ -62,6 +61,7 @@ def enumerate(eventList = nil) eventStream = MultiEventStream.new events['items'].each do |items| record = {} + # - Not sure if 
ingestion has the below mapping for this custom type. Fix it as part of fixed type conversion record['CollectionTime'] = batchTime #This is the time that is mapped to become TimeGenerated eventId = items['metadata']['uid'] + "/" + items['count'].to_s newEventQueryState.push(eventId) @@ -86,7 +86,7 @@ def enumerate(eventList = nil) end record['ClusterName'] = KubernetesApiClient.getClusterName record['ClusterId'] = KubernetesApiClient.getClusterId - eventStream.add(emitTime, record) if record + eventStream.add(emitTime, record) if record end router.emit_stream(@tag, eventStream) if eventStream end @@ -121,7 +121,10 @@ def getEventQueryState eventQueryState = [] begin if File.file?(@@KubeEventsStateFile) - eventQueryState = YAML.load_file(@@KubeEventsStateFile, []) + # Do not read the entire file in one shot as it spikes memory (50+MB) for ~5k events + File.foreach(@@KubeEventsStateFile) do |line| + eventQueryState.push(line.chomp) #puts will append newline which needs to be removed + end end rescue => errorStr $log.warn $log.warn line.dump, error: errorStr.to_s @@ -132,7 +135,12 @@ def getEventQueryState def writeEventQueryState(eventQueryState) begin - File.write(@@KubeEventsStateFile, eventQueryState.to_yaml) + if(!eventQueryState.nil? && !eventQueryState.empty?) 
+ # No need to close file handle (f) due to block scope + File.open(@@KubeEventsStateFile, "w") do |f| + f.puts(eventQueryState) + end + end rescue => errorStr $log.warn $log.warn line.dump, error: errorStr.to_s $log.debug_backtrace(errorStr.backtrace) From eff92df54914482b91604b90622fd9fdf2d917eb Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Wed, 14 Nov 2018 15:48:23 -0800 Subject: [PATCH 032/160] Setting Timeout for HTTP Client in PostDataHelper in outoms go plugin(#164) --- source/code/go/src/plugins/utils.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/source/code/go/src/plugins/utils.go b/source/code/go/src/plugins/utils.go index 91e433a0f..85af80d7a 100644 --- a/source/code/go/src/plugins/utils.go +++ b/source/code/go/src/plugins/utils.go @@ -70,7 +70,10 @@ func CreateHTTPClient() { tlsConfig.BuildNameToCertificate() transport := &http.Transport{TLSClientConfig: tlsConfig} - HTTPClient = http.Client{Transport: transport} + HTTPClient = http.Client{ + Transport: transport, + Timeout: 30 * time.Second, + } Log("Successfully created HTTP Client") } From 9893e36d3aeb6a05259a45d449ad2b04453418ea Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Thu, 15 Nov 2018 17:01:18 -0800 Subject: [PATCH 033/160] Vishwa/perftelemetry 2 (#165) * add cpu usage telemetry for ds & rs * add cpu & memory usage telemetry for ds & rs --- .../code/plugin/ApplicationInsightsUtility.rb | 32 ++++++++++++ .../code/plugin/CAdvisorMetricsAPIClient.rb | 51 +++++++++++++++++++ 2 files changed, 83 insertions(+) diff --git a/source/code/plugin/ApplicationInsightsUtility.rb b/source/code/plugin/ApplicationInsightsUtility.rb index 14fc9f2f8..78553a83f 100644 --- a/source/code/plugin/ApplicationInsightsUtility.rb +++ b/source/code/plugin/ApplicationInsightsUtility.rb @@ -5,6 +5,7 @@ class ApplicationInsightsUtility require_relative 'lib/application_insights' require_relative 'omslog' require_relative 'DockerApiClient' + require_relative 'oms_common' require 'json' 
require 'base64' @@ -20,6 +21,7 @@ class ApplicationInsightsUtility @@EnvApplicationInsightsKey = 'APPLICATIONINSIGHTS_AUTH' @@CustomProperties = {} @@Tc = nil + @@hostName = (OMS::Common.get_hostname) def initialize end @@ -124,6 +126,36 @@ def sendTelemetry(pluginName, properties) end end + #Method to send metric. It will merge passed-in properties with common custom properties + def sendMetricTelemetry(metricName, metricValue, properties) + begin + if (metricName.empty? || metricName.nil?) + $log.warn("SendMetricTelemetry: metricName is missing") + return + end + if @@CustomProperties.empty? || @@CustomProperties.nil? + initializeUtility + end + telemetryProps = {} + telemetryProps["Computer"] = @@hostName + # add common dimensions + @@CustomProperties.each{ |k,v| telemetryProps[k]=v} + # add passed-in dimensions if any + if (!properties.nil? && !properties.empty?) + properties.each{ |k,v| telemetryProps[k]=v} + end + if !(@@Tc.nil?) + @@Tc.track_metric metricName, metricValue, + :kind => ApplicationInsights::Channel::Contracts::DataPointType::MEASUREMENT, + :properties => telemetryProps + @@Tc.flush + $log.info("AppInsights metric Telemetry #{metricName} sent successfully") + end + rescue => errorStr + $log.warn("Exception in AppInsightsUtility: sendMetricTelemetry - error: #{errorStr}") + end + end + def getWorkspaceId() begin adminConf = {} diff --git a/source/code/plugin/CAdvisorMetricsAPIClient.rb b/source/code/plugin/CAdvisorMetricsAPIClient.rb index c10cbad4a..9e47e5a9e 100644 --- a/source/code/plugin/CAdvisorMetricsAPIClient.rb +++ b/source/code/plugin/CAdvisorMetricsAPIClient.rb @@ -12,6 +12,7 @@ class CAdvisorMetricsAPIClient require_relative 'oms_common' require_relative 'KubernetesApiClient' + require_relative 'ApplicationInsightsUtility' @LogPath = "/var/opt/microsoft/docker-cimprov/log/kubernetes_perf_log.txt" @Log = Logger.new(@LogPath, 2, 10*1048576) #keep last 2 files, max log file size = 10M @@ -19,6 +20,8 @@ class CAdvisorMetricsAPIClient 
@@rxBytesTimeLast = nil @@txBytesLast = nil @@txBytesTimeLast = nil + @@telemetryCpuMetricTimeTracker = DateTime.now.to_time.to_i + @@telemetryMemoryMetricTimeTracker = DateTime.now.to_time.to_i def initialize end @@ -97,10 +100,15 @@ def getMetrics() def getContainerCpuMetricItems(metricJSON, hostName, cpuMetricNameToCollect, metricNametoReturn) metricItems = [] clusterId = KubernetesApiClient.getClusterId + timeDifference = (DateTime.now.to_time.to_i - @@telemetryCpuMetricTimeTracker).abs + timeDifferenceInMinutes = timeDifference/60 begin metricInfo = metricJSON metricInfo['pods'].each do |pod| podUid = pod['podRef']['uid'] + podName = pod['podRef']['name'] + podNamespace = pod['podRef']['namespace'] + if (!pod['containers'].nil?) pod['containers'].each do |container| #cpu metric @@ -124,9 +132,29 @@ def getContainerCpuMetricItems(metricJSON, hostName, cpuMetricNameToCollect, met metricProps['Collections'].push(metricCollections) metricItem['DataItems'].push(metricProps) metricItems.push(metricItem) + #Telemetry about agent performance + begin + # we can only do this much now. Ideally would like to use the docker image repository to find our pods/containers + # cadvisor does not have pod/container metadata. 
so would need more work to cache as pv & use + if (podName.downcase.start_with?('omsagent-') && podNamespace.eql?("kube-system") && containerName.downcase.start_with?('omsagent') && metricNametoReturn.eql?("cpuUsageNanoCores")) + + if (timeDifferenceInMinutes >= 10) + telemetryProps = {} + telemetryProps['PodName'] = podName + telemetryProps['ContainerName'] = containerName + ApplicationInsightsUtility.sendMetricTelemetry(metricNametoReturn, metricValue, telemetryProps) + end + end + rescue => errorStr + $log.warn("Exception while generating Telemetry from getcontainerCpuMetricItems failed: #{errorStr} for metric #{cpuMetricNameToCollect}") + end end end end + # reset time outside pod iterator as we use one timer per metric for 2 pods (ds & rs) + if (timeDifferenceInMinutes >= 10 && metricNametoReturn.eql?("cpuUsageNanoCores")) + @@telemetryCpuMetricTimeTracker = DateTime.now.to_time.to_i + end rescue => error @Log.warn("getcontainerCpuMetricItems failed: #{error} for metric #{cpuMetricNameToCollect}") return metricItems @@ -137,10 +165,14 @@ def getContainerCpuMetricItems(metricJSON, hostName, cpuMetricNameToCollect, met def getContainerMemoryMetricItems(metricJSON, hostName, memoryMetricNameToCollect, metricNametoReturn) metricItems = [] clusterId = KubernetesApiClient.getClusterId + timeDifference = (DateTime.now.to_time.to_i - @@telemetryMemoryMetricTimeTracker).abs + timeDifferenceInMinutes = timeDifference/60 begin metricInfo = metricJSON metricInfo['pods'].each do |pod| podUid = pod['podRef']['uid'] + podName = pod['podRef']['name'] + podNamespace = pod['podRef']['namespace'] if (!pod['containers'].nil?) 
pod['containers'].each do |container| containerName = container['name'] @@ -164,9 +196,28 @@ def getContainerMemoryMetricItems(metricJSON, hostName, memoryMetricNameToCollec metricProps['Collections'].push(metricCollections) metricItem['DataItems'].push(metricProps) metricItems.push(metricItem) + #Telemetry about agent performance + begin + # we can only do this much now. Ideally would like to use the docker image repository to find our pods/containers + # cadvisor does not have pod/container metadata. so would need more work to cache as pv & use + if (podName.downcase.start_with?('omsagent-') && podNamespace.eql?("kube-system") && containerName.downcase.start_with?('omsagent') && metricNametoReturn.eql?("memoryRssBytes")) + if (timeDifferenceInMinutes >= 10) + telemetryProps = {} + telemetryProps['PodName'] = podName + telemetryProps['ContainerName'] = containerName + ApplicationInsightsUtility.sendMetricTelemetry(metricNametoReturn, metricValue, telemetryProps) + end + end + rescue => errorStr + $log.warn("Exception while generating Telemetry from getcontainerMemoryMetricItems failed: #{errorStr} for metric #{memoryMetricNameToCollect}") + end end end end + # reset time outside pod iterator as we use one timer per metric for 2 pods (ds & rs) + if (timeDifferenceInMinutes >= 10 && metricNametoReturn.eql?("memoryRssBytes")) + @@telemetryMemoryMetricTimeTracker = DateTime.now.to_time.to_i + end rescue => error @Log.warn("getcontainerMemoryMetricItems failed: #{error} for metric #{memoryMetricNameToCollect}") @Log.warn metricJSON From 4f3c8988e4d1a989f8e9ab0e897443f1f4a94563 Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Tue, 27 Nov 2018 10:39:41 -0800 Subject: [PATCH 034/160] environment variable fix (#166) * environment variable fix * updating agent version --- installer/conf/td-agent-bit.conf | 2 +- source/code/plugin/in_containerinventory.rb | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/installer/conf/td-agent-bit.conf 
b/installer/conf/td-agent-bit.conf index 2a6199987..fe174f9a5 100644 --- a/installer/conf/td-agent-bit.conf +++ b/installer/conf/td-agent-bit.conf @@ -28,4 +28,4 @@ EnableTelemetry true TelemetryPushIntervalSeconds 300 Match oms.container.log.* - AgentVersion ciprod10162018-2 + AgentVersion internaltest1126 diff --git a/source/code/plugin/in_containerinventory.rb b/source/code/plugin/in_containerinventory.rb index 43811e1e1..f501421a2 100644 --- a/source/code/plugin/in_containerinventory.rb +++ b/source/code/plugin/in_containerinventory.rb @@ -56,6 +56,11 @@ def obtainContainerConfig(instance, container) envValue = configValue['Env'] envValueString = (envValue.nil?) ? "" : envValue.to_s + # Skip environment variable processing if it contains the flag AZMON_COLLECT_ENV=FALSE + if /AZMON_COLLECT_ENV=FALSE/i.match(envValueString) + envValueString = ["AZMON_COLLECT_ENV=FALSE"] + $log.warn("Environment Variable collection for container: #{container['Id']} skipped because AZMON_COLLECT_ENV is set to false") + end # Restricting the ENV string value to 200kb since the size of this string can go very high if envValueString.length > 200000 envValueStringTruncated = envValueString.slice(0..200000) From 5e16467696df96d59d32d7219b901c1450b44201 Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Tue, 27 Nov 2018 11:20:51 -0800 Subject: [PATCH 035/160] Fixing a bug where we were crashing due to container statuses not present when not was lost (#167) --- source/code/plugin/in_kube_podinventory.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/code/plugin/in_kube_podinventory.rb b/source/code/plugin/in_kube_podinventory.rb index 2cd1e1bc3..ec76bac61 100644 --- a/source/code/plugin/in_kube_podinventory.rb +++ b/source/code/plugin/in_kube_podinventory.rb @@ -101,7 +101,7 @@ def parse_and_emit_records(podInventory, serviceList) #podStatus # the below is for accounting 'NodeLost' scenario, where-in the pod(s) in the lost node is still being reported as running 
podReadyCondition = true - if !items['status']['reason'].nil? && items['status']['reason'] == "NodeLost" + if !items['status']['reason'].nil? && items['status']['reason'] == "NodeLost" && !items['status']['conditions'].nil? items['status']['conditions'].each do |condition| if condition['type'] == "Ready" && condition['status'] == "False" podReadyCondition = false From b482b1ecb667d4f75cd3902c5baf6debd25990ef Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Wed, 28 Nov 2018 17:37:41 -0800 Subject: [PATCH 036/160] Updating title --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 0c543e716..8755cedb3 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# AKS Container Health monitoring +# Azure Monitor for Containers ## Code of Conduct @@ -40,4 +40,4 @@ additional questions or comments. - Kubernetes RBAC enablement - Latest released omsagent (1.6.0-42) - Bug fix so that we do not collect kube-system namespace container logs when kube api calls fail occasionally (Bug #215107) -- .yaml changes (for RBAC) \ No newline at end of file +- .yaml changes (for RBAC) From d75ba897b9ccd58a4ad8a049b87b09a990ea7934 Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Wed, 28 Nov 2018 17:40:41 -0800 Subject: [PATCH 037/160] updating right versions for last release --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8755cedb3..ace2ff57b 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ additional questions or comments. 
## Release History -### 10/16/2018 - Version microsoft/oms:ciprod10162018 +### 10/16/2018 - Version microsoft/oms:ciprod10162018-2 - Fix for containerID being 00000-00000-00000 - Move from fluentD to fluentbit for container log collection - Seg fault fixes in json parsing for container inventory & container image inventory From cbd815c90bea4f7878eb6c0908f3d0456737dbd5 Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Thu, 29 Nov 2018 11:25:15 -0800 Subject: [PATCH 038/160] Updating the break condition to look for end of response (#168) * Updating the break condition to look for end of response * changes for docker response --- source/code/plugin/DockerApiClient.rb | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/source/code/plugin/DockerApiClient.rb b/source/code/plugin/DockerApiClient.rb index b93411980..e12ef13ec 100644 --- a/source/code/plugin/DockerApiClient.rb +++ b/source/code/plugin/DockerApiClient.rb @@ -19,7 +19,7 @@ def initialize class << self # Make docker socket call for requests - def getResponse(request, isMultiJson) + def getResponse(request, isMultiJson, isVersion) begin socket = UNIXSocket.new(@@SocketPath) dockerResponse = "" @@ -36,8 +36,9 @@ def getResponse(request, isMultiJson) rescue Timeout::Error $log.warn("Socket read timedout for request: #{request} @ #{Time.now.utc.iso8601}") isTimeOut = true + break end - break if responseChunk.length < @@ChunkSize + break if (isVersion)? (responseChunk.length < @@ChunkSize) : (responseChunk.end_with? "0\r\n\r\n") end socket.close return (isTimeOut)? 
nil : parseResponse(dockerResponse, isMultiJson) @@ -71,7 +72,7 @@ def parseResponse(dockerResponse, isMultiJson) def getDockerHostName() dockerHostName = "" request = DockerApiRestHelper.restDockerInfo - response = getResponse(request, false) + response = getResponse(request, false, false) if (response != nil) dockerHostName = response['Name'] end @@ -81,7 +82,7 @@ def getDockerHostName() def listContainers() ids = [] request = DockerApiRestHelper.restDockerPs - containers = getResponse(request, true) + containers = getResponse(request, true, false) if !containers.nil? && !containers.empty? containers.each do |container| ids.push(container['Id']) @@ -121,7 +122,7 @@ def getImageIdMap() result = nil begin request = DockerApiRestHelper.restDockerImages - images = getResponse(request, true) + images = getResponse(request, true, false) if !images.nil? && !images.empty? result = {} images.each do |image| @@ -144,13 +145,13 @@ def getImageIdMap() def dockerInspectContainer(id) request = DockerApiRestHelper.restDockerInspect(id) - return getResponse(request, false) + return getResponse(request, false, false) end # This method returns docker version and docker api version for telemetry def dockerInfo() request = DockerApiRestHelper.restDockerVersion - response = getResponse(request, false) + response = getResponse(request, false, true) dockerInfo = {} if (response != nil) dockerInfo['Version'] = response['Version'] @@ -159,4 +160,4 @@ def dockerInfo() return dockerInfo end end -end \ No newline at end of file +end From d0d5bf78798e3d90655fc08f8a1666daa30c47d3 Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Thu, 29 Nov 2018 12:01:11 -0800 Subject: [PATCH 039/160] updating AgentVersion for telemetry --- installer/conf/td-agent-bit.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/installer/conf/td-agent-bit.conf b/installer/conf/td-agent-bit.conf index fe174f9a5..c92bcdf07 100644 --- a/installer/conf/td-agent-bit.conf +++ 
b/installer/conf/td-agent-bit.conf @@ -28,4 +28,4 @@ EnableTelemetry true TelemetryPushIntervalSeconds 300 Match oms.container.log.* - AgentVersion internaltest1126 + AgentVersion ciprod11292018 From bfe27e5c6f7c3a97dc98f9e7296f25ea2c1d5a36 Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Thu, 29 Nov 2018 12:16:35 -0800 Subject: [PATCH 040/160] Updating readme for latest release changes --- README.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/README.md b/README.md index ace2ff57b..17a3cf3ad 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,22 @@ additional questions or comments. ## Release History +### 11/29/2018 - Version microsoft/oms:ciprod11292018 +- Disable Container Image inventory workflow +- Kube_Events memory leak fix for replica-set +- Timeout (30 secs) for outOMS +- Reduce critical lock duration for quicker log processing (for log enrichment) +- Disable OMI based Container Inventory workflow to fluentD based Container Inventory +- Moby support for the new Container Inventory workflow +- Ability to disable environment variables collection by individual container +- Bugfix - No inventory data due to container status(es) not available +- Agent telemetry cpu usage & memory usage (for DaemonSet and ReplicaSet) +- Agent telemetry - log generation rate +- Agent telemetry - container count per node +- Agent telemetry - collect container logs from agent (DaemonSet and ReplicaSet) as AI trace +- Agent telemetry - errors/exceptions for Container Inventory workflow +- Agent telemetry - Container Inventory Heartbeat + ### 10/16/2018 - Version microsoft/oms:ciprod10162018-2 - Fix for containerID being 00000-00000-00000 - Move from fluentD to fluentbit for container log collection From a621f883b0059db69ea1c2df48eef9671bc07b7e Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Sun, 16 Dec 2018 20:17:56 -0800 Subject: [PATCH 041/160] Changes - (#173) * use /var/log for state * new metric ContainerLogsAgentSideLatencyMs * new field 'timeOfComand' --- 
installer/conf/td-agent-bit.conf | 2 +- source/code/go/src/plugins/oms.go | 43 ++++++++++++++++++------- source/code/go/src/plugins/telemetry.go | 12 +++++++ 3 files changed, 45 insertions(+), 12 deletions(-) diff --git a/installer/conf/td-agent-bit.conf b/installer/conf/td-agent-bit.conf index c3252a185..b6b9bcc44 100644 --- a/installer/conf/td-agent-bit.conf +++ b/installer/conf/td-agent-bit.conf @@ -8,7 +8,7 @@ Name tail Tag oms.container.log.* Path /var/log/containers/*.log - DB /var/opt/microsoft/docker-cimprov/state/fblogs.db + DB /var/log/omsagent-fblogs.db Parser docker Mem_Buf_Limit 30m Path_Key filepath diff --git a/source/code/go/src/plugins/oms.go b/source/code/go/src/plugins/oms.go index 9876acc42..30e844915 100644 --- a/source/code/go/src/plugins/oms.go +++ b/source/code/go/src/plugins/oms.go @@ -77,9 +77,10 @@ var ( // DataItem represents the object corresponding to the json that is sent by fluentbit tail plugin type DataItem struct { - LogEntry string `json:"LogEntry"` - LogEntrySource string `json:"LogEntrySource"` - LogEntryTimeStamp string `json:"LogEntryTimeStamp"` + LogEntry string `json:"LogEntry"` + LogEntrySource string `json:"LogEntrySource"` + LogEntryTimeStamp string `json:"LogEntryTimeStamp"` + LogEntryTimeOfCommand string `json:"TimeOfCommand"` ID string `json:"Id"` Image string `json:"Image"` Name string `json:"Name"` @@ -204,6 +205,8 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { start := time.Now() var dataItems []DataItem + var maxLatency float64 + var maxLatencyContainer string ignoreIDSet := make(map[string]bool) imageIDMap := make(map[string]string) nameIDMap := make(map[string]string) @@ -248,18 +251,32 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { Log("ContainerId %s not present in Map ", containerID) } + dataItem := DataItem{ - ID: stringMap["Id"], - LogEntry: stringMap["LogEntry"], - LogEntrySource: stringMap["LogEntrySource"], - LogEntryTimeStamp: 
stringMap["LogEntryTimeStamp"], - SourceSystem: stringMap["SourceSystem"], - Computer: Computer, - Image: stringMap["Image"], - Name: stringMap["Name"], + ID: stringMap["Id"], + LogEntry: stringMap["LogEntry"], + LogEntrySource: stringMap["LogEntrySource"], + LogEntryTimeStamp: stringMap["LogEntryTimeStamp"], + LogEntryTimeOfCommand: start.Format(time.RFC3339), + SourceSystem: stringMap["SourceSystem"], + Computer: Computer, + Image: stringMap["Image"], + Name: stringMap["Name"], } dataItems = append(dataItems, dataItem) + loggedTime, e := time.Parse(time.RFC3339, dataItem.LogEntryTimeStamp) + if e!= nil { + message := fmt.Sprintf("Error while converting LogEntryTimeStamp for telemetry purposes: %s", e.Error()) + Log(message) + SendException(message) + } else { + ltncy := float64(start.Sub(loggedTime) / time.Millisecond) + if ltncy >= maxLatency { + maxLatency = ltncy + maxLatencyContainer = dataItem.Name + "=" + dataItem.ID + } + } } if len(dataItems) > 0 { @@ -302,6 +319,10 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { ContainerLogTelemetryMutex.Lock() FlushedRecordsCount += float64(numRecords) FlushedRecordsTimeTaken += float64(elapsed / time.Millisecond) + if maxLatency >= AgentLogProcessingMaxLatencyMs { + AgentLogProcessingMaxLatencyMs = maxLatency + AgentLogProcessingMaxLatencyMsContainer = maxLatencyContainer + } ContainerLogTelemetryMutex.Unlock() } diff --git a/source/code/go/src/plugins/telemetry.go b/source/code/go/src/plugins/telemetry.go index 5952ac9ac..0d5513362 100644 --- a/source/code/go/src/plugins/telemetry.go +++ b/source/code/go/src/plugins/telemetry.go @@ -17,6 +17,10 @@ var ( FlushedRecordsCount float64 // FlushedRecordsTimeTaken indicates the cumulative time taken to flush the records for the current period FlushedRecordsTimeTaken float64 + // This is telemetry for how old/latent logs we are processing in milliseconds (max over a period of time) + AgentLogProcessingMaxLatencyMs float64 + // This is telemetry 
for which container logs were latent (max over a period of time) + AgentLogProcessingMaxLatencyMsContainer string // CommonProperties indicates the dimensions that are sent with every event/metric CommonProperties map[string]string // TelemetryClient is the client used to send the telemetry @@ -35,6 +39,7 @@ const ( envAppInsightsAuth = "APPLICATIONINSIGHTS_AUTH" metricNameAvgFlushRate = "ContainerLogAvgRecordsFlushedPerSec" metricNameAvgLogGenerationRate = "ContainerLogsGeneratedPerSec" + metricNameAgentLogProcessingMaxLatencyMs = "ContainerLogsAgentSideLatencyMs" defaultTelemetryPushIntervalSeconds = 300 eventNameContainerLogInit = "ContainerLogPluginInitialized" @@ -62,12 +67,19 @@ func SendContainerLogPluginMetrics(telemetryPushIntervalProperty string) { logRate := FlushedRecordsCount / float64(elapsed/time.Second) FlushedRecordsCount = 0.0 FlushedRecordsTimeTaken = 0.0 + logLatencyMs := AgentLogProcessingMaxLatencyMs + logLatencyMsContainer := AgentLogProcessingMaxLatencyMsContainer + AgentLogProcessingMaxLatencyMs = 0 + AgentLogProcessingMaxLatencyMsContainer = "" ContainerLogTelemetryMutex.Unlock() flushRateMetric := appinsights.NewMetricTelemetry(metricNameAvgFlushRate, flushRate) TelemetryClient.Track(flushRateMetric) logRateMetric := appinsights.NewMetricTelemetry(metricNameAvgLogGenerationRate, logRate) TelemetryClient.Track(logRateMetric) + logLatencyMetric := appinsights.NewMetricTelemetry(metricNameAgentLogProcessingMaxLatencyMs, logLatencyMs) + logLatencyMetric.Properties["Container"] = logLatencyMsContainer + TelemetryClient.Track(logLatencyMetric) start = time.Now() } } From c9cf4fd7e5b3176136b47390ba405ee6afd6719b Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Mon, 17 Dec 2018 13:58:09 -0800 Subject: [PATCH 042/160] Rashmi/kubenodeinventory (#174) * containernodeinventory changes * changes for containernodeinventory * changes to add node telemetry * pod telemetry cahnges * updated telemetry changes * changes to get uid of owner 
references as controller id --- installer/conf/container.conf | 23 ---------- installer/conf/kube.conf | 13 ++++++ .../code/plugin/ApplicationInsightsUtility.rb | 6 +-- source/code/plugin/in_kube_nodes.rb | 45 ++++++++++++++++--- source/code/plugin/in_kube_podinventory.rb | 20 +++++++++ 5 files changed, 76 insertions(+), 31 deletions(-) diff --git a/installer/conf/container.conf b/installer/conf/container.conf index 798bd8eb6..091753230 100755 --- a/installer/conf/container.conf +++ b/installer/conf/container.conf @@ -15,16 +15,6 @@ log_level debug -# Container host inventory - - type omi - run_interval 60s - tag oms.api.ContainerNodeInventory - items [ - ["root/cimv2","Container_HostInventory"] - ] - - #cadvisor perf type cadvisorperf @@ -33,19 +23,6 @@ log_level debug - - type out_oms_api - log_level debug - buffer_chunk_limit 20m - buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_containernodeinventory*.buffer - buffer_queue_limit 20 - flush_interval 20s - retry_limit 10 - retry_wait 15s - max_retry_wait 9m - - type out_oms log_level debug diff --git a/installer/conf/kube.conf b/installer/conf/kube.conf index 94fe2ef0b..22c51ad0e 100644 --- a/installer/conf/kube.conf +++ b/installer/conf/kube.conf @@ -118,6 +118,19 @@ max_retry_wait 9m + + type out_oms_api + log_level debug + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_containernodeinventory*.buffer + buffer_queue_limit 20 + flush_interval 20s + retry_limit 10 + retry_wait 15s + max_retry_wait 9m + + type out_oms log_level debug diff --git a/source/code/plugin/ApplicationInsightsUtility.rb b/source/code/plugin/ApplicationInsightsUtility.rb index 78553a83f..76e0b2926 100644 --- a/source/code/plugin/ApplicationInsightsUtility.rb +++ b/source/code/plugin/ApplicationInsightsUtility.rb @@ -83,7 +83,7 @@ def sendHeartBeatEvent(pluginName) end end - def sendCustomEvent(pluginName, properties) + def sendCustomMetric(pluginName, properties) begin if !(@@Tc.nil?) 
@@Tc.track_metric 'LastProcessedContainerInventoryCount', properties['ContainerCount'], @@ -93,7 +93,7 @@ def sendCustomEvent(pluginName, properties) $log.info("AppInsights Container Count Telemetry sent successfully") end rescue => errorStr - $log.warn("Exception in AppInsightsUtility: sendCustomEvent - error: #{errorStr}") + $log.warn("Exception in AppInsightsUtility: sendCustomMetric - error: #{errorStr}") end end @@ -120,7 +120,7 @@ def sendTelemetry(pluginName, properties) end @@CustomProperties['Computer'] = properties['Computer'] sendHeartBeatEvent(pluginName) - sendCustomEvent(pluginName, properties) + sendCustomMetric(pluginName, properties) rescue => errorStr $log.warn("Exception in AppInsightsUtility: sendTelemetry - error: #{errorStr}") end diff --git a/source/code/plugin/in_kube_nodes.rb b/source/code/plugin/in_kube_nodes.rb index edbbdd37f..1c792d0da 100644 --- a/source/code/plugin/in_kube_nodes.rb +++ b/source/code/plugin/in_kube_nodes.rb @@ -6,12 +6,15 @@ module Fluent class Kube_nodeInventory_Input < Input Plugin.register_input('kubenodeinventory', self) + @@ContainerNodeInventoryTag = 'oms.api.ContainerNodeInventory' + def initialize super require 'yaml' require 'json' require_relative 'KubernetesApiClient' + require_relative 'ApplicationInsightsUtility' require_relative 'oms_common' require_relative 'omslog' end @@ -29,6 +32,7 @@ def start @condition = ConditionVariable.new @mutex = Mutex.new @thread = Thread.new(&method(:run_periodic)) + @@nodeTelemetryTimeTracker = DateTime.now.to_time.to_i end end @@ -46,15 +50,22 @@ def enumerate currentTime = Time.now emitTime = currentTime.to_f batchTime = currentTime.utc.iso8601 - $log.info("in_kube_nodes::enumerate : Getting nodes from Kube API @ #{Time.now.utc.iso8601}") - nodeInventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo('nodes').body) - $log.info("in_kube_nodes::enumerate : Done getting nodes from Kube API @ #{Time.now.utc.iso8601}") + telemetrySent = false + 
$log.info("in_kube_nodes::enumerate : Getting nodes from Kube API @ #{Time.now.utc.iso8601}") + nodeInventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo('nodes').body) + $log.info("in_kube_nodes::enumerate : Done getting nodes from Kube API @ #{Time.now.utc.iso8601}") begin if(!nodeInventory.empty?) eventStream = MultiEventStream.new + containerNodeInventoryEventStream = MultiEventStream.new #get node inventory nodeInventory['items'].each do |items| record = {} + # Sending records for ContainerNodeInventory + containerNodeInventoryRecord = {} + containerNodeInventoryRecord['CollectionTime'] = batchTime #This is the time that is mapped to become TimeGenerated + containerNodeInventoryRecord['Computer'] = items['metadata']['name'] + record['CollectionTime'] = batchTime #This is the time that is mapped to become TimeGenerated record['Computer'] = items['metadata']['name'] record['ClusterName'] = KubernetesApiClient.getClusterName @@ -89,16 +100,40 @@ def enumerate end - record['KubeletVersion'] = items['status']['nodeInfo']['kubeletVersion'] - record['KubeProxyVersion'] = items['status']['nodeInfo']['kubeProxyVersion'] + nodeInfo = items['status']['nodeInfo'] + record['KubeletVersion'] = nodeInfo['kubeletVersion'] + record['KubeProxyVersion'] = nodeInfo['kubeProxyVersion'] + containerNodeInventoryRecord['OperatingSystem'] = nodeInfo['osImage'] + dockerVersion = nodeInfo['containerRuntimeVersion'] + dockerVersion.slice! "docker://" + containerNodeInventoryRecord['DockerVersion'] = dockerVersion + # ContainerNodeInventory data for docker version and operating system. 
+ containerNodeInventoryEventStream.add(emitTime, containerNodeInventoryRecord) if containerNodeInventoryRecord + wrapper = { "DataType"=>"KUBE_NODE_INVENTORY_BLOB", "IPName"=>"ContainerInsights", "DataItems"=>[record.each{|k,v| record[k]=v}] } eventStream.add(emitTime, wrapper) if wrapper + # Adding telemetry to send node telemetry every 5 minutes + timeDifference = (DateTime.now.to_time.to_i - @@nodeTelemetryTimeTracker).abs + timeDifferenceInMinutes = timeDifference/60 + if (timeDifferenceInMinutes >= 5) + properties = {} + properties["Computer"] = record["Computer"] + ApplicationInsightsUtility.sendMetricTelemetry("KubeletVersion", record["KubeletVersion"] , properties) + capacityInfo = items['status']['capacity'] + ApplicationInsightsUtility.sendMetricTelemetry("NodeCoreCapacity", capacityInfo["cpu"] , properties) + ApplicationInsightsUtility.sendMetricTelemetry("NodeMemory", capacityInfo["memory"] , properties) + telemetrySent = true + end end router.emit_stream(@tag, eventStream) if eventStream + router.emit_stream(@@ContainerNodeInventoryTag, containerNodeInventoryEventStream) if containerNodeInventoryEventStream + if telemetrySent == true + @@nodeTelemetryTimeTracker = DateTime.now.to_time.to_i + end @@istestvar = ENV['ISTEST'] if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp('true') == 0 && eventStream.count > 0) $log.info("kubeNodeInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") diff --git a/source/code/plugin/in_kube_podinventory.rb b/source/code/plugin/in_kube_podinventory.rb index ec76bac61..c6873e8fe 100644 --- a/source/code/plugin/in_kube_podinventory.rb +++ b/source/code/plugin/in_kube_podinventory.rb @@ -10,8 +10,10 @@ def initialize super require 'yaml' require 'json' + require 'set' require_relative 'KubernetesApiClient' + require_relative 'ApplicationInsightsUtility' require_relative 'oms_common' require_relative 'omslog' end @@ -29,6 +31,7 @@ def start @condition = ConditionVariable.new @mutex = Mutex.new @thread = Thread.new(&method(:run_periodic)) + @@podTelemetryTimeTracker = DateTime.now.to_time.to_i end end @@ -71,6 +74,8 @@ def parse_and_emit_records(podInventory, serviceList) emitTime = currentTime.to_f batchTime = currentTime.utc.iso8601 eventStream = MultiEventStream.new + controllerSet = Set.new [] + telemetryFlush = false begin #begin block start podInventory['items'].each do |items| #podInventory block start records = [] @@ -78,6 +83,7 @@ def parse_and_emit_records(podInventory, serviceList) record['CollectionTime'] = batchTime #This is the time that is mapped to become TimeGenerated record['Name'] = items['metadata']['name'] podNameSpace = items['metadata']['namespace'] + if podNameSpace.eql?("kube-system") && !items['metadata'].key?("ownerReferences") # The above case seems to be the only case where you have horizontal scaling of pods # but no controller, in which case cAdvisor picks up kubernetes.io/config.hash @@ -129,9 +135,18 @@ def parse_and_emit_records(podInventory, serviceList) record['ClusterId'] = KubernetesApiClient.getClusterId record['ClusterName'] = KubernetesApiClient.getClusterName record['ServiceName'] = getServiceNameFromLabels(items['metadata']['namespace'], items['metadata']['labels'], serviceList) + # Adding telemetry to send pod telemetry every 5 minutes + 
timeDifference = (DateTime.now.to_time.to_i - @@podTelemetryTimeTracker).abs + timeDifferenceInMinutes = timeDifference/60 + if (timeDifferenceInMinutes >= 5) + telemetryFlush = true + end if !items['metadata']['ownerReferences'].nil? record['ControllerKind'] = items['metadata']['ownerReferences'][0]['kind'] record['ControllerName'] = items['metadata']['ownerReferences'][0]['name'] + if telemetryFlush == true + controllerSet.add(record['ControllerKind'] + record['ControllerName']) + end end podRestartCount = 0 record['PodRestartCount'] = 0 @@ -191,6 +206,11 @@ def parse_and_emit_records(podInventory, serviceList) end end #podInventory block end router.emit_stream(@tag, eventStream) if eventStream + if telemetryFlush == true + ApplicationInsightsUtility.sendMetricTelemetry("PodCount", podInventory['items'].length , {}) + ApplicationInsightsUtility.sendMetricTelemetry("ControllerCount", controllerSet.length , {}) + @@podTelemetryTimeTracker = DateTime.now.to_time.to_i + end @@istestvar = ENV['ISTEST'] if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp('true') == 0 && eventStream.count > 0) $log.info("kubePodInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") From df6f1228a4649df3fb1bae1c9ea02f22daca8efd Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Thu, 20 Dec 2018 15:27:18 -0800 Subject: [PATCH 043/160] Get cpuusage from usageseconds (#175) --- .../code/plugin/CAdvisorMetricsAPIClient.rb | 36 ++++++++++++++----- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/source/code/plugin/CAdvisorMetricsAPIClient.rb b/source/code/plugin/CAdvisorMetricsAPIClient.rb index 9e47e5a9e..03d6f89f5 100644 --- a/source/code/plugin/CAdvisorMetricsAPIClient.rb +++ b/source/code/plugin/CAdvisorMetricsAPIClient.rb @@ -20,8 +20,11 @@ class CAdvisorMetricsAPIClient @@rxBytesTimeLast = nil @@txBytesLast = nil @@txBytesTimeLast = nil + @@nodeCpuUsageNanoSecondsLast = nil + @@nodeCpuUsageNanoSecondsTimeLast = nil @@telemetryCpuMetricTimeTracker = DateTime.now.to_time.to_i @@telemetryMemoryMetricTimeTracker = DateTime.now.to_time.to_i + def initialize end @@ -73,7 +76,7 @@ def getMetrics() metricDataItems.concat(getContainerMemoryMetricItems(metricInfo, hostName, "rssBytes", "memoryRssBytes")) metricDataItems.concat(getContainerStartTimeMetricItems(metricInfo, hostName, "restartTimeEpoch")) - metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "cpu", "usageNanoCores", "cpuUsageNanoCores")) + metricDataItems.push(getNodeMetricItemRate(metricInfo, hostName, "cpu", "usageCoreNanoSeconds", "cpuUsageNanoCores")) metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "memory", "workingSetBytes", "memoryWorkingSetBytes")) metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "memory", "rssBytes", "memoryRssBytes")) metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "network", "rxBytes", "networkRxBytes")) @@ -274,24 +277,41 @@ def getNodeMetricItemRate(metricJSON, hostName, metricCategory, metricNameToColl metricValue = node[metricCategory][metricNameToCollect] metricTime = 
node[metricCategory]['time'] - if !(metricNameToCollect == "rxBytes" || metricNameToCollect == "txBytes" ) - @Log.warn("getNodeMetricItemRate : rateMetric is supported only for rxBytes & txBytes and not for #{metricNameToCollect}") + if !(metricNameToCollect == "rxBytes" || metricNameToCollect == "txBytes" || metricNameToCollect == "usageCoreNanoSeconds" ) + @Log.warn("getNodeMetricItemRate : rateMetric is supported only for rxBytes, txBytes & usageCoreNanoSeconds and not for #{metricNameToCollect}") return nil elsif metricNameToCollect == "rxBytes" - if @@rxBytesLast.nil? || @@rxBytesTimeLast.nil? + if @@rxBytesLast.nil? || @@rxBytesTimeLast.nil? || @@rxBytesLast > metricValue #when kubelet is restarted the last condition will be true @@rxBytesLast = metricValue @@rxBytesTimeLast = metricTime return nil else - metricValue = ((metricValue - @@rxBytesLast) * 1.0)/(DateTime.parse(metricTime).to_time - DateTime.parse(@@rxBytesTimeLast).to_time) + metricRateValue = ((metricValue - @@rxBytesLast) * 1.0)/(DateTime.parse(metricTime).to_time - DateTime.parse(@@rxBytesTimeLast).to_time) + @@rxBytesLast = metricValue + @@rxBytesTimeLast = metricTime + metricValue = metricRateValue end - else - if @@txBytesLast.nil? || @@txBytesTimeLast.nil? + elsif metricNameToCollect == "txBytes" + if @@txBytesLast.nil? || @@txBytesTimeLast.nil? || @@txBytesLast > metricValue #when kubelet is restarted the last condition will be true @@txBytesLast = metricValue @@txBytesTimeLast = metricTime return nil else - metricValue = ((metricValue - @@txBytesLast) * 1.0)/(DateTime.parse(metricTime).to_time - DateTime.parse(@@txBytesTimeLast).to_time) + metricRateValue = ((metricValue - @@txBytesLast) * 1.0)/(DateTime.parse(metricTime).to_time - DateTime.parse(@@txBytesTimeLast).to_time) + @@txBytesLast = metricValue + @@txBytesTimeLast = metricTime + metricValue = metricRateValue + end + else + if @@nodeCpuUsageNanoSecondsLast.nil? || @@nodeCpuUsageNanoSecondsTimeLast.nil? 
|| @@nodeCpuUsageNanoSecondsLast > metricValue #when kubelet is restarted the last condition will be true + @@nodeCpuUsageNanoSecondsLast = metricValue + @@nodeCpuUsageNanoSecondsTimeLast = metricTime + return nil + else + metricRateValue = ((metricValue - @@nodeCpuUsageNanoSecondsLast) * 1.0)/(DateTime.parse(metricTime).to_time - DateTime.parse(@@nodeCpuUsageNanoSecondsTimeLast).to_time) + @@nodeCpuUsageNanoSecondsLast = metricValue + @@nodeCpuUsageNanoSecondsTimeLast = metricTime + metricValue = metricRateValue end end From dac99311485f2600f9a1fd7b6c48470ada40e8ef Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Fri, 21 Dec 2018 10:46:56 -0800 Subject: [PATCH 044/160] Rashmi/kubenodeinventory (#176) * containernodeinventory changes * changes for containernodeinventory * changes to add node telemetry * pod telemetry cahnges * updated telemetry changes * changes to get uid of owner references as controller id * updating socket to the new mount location * Adding exception telemetry and heartbeat * changes to fix controller type * Fixing typo * fixing method signature * updating plugins to get controller type from env * fixing bugs --- .../code/plugin/ApplicationInsightsUtility.rb | 28 +++++++++++++------ source/code/plugin/DockerApiClient.rb | 3 +- source/code/plugin/in_containerinventory.rb | 1 + source/code/plugin/in_kube_events.rb | 6 ++++ source/code/plugin/in_kube_nodes.rb | 3 ++ source/code/plugin/in_kube_podinventory.rb | 6 ++++ source/code/plugin/in_kube_services.rb | 4 +++ 7 files changed, 42 insertions(+), 9 deletions(-) diff --git a/source/code/plugin/ApplicationInsightsUtility.rb b/source/code/plugin/ApplicationInsightsUtility.rb index 76e0b2926..2b2db673b 100644 --- a/source/code/plugin/ApplicationInsightsUtility.rb +++ b/source/code/plugin/ApplicationInsightsUtility.rb @@ -13,12 +13,12 @@ class ApplicationInsightsUtility @@Exception = 'ExceptionEvent' @@AcsClusterType = 'ACS' @@AksClusterType = 'AKS' - @@DaemonsetControllerType = 'DaemonSet' 
@OmsAdminFilePath = '/etc/opt/microsoft/omsagent/conf/omsadmin.conf' @@EnvAcsResourceName = 'ACS_RESOURCE_NAME' @@EnvAksRegion = 'AKS_REGION' @@EnvAgentVersion = 'AGENT_VERSION' @@EnvApplicationInsightsKey = 'APPLICATIONINSIGHTS_AUTH' + @@EnvControllerType = 'CONTROLLER_TYPE' @@CustomProperties = {} @@Tc = nil @@hostName = (OMS::Common.get_hostname) @@ -54,12 +54,10 @@ def initializeUtility() @@CustomProperties["ClusterName"] = clusterName @@CustomProperties["Region"] = ENV[@@EnvAksRegion] end - @@CustomProperties['ControllerType'] = @@DaemonsetControllerType - dockerInfo = DockerApiClient.dockerInfo - @@CustomProperties['DockerVersion'] = dockerInfo['Version'] - @@CustomProperties['DockerApiVersion'] = dockerInfo['ApiVersion'] + getDockerInfo() @@CustomProperties['WorkspaceID'] = getWorkspaceId @@CustomProperties['AgentVersion'] = ENV[@@EnvAgentVersion] + @@CustomProperties['ControllerType'] = ENV[@@EnvControllerType] encodedAppInsightsKey = ENV[@@EnvApplicationInsightsKey] if !encodedAppInsightsKey.nil? decodedAppInsightsKey = Base64.decode64(encodedAppInsightsKey) @@ -70,6 +68,14 @@ def initializeUtility() end end + def getDockerInfo() + dockerInfo = DockerApiClient.dockerInfo + if (!dockerInfo.nil? && !dockerInfo.empty?) + @@CustomProperties['DockerVersion'] = dockerInfo['Version'] + @@CustomProperties['DockerApiVersion'] = dockerInfo['ApiVersion'] + end + end + def sendHeartBeatEvent(pluginName) begin eventName = pluginName + @@HeartBeat @@ -100,7 +106,9 @@ def sendCustomMetric(pluginName, properties) def sendExceptionTelemetry(errorStr) begin if @@CustomProperties.empty? || @@CustomProperties.nil? - initializeUtility + initializeUtility() + elsif @@CustomProperties['DockerVersion'].nil? + getDockerInfo() end if !(@@Tc.nil?) @@Tc.track_exception errorStr , :properties => @@CustomProperties @@ -116,7 +124,9 @@ def sendExceptionTelemetry(errorStr) def sendTelemetry(pluginName, properties) begin if @@CustomProperties.empty? || @@CustomProperties.nil? 
- initializeUtility + initializeUtility() + elsif @@CustomProperties['DockerVersion'].nil? + getDockerInfo() end @@CustomProperties['Computer'] = properties['Computer'] sendHeartBeatEvent(pluginName) @@ -134,7 +144,9 @@ def sendMetricTelemetry(metricName, metricValue, properties) return end if @@CustomProperties.empty? || @@CustomProperties.nil? - initializeUtility + initializeUtility() + elsif @@CustomProperties['DockerVersion'].nil? + getDockerInfo() end telemetryProps = {} telemetryProps["Computer"] = @@hostName diff --git a/source/code/plugin/DockerApiClient.rb b/source/code/plugin/DockerApiClient.rb index e12ef13ec..903256f6d 100644 --- a/source/code/plugin/DockerApiClient.rb +++ b/source/code/plugin/DockerApiClient.rb @@ -10,10 +10,11 @@ class DockerApiClient require_relative 'DockerApiRestHelper' require_relative 'ApplicationInsightsUtility' - @@SocketPath = "/var/run/docker.sock" + @@SocketPath = "/var/run/host/docker.sock" @@ChunkSize = 4096 @@TimeoutInSeconds = 5 @@PluginName = 'ContainerInventory' + def initialize end diff --git a/source/code/plugin/in_containerinventory.rb b/source/code/plugin/in_containerinventory.rb index f501421a2..a38697741 100644 --- a/source/code/plugin/in_containerinventory.rb +++ b/source/code/plugin/in_containerinventory.rb @@ -19,6 +19,7 @@ def initialize require_relative 'ContainerInventoryState' require_relative 'ApplicationInsightsUtility' require_relative 'omslog' + end config_param :run_interval, :time, :default => '1m' diff --git a/source/code/plugin/in_kube_events.rb b/source/code/plugin/in_kube_events.rb index 5df31df95..b7be24510 100644 --- a/source/code/plugin/in_kube_events.rb +++ b/source/code/plugin/in_kube_events.rb @@ -15,6 +15,8 @@ def initialize require_relative 'KubernetesApiClient' require_relative 'oms_common' require_relative 'omslog' + require_relative 'ApplicationInsightsUtility' + end config_param :run_interval, :time, :default => '1m' @@ -94,6 +96,7 @@ def enumerate(eventList = nil) rescue => errorStr 
$log.warn line.dump, error: errorStr.to_s $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) end end @@ -110,6 +113,7 @@ def run_periodic enumerate rescue => errorStr $log.warn "in_kube_events::run_periodic: enumerate Failed to retrieve kube events: #{errorStr}" + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) end end @mutex.lock @@ -129,6 +133,7 @@ def getEventQueryState rescue => errorStr $log.warn $log.warn line.dump, error: errorStr.to_s $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) end return eventQueryState end @@ -144,6 +149,7 @@ def writeEventQueryState(eventQueryState) rescue => errorStr $log.warn $log.warn line.dump, error: errorStr.to_s $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) end end diff --git a/source/code/plugin/in_kube_nodes.rb b/source/code/plugin/in_kube_nodes.rb index 1c792d0da..85153b21c 100644 --- a/source/code/plugin/in_kube_nodes.rb +++ b/source/code/plugin/in_kube_nodes.rb @@ -17,6 +17,7 @@ def initialize require_relative 'ApplicationInsightsUtility' require_relative 'oms_common' require_relative 'omslog' + end config_param :run_interval, :time, :default => '1m' @@ -142,6 +143,7 @@ def enumerate rescue => errorStr $log.warn "Failed to retrieve node inventory: #{errorStr}" $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) end end @@ -158,6 +160,7 @@ def run_periodic enumerate rescue => errorStr $log.warn "in_kube_nodes::run_periodic: enumerate Failed to retrieve node inventory: #{errorStr}" + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) end end @mutex.lock diff --git a/source/code/plugin/in_kube_podinventory.rb b/source/code/plugin/in_kube_podinventory.rb index c6873e8fe..eaf14b035 100644 --- a/source/code/plugin/in_kube_podinventory.rb +++ b/source/code/plugin/in_kube_podinventory.rb @@ -16,6 
+16,7 @@ def initialize require_relative 'ApplicationInsightsUtility' require_relative 'oms_common' require_relative 'omslog' + end config_param :run_interval, :time, :default => '1m' @@ -66,6 +67,7 @@ def enumerate(podList = nil) rescue => errorStr $log.warn "Failed in enumerate pod inventory: #{errorStr}" $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) end end @@ -207,6 +209,7 @@ def parse_and_emit_records(podInventory, serviceList) end #podInventory block end router.emit_stream(@tag, eventStream) if eventStream if telemetryFlush == true + ApplicationInsightsUtility.sendHeartBeatEvent("KubePodInventory") ApplicationInsightsUtility.sendMetricTelemetry("PodCount", podInventory['items'].length , {}) ApplicationInsightsUtility.sendMetricTelemetry("ControllerCount", controllerSet.length , {}) @@podTelemetryTimeTracker = DateTime.now.to_time.to_i @@ -218,6 +221,7 @@ def parse_and_emit_records(podInventory, serviceList) rescue => errorStr $log.warn "Failed in parse_and_emit_record pod inventory: #{errorStr}" $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) end #begin block end end @@ -234,6 +238,7 @@ def run_periodic enumerate rescue => errorStr $log.warn "in_kube_podinventory::run_periodic: enumerate Failed to retrieve pod inventory: #{errorStr}" + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) end end @mutex.lock @@ -268,6 +273,7 @@ def getServiceNameFromLabels(namespace, labels, serviceList) rescue => errorStr $log.warn "Failed to retrieve service name from labels: #{errorStr}" $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) end return serviceName end diff --git a/source/code/plugin/in_kube_services.rb b/source/code/plugin/in_kube_services.rb index 9a33f4581..655beef59 100644 --- a/source/code/plugin/in_kube_services.rb +++ b/source/code/plugin/in_kube_services.rb @@ -14,6 +14,8 @@ def 
initialize require_relative 'KubernetesApiClient' require_relative 'oms_common' require_relative 'omslog' + require_relative 'ApplicationInsightsUtility' + end config_param :run_interval, :time, :default => '1m' @@ -70,6 +72,7 @@ def enumerate rescue => errorStr $log.warn line.dump, error: errorStr.to_s $log.debug_backtrace(e.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) end end @@ -86,6 +89,7 @@ def run_periodic enumerate rescue => errorStr $log.warn "in_kube_services::run_periodic: enumerate Failed to kube services: #{errorStr}" + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) end end @mutex.lock From 04cc1a87e64cae65ffeba3b061312dcb35959b51 Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Wed, 26 Dec 2018 10:32:22 -0800 Subject: [PATCH 045/160] Rashmi/kubenodeinventory (#178) * containernodeinventory changes * changes for containernodeinventory * changes to add node telemetry * pod telemetry cahnges * updated telemetry changes * changes to get uid of owner references as controller id * updating socket to the new mount location * Adding exception telemetry and heartbeat * changes to fix controller type * Fixing typo * fixing method signature * updating plugins to get controller type from env * fixing bugs * changes to fixed type * removing comments * changes for fixed type --- installer/conf/kube.conf | 19 ++++++++++--------- source/code/plugin/in_kube_events.rb | 9 +++++++-- source/code/plugin/in_kube_services.rb | 9 +++++++-- 3 files changed, 24 insertions(+), 13 deletions(-) diff --git a/installer/conf/kube.conf b/installer/conf/kube.conf index 22c51ad0e..6331d257e 100644 --- a/installer/conf/kube.conf +++ b/installer/conf/kube.conf @@ -11,7 +11,7 @@ #Kubernetes events type kubeevents - tag oms.api.KubeEvents.CollectionTime + tag oms.containerinsights.KubeEvents run_interval 60s log_level debug @@ -26,7 +26,7 @@ #Kubernetes services type kubeservices - tag oms.api.KubeServices.CollectionTime + tag 
oms.containerinsights.KubeServices run_interval 60s log_level debug @@ -62,18 +62,19 @@ max_retry_wait 9m - - type out_oms_api + + type out_oms log_level debug - num_threads 5 + num_threads 5 buffer_chunk_limit 5m buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_api_kubeevents*.buffer + buffer_path %STATE_DIR_WS%/out_oms_kubeevents*.buffer buffer_queue_limit 10 - buffer_queue_full_action drop_oldest_chunk + buffer_queue_full_action drop_oldest_chunk flush_interval 20s retry_limit 10 retry_wait 30s + max_retry_wait 9m @@ -88,8 +89,8 @@ retry_wait 30s - - type out_oms_api + + type out_oms log_level debug num_threads 5 buffer_chunk_limit 20m diff --git a/source/code/plugin/in_kube_events.rb b/source/code/plugin/in_kube_events.rb index b7be24510..309dd8034 100644 --- a/source/code/plugin/in_kube_events.rb +++ b/source/code/plugin/in_kube_events.rb @@ -20,7 +20,7 @@ def initialize end config_param :run_interval, :time, :default => '1m' - config_param :tag, :string, :default => "oms.api.KubeEvents.CollectionTime" + config_param :tag, :string, :default => "oms.containerinsights.KubeEvents" def configure (conf) super @@ -88,7 +88,12 @@ def enumerate(eventList = nil) end record['ClusterName'] = KubernetesApiClient.getClusterName record['ClusterId'] = KubernetesApiClient.getClusterId - eventStream.add(emitTime, record) if record + wrapper = { + "DataType"=>"KUBE_EVENTS_BLOB", + "IPName"=>"ContainerInsights", + "DataItems"=>[record.each{|k,v| record[k]=v}] + } + eventStream.add(emitTime, wrapper) if wrapper end router.emit_stream(@tag, eventStream) if eventStream end diff --git a/source/code/plugin/in_kube_services.rb b/source/code/plugin/in_kube_services.rb index 655beef59..e1bb93f30 100644 --- a/source/code/plugin/in_kube_services.rb +++ b/source/code/plugin/in_kube_services.rb @@ -19,7 +19,7 @@ def initialize end config_param :run_interval, :time, :default => '1m' - config_param :tag, :string, :default => "oms.api.KubeServices.CollectionTime" + config_param :tag, 
:string, :default => "oms.containerinsights.KubeServices" def configure (conf) super @@ -65,7 +65,12 @@ def enumerate record['ClusterIP'] = items['spec']['clusterIP'] record['ServiceType'] = items['spec']['type'] # : Add ports and status fields - eventStream.add(emitTime, record) if record + wrapper = { + "DataType"=>"KUBE_SERVICES_BLOB", + "IPName"=>"ContainerInsights", + "DataItems"=>[record.each{|k,v| record[k]=v}] + } + eventStream.add(emitTime, wrapper) if wrapper end router.emit_stream(@tag, eventStream) if eventStream end From 5883f5368cc9704879b25a145fec80906d91d826 Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Wed, 26 Dec 2018 13:36:48 -0800 Subject: [PATCH 046/160] Fixing an issue on the cpurate metric, which happens for the first time (when cache is empty) (#179) --- source/code/plugin/CAdvisorMetricsAPIClient.rb | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/source/code/plugin/CAdvisorMetricsAPIClient.rb b/source/code/plugin/CAdvisorMetricsAPIClient.rb index 03d6f89f5..97eec06ab 100644 --- a/source/code/plugin/CAdvisorMetricsAPIClient.rb +++ b/source/code/plugin/CAdvisorMetricsAPIClient.rb @@ -76,7 +76,10 @@ def getMetrics() metricDataItems.concat(getContainerMemoryMetricItems(metricInfo, hostName, "rssBytes", "memoryRssBytes")) metricDataItems.concat(getContainerStartTimeMetricItems(metricInfo, hostName, "restartTimeEpoch")) - metricDataItems.push(getNodeMetricItemRate(metricInfo, hostName, "cpu", "usageCoreNanoSeconds", "cpuUsageNanoCores")) + cpuUsageNanoSecondsRate = getNodeMetricItemRate(metricInfo, hostName, "cpu", "usageCoreNanoSeconds", "cpuUsageNanoCores") + if cpuUsageNanoSecondsRate && !cpuUsageNanoSecondsRate.empty? && !cpuUsageNanoSecondsRate.nil? 
+ metricDataItems.push(cpuUsageNanoSecondsRate) + end metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "memory", "workingSetBytes", "memoryWorkingSetBytes")) metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "memory", "rssBytes", "memoryRssBytes")) metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "network", "rxBytes", "networkRxBytes")) From 191f3285dad2065f83b57b4b3e55fad6709b15ab Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Fri, 28 Dec 2018 12:27:46 -0800 Subject: [PATCH 047/160] Rashmi/kubenodeinventory (#180) * containernodeinventory changes * changes for containernodeinventory * changes to add node telemetry * pod telemetry cahnges * updated telemetry changes * changes to get uid of owner references as controller id * updating socket to the new mount location * Adding exception telemetry and heartbeat * changes to fix controller type * Fixing typo * fixing method signature * updating plugins to get controller type from env * fixing bugs * changes to fixed type * removing comments * changes for fixed type * adding kubelet version as a dimension --- source/code/plugin/in_kube_nodes.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/code/plugin/in_kube_nodes.rb b/source/code/plugin/in_kube_nodes.rb index 85153b21c..a6908fc99 100644 --- a/source/code/plugin/in_kube_nodes.rb +++ b/source/code/plugin/in_kube_nodes.rb @@ -123,7 +123,7 @@ def enumerate if (timeDifferenceInMinutes >= 5) properties = {} properties["Computer"] = record["Computer"] - ApplicationInsightsUtility.sendMetricTelemetry("KubeletVersion", record["KubeletVersion"] , properties) + properties["KubeletVersion"] = record["KubeletVersion"] capacityInfo = items['status']['capacity'] ApplicationInsightsUtility.sendMetricTelemetry("NodeCoreCapacity", capacityInfo["cpu"] , properties) ApplicationInsightsUtility.sendMetricTelemetry("NodeMemory", capacityInfo["memory"] , properties) From 7e52e8c5553bda70dd33a4afccbcb134657b42be Mon Sep 
17 00:00:00 2001 From: rashmichandrashekar Date: Mon, 7 Jan 2019 15:44:25 -0800 Subject: [PATCH 048/160] Exclude docker containers from container inventory (#181) * containernodeinventory changes * changes for containernodeinventory * changes to add node telemetry * pod telemetry cahnges * updated telemetry changes * changes to get uid of owner references as controller id * updating socket to the new mount location * Adding exception telemetry and heartbeat * changes to fix controller type * Fixing typo * fixing method signature * updating plugins to get controller type from env * fixing bugs * changes to fixed type * removing comments * changes for fixed type * adding kubelet version as a dimension * Excluding raw docker containers from container inventory * making labels key case insensitive * make poduid label case insensitive --- source/code/plugin/DockerApiClient.rb | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/source/code/plugin/DockerApiClient.rb b/source/code/plugin/DockerApiClient.rb index 903256f6d..d04bf0589 100644 --- a/source/code/plugin/DockerApiClient.rb +++ b/source/code/plugin/DockerApiClient.rb @@ -86,7 +86,15 @@ def listContainers() containers = getResponse(request, true, false) if !containers.nil? && !containers.empty? containers.each do |container| - ids.push(container['Id']) + labels = (!container['Labels'].nil?)? container['Labels'] : container['labels'] + if !labels.nil? + labelKeys = labels.keys + #Case insensitive lookup for pod uid label + keyValue = labelKeys.find {|k| 'io.kubernetes.pod.uid'.downcase == k.downcase} + if !labels[keyValue].nil? 
+ ids.push(container['Id']) + end + end end end return ids From f0591f9e70056c61269f3a961906a908845a1cdd Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Tue, 8 Jan 2019 15:10:41 -0800 Subject: [PATCH 049/160] Exclude pauseamd64 containers from container inventory (#182) * containernodeinventory changes * changes for containernodeinventory * changes to add node telemetry * pod telemetry cahnges * updated telemetry changes * changes to get uid of owner references as controller id * updating socket to the new mount location * Adding exception telemetry and heartbeat * changes to fix controller type * Fixing typo * fixing method signature * updating plugins to get controller type from env * fixing bugs * changes to fixed type * removing comments * changes for fixed type * adding kubelet version as a dimension * Excluding raw docker containers from container inventory * making labels key case insensitive * make poduid label case insensitive * changes to exclude pause amd 64 containers --- source/code/plugin/DockerApiClient.rb | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/source/code/plugin/DockerApiClient.rb b/source/code/plugin/DockerApiClient.rb index d04bf0589..5a46b5fdb 100644 --- a/source/code/plugin/DockerApiClient.rb +++ b/source/code/plugin/DockerApiClient.rb @@ -89,10 +89,18 @@ def listContainers() labels = (!container['Labels'].nil?)? container['Labels'] : container['labels'] if !labels.nil? labelKeys = labels.keys - #Case insensitive lookup for pod uid label - keyValue = labelKeys.find {|k| 'io.kubernetes.pod.uid'.downcase == k.downcase} - if !labels[keyValue].nil? - ids.push(container['Id']) + dockerTypeLabel = labelKeys.find {|k| 'io.kubernetes.docker.type'.downcase == k.downcase} + if !dockerTypeLabel.nil? 
+ dockerTypeLabelValue = labels[dockerTypeLabel] + # Checking for 'io.kubernetes.docker.type' label for docker containers to exclude the pause-amd64 containers + if !(dockerTypeLabelValue.downcase == "podsandbox".downcase) + # Case insensitive lookup for pod uid label - This is to exclude containers created using docker run and only include containers that + # are created in the pods for ContainerInventory + keyValue = labelKeys.find {|k| 'io.kubernetes.pod.uid'.downcase == k.downcase} + if !labels[keyValue].nil? + ids.push(container['Id']) + end + end end end end From 4782435a228c3626b25d8bf1682a0d977e79eb23 Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Wed, 9 Jan 2019 11:22:53 -0800 Subject: [PATCH 050/160] Update agent version --- installer/conf/td-agent-bit.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/installer/conf/td-agent-bit.conf b/installer/conf/td-agent-bit.conf index b6b9bcc44..29c98bdf1 100644 --- a/installer/conf/td-agent-bit.conf +++ b/installer/conf/td-agent-bit.conf @@ -28,5 +28,5 @@ EnableTelemetry true TelemetryPushIntervalSeconds 300 Match oms.container.log.* - AgentVersion ciprod11292018 + AgentVersion ciprod01092019 From 23bcc4198c3ead32fb0404afeaddac83b3c23b78 Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Wed, 9 Jan 2019 13:19:06 -0800 Subject: [PATCH 051/160] Updating readme for the latest release --- README.md | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index fc43d6605..5c65308fb 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,29 @@ additional questions or comments. 
## Release History Note : The agent version(s) below has dates (ciprod), which indicate the agent build dates (not release dates) - + +### 10/09/2018 - Version microsoft/oms:ciprod01092019 +- Omsagent - 1.8.1.256 (nov 2018 release) +- Persist fluentbit state between container restarts +- Populate 'TimeOfCommand' for agent ingest time for container logs +- Get node cpu usage from cpuusagenanoseconds (and convert to cpuusgaenanocores) +- Container Node Inventory - move to fluentD from OMI +- Mount docker.sock (Daemon set) as /var/run/host +- Liveness probe (Daemon set) - check for omsagent user permissions in docker.sock and update as necessary (required when docker daemon gets restarted) +- Move to fixed type for kubeevents & kubeservices +- Disable collecting ENV for our oms agent container (daemonset & replicaset) +- Disable container inventory collection for 'sandbox' containers & non kubernetes managed containers +- Agent telemetry - ContainerLogsAgentSideLatencyMs +- Agent telemetry - PodCount +- Agent telemetry - ControllerCount +- Agent telemetry - K8S Version +- Agent telemetry - NodeCoreCapacity +- Agent telemetry - NodeMemoryCapacity +- Agent telemetry - KubeEvents (exceptions) +- Agent telemetry - Kubenodes (exceptions) +- Agent telemetry - kubepods (exceptions) +- Agent telemetry - kubeservices (exceptions) +- Agent telemetry - Daemonset , Replicaset as dimensions (bug fix) ### 11/29/2018 - Version microsoft/oms:ciprod11292018 - Disable Container Image inventory workflow From 51d5e938d436584bc094d72361d8652dd51db8bd Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Fri, 11 Jan 2019 13:08:56 -0800 Subject: [PATCH 052/160] Fix indentation in kube.conf and update readme (#184) * containernodeinventory changes * changes for containernodeinventory * changes to add node telemetry * pod telemetry cahnges * updated telemetry changes * changes to get uid of owner references as controller id * updating socket to the new mount location * Adding exception 
telemetry and heartbeat * changes to fix controller type * Fixing typo * fixing method signature * updating plugins to get controller type from env * fixing bugs * changes to fixed type * removing comments * changes for fixed type * adding kubelet version as a dimension * Excluding raw docker containers from container inventory * making labels key case insensitive * make poduid label case insensitive * changes to exclude pause amd 64 containers * fixing indentation so that kube.conf contents can be used in config map in the yaml * updating readme to fix date and agent version --- README.md | 6 +- installer/conf/kube.conf | 270 +++++++++++++++++++-------------------- 2 files changed, 138 insertions(+), 138 deletions(-) diff --git a/README.md b/README.md index 5c65308fb..dd55f810e 100644 --- a/README.md +++ b/README.md @@ -11,21 +11,21 @@ additional questions or comments. Note : The agent version(s) below has dates (ciprod), which indicate the agent build dates (not release dates) -### 10/09/2018 - Version microsoft/oms:ciprod01092019 +### 01/09/2018 - Version microsoft/oms:ciprod01092019-2 - Omsagent - 1.8.1.256 (nov 2018 release) - Persist fluentbit state between container restarts - Populate 'TimeOfCommand' for agent ingest time for container logs - Get node cpu usage from cpuusagenanoseconds (and convert to cpuusgaenanocores) - Container Node Inventory - move to fluentD from OMI - Mount docker.sock (Daemon set) as /var/run/host -- Liveness probe (Daemon set) - check for omsagent user permissions in docker.sock and update as necessary (required when docker daemon gets restarted) +- Add omsagent user to docker group - Move to fixed type for kubeevents & kubeservices - Disable collecting ENV for our oms agent container (daemonset & replicaset) - Disable container inventory collection for 'sandbox' containers & non kubernetes managed containers - Agent telemetry - ContainerLogsAgentSideLatencyMs - Agent telemetry - PodCount - Agent telemetry - ControllerCount -- 
Agent telemetry - K8S Version +- Agent telemetry - K8S Version - Agent telemetry - NodeCoreCapacity - Agent telemetry - NodeMemoryCapacity - Agent telemetry - KubeEvents (exceptions) diff --git a/installer/conf/kube.conf b/installer/conf/kube.conf index 6331d257e..164865022 100644 --- a/installer/conf/kube.conf +++ b/installer/conf/kube.conf @@ -1,148 +1,148 @@ -# Fluentd config file for OMS Docker - cluster components (kubeAPI) + # Fluentd config file for OMS Docker - cluster components (kubeAPI) -#Kubernetes pod inventory - - type kubepodinventory - tag oms.containerinsights.KubePodInventory - run_interval 60s - log_level debug - + #Kubernetes pod inventory + + type kubepodinventory + tag oms.containerinsights.KubePodInventory + run_interval 60s + log_level debug + -#Kubernetes events - - type kubeevents - tag oms.containerinsights.KubeEvents - run_interval 60s - log_level debug - + #Kubernetes events + + type kubeevents + tag oms.containerinsights.KubeEvents + run_interval 60s + log_level debug + -#Kubernetes logs - - type kubelogs - tag oms.api.KubeLogs - run_interval 60s - + #Kubernetes logs + + type kubelogs + tag oms.api.KubeLogs + run_interval 60s + -#Kubernetes services - - type kubeservices - tag oms.containerinsights.KubeServices - run_interval 60s - log_level debug - + #Kubernetes services + + type kubeservices + tag oms.containerinsights.KubeServices + run_interval 60s + log_level debug + -#Kubernetes Nodes - - type kubenodeinventory - tag oms.containerinsights.KubeNodeInventory - run_interval 60s - log_level debug - + #Kubernetes Nodes + + type kubenodeinventory + tag oms.containerinsights.KubeNodeInventory + run_interval 60s + log_level debug + -#Kubernetes perf - - type kubeperf - tag oms.api.KubePerf - run_interval 60s - log_level debug - + #Kubernetes perf + + type kubeperf + tag oms.api.KubePerf + run_interval 60s + log_level debug + - - type out_oms - log_level debug - num_threads 5 - buffer_chunk_limit 20m - buffer_type file - buffer_path 
%STATE_DIR_WS%/out_oms_kubepods*.buffer - buffer_queue_limit 20 - buffer_queue_full_action drop_oldest_chunk - flush_interval 20s - retry_limit 10 - retry_wait 30s - max_retry_wait 9m - + + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_kubepods*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + max_retry_wait 9m + - - type out_oms - log_level debug - num_threads 5 - buffer_chunk_limit 5m - buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_kubeevents*.buffer - buffer_queue_limit 10 - buffer_queue_full_action drop_oldest_chunk - flush_interval 20s - retry_limit 10 - retry_wait 30s - max_retry_wait 9m - + + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 5m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_kubeevents*.buffer + buffer_queue_limit 10 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + max_retry_wait 9m + - - type out_oms_api - log_level debug - buffer_chunk_limit 10m - buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_api_kubernetes_logs*.buffer - buffer_queue_limit 10 - flush_interval 20s - retry_limit 10 - retry_wait 30s - + + type out_oms_api + log_level debug + buffer_chunk_limit 10m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_api_kubernetes_logs*.buffer + buffer_queue_limit 10 + flush_interval 20s + retry_limit 10 + retry_wait 30s + - - type out_oms - log_level debug - num_threads 5 - buffer_chunk_limit 20m - buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_kubeservices*.buffer - buffer_queue_limit 20 - buffer_queue_full_action drop_oldest_chunk - flush_interval 20s - retry_limit 10 - retry_wait 30s - max_retry_wait 9m - + + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_kubeservices*.buffer + 
buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + max_retry_wait 9m + - - type out_oms - log_level debug - num_threads 5 - buffer_chunk_limit 20m - buffer_type file - buffer_path %STATE_DIR_WS%/state/out_oms_kubenodes*.buffer - buffer_queue_limit 20 - buffer_queue_full_action drop_oldest_chunk - flush_interval 20s - retry_limit 10 - retry_wait 30s - max_retry_wait 9m - + + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/state/out_oms_kubenodes*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + max_retry_wait 9m + - - type out_oms_api - log_level debug - buffer_chunk_limit 20m - buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_containernodeinventory*.buffer - buffer_queue_limit 20 - flush_interval 20s - retry_limit 10 - retry_wait 15s - max_retry_wait 9m - + + type out_oms_api + log_level debug + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_containernodeinventory*.buffer + buffer_queue_limit 20 + flush_interval 20s + retry_limit 10 + retry_wait 15s + max_retry_wait 9m + - - type out_oms - log_level debug - num_threads 5 - buffer_chunk_limit 20m - buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_kubeperf*.buffer - buffer_queue_limit 20 - buffer_queue_full_action drop_oldest_chunk - flush_interval 20s - retry_limit 10 - retry_wait 30s - max_retry_wait 9m - + + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_kubeperf*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + max_retry_wait 9m + \ No newline at end of file From decf86a3d24dece047ea4b780d10c799fbe1a1ce Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Fri, 11 Jan 2019 13:16:21 
-0800 Subject: [PATCH 053/160] updating agent tag --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index dd55f810e..099a065e8 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ additional questions or comments. Note : The agent version(s) below has dates (ciprod), which indicate the agent build dates (not release dates) -### 01/09/2018 - Version microsoft/oms:ciprod01092019-2 +### 01/09/2018 - Version microsoft/oms:ciprod01092019 - Omsagent - 1.8.1.256 (nov 2018 release) - Persist fluentbit state between container restarts - Populate 'TimeOfCommand' for agent ingest time for container logs From a1b35db565c9cc324733534b90e3c4f5a98651d7 Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Tue, 29 Jan 2019 15:33:59 -0800 Subject: [PATCH 054/160] Get Pods for current Node Only (#185) * Fix KubeAPI Calls to filter to get pods for current node * Reinstate log line --- source/code/go/src/plugins/oms.go | 48 ++++++++++++++++--------------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/source/code/go/src/plugins/oms.go b/source/code/go/src/plugins/oms.go index 5d9269d1e..49e91f87f 100644 --- a/source/code/go/src/plugins/oms.go +++ b/source/code/go/src/plugins/oms.go @@ -77,15 +77,15 @@ var ( // DataItem represents the object corresponding to the json that is sent by fluentbit tail plugin type DataItem struct { - LogEntry string `json:"LogEntry"` - LogEntrySource string `json:"LogEntrySource"` - LogEntryTimeStamp string `json:"LogEntryTimeStamp"` - LogEntryTimeOfCommand string `json:"TimeOfCommand"` - ID string `json:"Id"` - Image string `json:"Image"` - Name string `json:"Name"` - SourceSystem string `json:"SourceSystem"` - Computer string `json:"Computer"` + LogEntry string `json:"LogEntry"` + LogEntrySource string `json:"LogEntrySource"` + LogEntryTimeStamp string `json:"LogEntryTimeStamp"` + LogEntryTimeOfCommand string `json:"TimeOfCommand"` + ID string `json:"Id"` + Image string 
`json:"Image"` + Name string `json:"Name"` + SourceSystem string `json:"SourceSystem"` + Computer string `json:"Computer"` } // ContainerLogBlob represents the object corresponding to the payload that is sent to the ODS end point @@ -137,7 +137,10 @@ func updateContainerImageNameMaps() { _imageIDMap := make(map[string]string) _nameIDMap := make(map[string]string) - pods, err := ClientSet.CoreV1().Pods("").List(metav1.ListOptions{}) + listOptions := metav1.ListOptions{} + listOptions.FieldSelector = fmt.Sprintf("spec.nodeName=%s", Computer) + pods, err := ClientSet.CoreV1().Pods("").List(listOptions) + if err != nil { message := fmt.Sprintf("Error getting pods %s\nIt is ok to log here and continue, because the logs will be missing image and Name, but the logs will still have the containerID", err.Error()) Log(message) @@ -244,31 +247,30 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { if val, ok := imageIDMap[containerID]; ok { stringMap["Image"] = val } else { - Log("ContainerId %s not present in Map ", containerID) + Log("ContainerId %s not present in Name Map ", containerID) } if val, ok := nameIDMap[containerID]; ok { stringMap["Name"] = val } else { - Log("ContainerId %s not present in Map ", containerID) + Log("ContainerId %s not present in Image Map ", containerID) } - dataItem := DataItem{ - ID: stringMap["Id"], - LogEntry: stringMap["LogEntry"], - LogEntrySource: stringMap["LogEntrySource"], - LogEntryTimeStamp: stringMap["LogEntryTimeStamp"], - LogEntryTimeOfCommand: start.Format(time.RFC3339), - SourceSystem: stringMap["SourceSystem"], - Computer: Computer, - Image: stringMap["Image"], - Name: stringMap["Name"], + ID: stringMap["Id"], + LogEntry: stringMap["LogEntry"], + LogEntrySource: stringMap["LogEntrySource"], + LogEntryTimeStamp: stringMap["LogEntryTimeStamp"], + LogEntryTimeOfCommand: start.Format(time.RFC3339), + SourceSystem: stringMap["SourceSystem"], + Computer: Computer, + Image: stringMap["Image"], + Name: 
stringMap["Name"], } dataItems = append(dataItems, dataItem) loggedTime, e := time.Parse(time.RFC3339, dataItem.LogEntryTimeStamp) - if e!= nil { + if e != nil { message := fmt.Sprintf("Error while converting LogEntryTimeStamp for telemetry purposes: %s", e.Error()) Log(message) SendException(message) From 22649bad0090c05eb809f0521d9222b514084b9b Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Wed, 30 Jan 2019 15:50:28 -0800 Subject: [PATCH 055/160] changes for container node inventory fixed type (#186) --- installer/conf/kube.conf | 4 ++-- source/code/plugin/in_kube_nodes.rb | 9 +++++++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/installer/conf/kube.conf b/installer/conf/kube.conf index 164865022..d0ef0517d 100644 --- a/installer/conf/kube.conf +++ b/installer/conf/kube.conf @@ -119,8 +119,8 @@ max_retry_wait 9m - - type out_oms_api + + type out_oms log_level debug buffer_chunk_limit 20m buffer_type file diff --git a/source/code/plugin/in_kube_nodes.rb b/source/code/plugin/in_kube_nodes.rb index a6908fc99..2e48e3f1f 100644 --- a/source/code/plugin/in_kube_nodes.rb +++ b/source/code/plugin/in_kube_nodes.rb @@ -6,7 +6,7 @@ module Fluent class Kube_nodeInventory_Input < Input Plugin.register_input('kubenodeinventory', self) - @@ContainerNodeInventoryTag = 'oms.api.ContainerNodeInventory' + @@ContainerNodeInventoryTag = 'oms.containerinsights.ContainerNodeInventory' def initialize super @@ -109,7 +109,12 @@ def enumerate dockerVersion.slice! "docker://" containerNodeInventoryRecord['DockerVersion'] = dockerVersion # ContainerNodeInventory data for docker version and operating system. 
- containerNodeInventoryEventStream.add(emitTime, containerNodeInventoryRecord) if containerNodeInventoryRecord + containerNodeInventoryWrapper = { + "DataType"=>"CONTAINER_NODE_INVENTORY_BLOB", + "IPName"=>"ContainerInsights", + "DataItems"=>[containerNodeInventoryRecord.each{|k,v| containerNodeInventoryRecord[k]=v}] + } + containerNodeInventoryEventStream.add(emitTime, containerNodeInventoryWrapper) if containerNodeInventoryWrapper wrapper = { "DataType"=>"KUBE_NODE_INVENTORY_BLOB", From 61e2eaffe3e60b51d83459a494435f3dd6002821 Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Wed, 13 Feb 2019 11:38:07 -0800 Subject: [PATCH 056/160] Fix for mooncake (disable telemetry optionally) (#191) * disable telemetry option * fix a typo --- source/code/go/src/plugins/telemetry.go | 5 +++++ source/code/plugin/ApplicationInsightsUtility.rb | 9 ++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/source/code/go/src/plugins/telemetry.go b/source/code/go/src/plugins/telemetry.go index 82f970d3a..a64ca2218 100644 --- a/source/code/go/src/plugins/telemetry.go +++ b/source/code/go/src/plugins/telemetry.go @@ -120,6 +120,11 @@ func InitializeTelemetryClient(agentVersion string) (int, error) { } TelemetryClient = appinsights.NewTelemetryClient(string(decIkey)) + telemetryOffSwitch := os.Getenv("DISABLE_TELEMETRY") + if strings.Compare(strings.ToLower(telemetryOffSwitch), "true") == 0 { + Log("Appinsights telemetry is disabled \n") + TelemetryClient.SetIsEnabled(false) + } CommonProperties = make(map[string]string) CommonProperties["Computer"] = Computer diff --git a/source/code/plugin/ApplicationInsightsUtility.rb b/source/code/plugin/ApplicationInsightsUtility.rb index 27660d708..683be0db4 100644 --- a/source/code/plugin/ApplicationInsightsUtility.rb +++ b/source/code/plugin/ApplicationInsightsUtility.rb @@ -61,9 +61,16 @@ def initializeUtility() @@CustomProperties['AgentVersion'] = ENV[@@EnvAgentVersion] @@CustomProperties['ControllerType'] = 
ENV[@@EnvControllerType] encodedAppInsightsKey = ENV[@@EnvApplicationInsightsKey] - if !encodedAppInsightsKey.nil? + + #Check if telemetry is turned off + telemetryOffSwitch = ENV['DISABLE_TELEMETRY'] + if telemetryOffSwitch && !telemetryOffSwitch.nil? && !telemetryOffSwitch.empty? && telemetryOffSwitch.downcase == "true".downcase + $log.warn("AppInsightsUtility: Telemetry is disabled") + @@Tc = ApplicationInsights::TelemetryClient.new + elsif !encodedAppInsightsKey.nil? decodedAppInsightsKey = Base64.decode64(encodedAppInsightsKey) @@Tc = ApplicationInsights::TelemetryClient.new decodedAppInsightsKey + end rescue => errorStr $log.warn("Exception in AppInsightsUtility: initilizeUtility - error: #{errorStr}") From 30dff41106981b9855a89db9227ef9fccbea0158 Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Fri, 15 Feb 2019 14:27:33 -0800 Subject: [PATCH 057/160] CustomMetrics to ci_feature (#193) Custom Metrics changes to ci_feature --- installer/conf/container.conf | 24 ++ installer/conf/kube.conf | 25 +- installer/datafiles/base_container.data | 14 + source/code/go/src/plugins/oms.go | 2 +- .../code/plugin/ApplicationInsightsUtility.rb | 19 +- source/code/plugin/CustomMetricsUtils.rb | 26 ++ source/code/plugin/filter_cadvisor2mdm.rb | 215 ++++++++++++++++ source/code/plugin/filter_inventory2mdm.rb | 235 +++++++++++++++++ source/code/plugin/in_cadvisor_perf.rb | 2 + source/code/plugin/in_kube_nodes.rb | 2 + source/code/plugin/in_kube_podinventory.rb | 3 + source/code/plugin/out_mdm.rb | 239 ++++++++++++++++++ 12 files changed, 802 insertions(+), 4 deletions(-) create mode 100644 source/code/plugin/CustomMetricsUtils.rb create mode 100644 source/code/plugin/filter_cadvisor2mdm.rb create mode 100644 source/code/plugin/filter_inventory2mdm.rb create mode 100644 source/code/plugin/out_mdm.rb diff --git a/installer/conf/container.conf b/installer/conf/container.conf index 091753230..f41bd6f98 100755 --- a/installer/conf/container.conf +++ 
b/installer/conf/container.conf @@ -23,6 +23,14 @@ log_level debug +#custom_metrics_mdm filter plugin + + type filter_cadvisor2mdm + custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westEurope + metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,memoryRssBytes + log_level info + + type out_oms log_level debug @@ -52,3 +60,19 @@ retry_wait 30s max_retry_wait 9m + + + type out_mdm + log_level debug + num_threads 5 + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/out_mdm_cdvisorperf*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + max_retry_wait 9m + retry_mdm_post_wait_minutes 60 + diff --git a/installer/conf/kube.conf b/installer/conf/kube.conf index d0ef0517d..50a88295e 100644 --- a/installer/conf/kube.conf +++ b/installer/conf/kube.conf @@ -47,6 +47,12 @@ log_level debug + + type filter_inventory2mdm + custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westEurope + log_level info + + type out_oms log_level debug @@ -145,4 +151,21 @@ retry_limit 10 retry_wait 30s max_retry_wait 9m - \ No newline at end of file + + + + type out_mdm + log_level debug + num_threads 5 + buffer_chunk_limit 20m + buffer_type file + buffer_path /var/opt/microsoft/omsagent/6bb1e963-b08c-43a8-b708-1628305e964a/state/out_mdm_*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + max_retry_wait 9m + retry_mdm_post_wait_minutes 60 + + diff --git a/installer/datafiles/base_container.data b/installer/datafiles/base_container.data index 7181929e2..c263aa505 100644 --- a/installer/datafiles/base_container.data +++ b/installer/datafiles/base_container.data @@ -36,6 +36,9 @@ MAINTAINER: 'Microsoft Corporation' /opt/microsoft/omsagent/plugin/in_cadvisor_perf.rb; source/code/plugin/in_cadvisor_perf.rb; 644; 
root; root /opt/microsoft/omsagent/plugin/in_kube_services.rb; source/code/plugin/in_kube_services.rb; 644; root; root /opt/microsoft/omsagent/plugin/in_kube_nodes.rb; source/code/plugin/in_kube_nodes.rb; 644; root; root +/opt/microsoft/omsagent/plugin/filter_inventory2mdm.rb; source/code/plugin/filter_inventory2mdm.rb; 644; root; root +/opt/microsoft/omsagent/plugin/CustomMetricsUtils.rb; source/code/plugin/CustomMetricsUtils.rb; 644; root; root + /opt/microsoft/omsagent/plugin/ApplicationInsightsUtility.rb; source/code/plugin/ApplicationInsightsUtility.rb; 644; root; root /opt/microsoft/omsagent/plugin/ContainerInventoryState.rb; source/code/plugin/ContainerInventoryState.rb; 644; root; root @@ -43,6 +46,9 @@ MAINTAINER: 'Microsoft Corporation' /opt/microsoft/omsagent/plugin/DockerApiRestHelper.rb; source/code/plugin/DockerApiRestHelper.rb; 644; root; root /opt/microsoft/omsagent/plugin/in_containerinventory.rb; source/code/plugin/in_containerinventory.rb; 644; root; root +/opt/microsoft/omsagent/plugin/out_mdm.rb; source/code/plugin/out_mdm.rb; 644; root; root +/opt/microsoft/omsagent/plugin/filter_cadvisor2mdm.rb; source/code/plugin/filter_cadvisor2mdm.rb; 644; root; root + /opt/microsoft/omsagent/plugin/lib/application_insights/version.rb; source/code/plugin/lib/application_insights/version.rb; 644; root; root /opt/microsoft/omsagent/plugin/lib/application_insights/rack/track_request.rb; source/code/plugin/lib/application_insights/rack/track_request.rb; 644; root; root /opt/microsoft/omsagent/plugin/lib/application_insights/unhandled_exception.rb; source/code/plugin/lib/application_insights/unhandled_exception.rb; 644; root; root @@ -170,6 +176,14 @@ touch /var/opt/microsoft/docker-cimprov/log/kubernetes_perf_log.txt chmod 666 /var/opt/microsoft/docker-cimprov/log/kubernetes_perf_log.txt chown omsagent:omiusers /var/opt/microsoft/docker-cimprov/log/kubernetes_perf_log.txt +touch /var/opt/microsoft/docker-cimprov/log/filter_cadvisor2mdm.log +chmod 666 
/var/opt/microsoft/docker-cimprov/log/filter_cadvisor2mdm.log +chown omsagent:omiusers /var/opt/microsoft/docker-cimprov/log/filter_cadvisor2mdm.log + +touch /var/opt/microsoft/docker-cimprov/log/filter_inventory2mdm.log +chmod 666 /var/opt/microsoft/docker-cimprov/log/filter_inventory2mdm.log +chown omsagent:omiusers /var/opt/microsoft/docker-cimprov/log/filter_inventory2mdm.log + mv /etc/opt/microsoft/docker-cimprov/container.conf /etc/opt/microsoft/omsagent/sysconf/omsagent.d/container.conf chown omsagent:omsagent /etc/opt/microsoft/omsagent/sysconf/omsagent.d/container.conf diff --git a/source/code/go/src/plugins/oms.go b/source/code/go/src/plugins/oms.go index 49e91f87f..27ae6df5c 100644 --- a/source/code/go/src/plugins/oms.go +++ b/source/code/go/src/plugins/oms.go @@ -140,7 +140,7 @@ func updateContainerImageNameMaps() { listOptions := metav1.ListOptions{} listOptions.FieldSelector = fmt.Sprintf("spec.nodeName=%s", Computer) pods, err := ClientSet.CoreV1().Pods("").List(listOptions) - + if err != nil { message := fmt.Sprintf("Error getting pods %s\nIt is ok to log here and continue, because the logs will be missing image and Name, but the logs will still have the containerID", err.Error()) Log(message) diff --git a/source/code/plugin/ApplicationInsightsUtility.rb b/source/code/plugin/ApplicationInsightsUtility.rb index 683be0db4..5c5e92a6c 100644 --- a/source/code/plugin/ApplicationInsightsUtility.rb +++ b/source/code/plugin/ApplicationInsightsUtility.rb @@ -98,7 +98,7 @@ def sendHeartBeatEvent(pluginName) end end - def sendCustomMetric(pluginName, properties) + def sendLastProcessedContainerInventoryCountMetric(pluginName, properties) begin if !(@@Tc.nil?) @@Tc.track_metric 'LastProcessedContainerInventoryCount', properties['ContainerCount'], @@ -112,6 +112,21 @@ def sendCustomMetric(pluginName, properties) end end + def sendCustomEvent(eventName, properties) + begin + if @@CustomProperties.empty? || @@CustomProperties.nil? 
+ initializeUtility() + end + if !(@@Tc.nil?) + @@Tc.track_event eventName, :properties => @@CustomProperties + @@Tc.flush + $log.info("AppInsights Custom Event #{eventName} sent successfully") + end + rescue => errorStr + $log.warn("Exception in AppInsightsUtility: sendCustomEvent - error: #{errorStr}") + end + end + def sendExceptionTelemetry(errorStr) begin if @@CustomProperties.empty? || @@CustomProperties.nil? @@ -139,7 +154,7 @@ def sendTelemetry(pluginName, properties) end @@CustomProperties['Computer'] = properties['Computer'] sendHeartBeatEvent(pluginName) - sendCustomMetric(pluginName, properties) + sendLastProcessedContainerInventoryCountMetric(pluginName, properties) rescue => errorStr $log.warn("Exception in AppInsightsUtility: sendTelemetry - error: #{errorStr}") end diff --git a/source/code/plugin/CustomMetricsUtils.rb b/source/code/plugin/CustomMetricsUtils.rb new file mode 100644 index 000000000..d06c9ad91 --- /dev/null +++ b/source/code/plugin/CustomMetricsUtils.rb @@ -0,0 +1,26 @@ +#!/usr/local/bin/ruby +# frozen_string_literal: true + +class CustomMetricsUtils + def initialize + end + + class << self + def check_custom_metrics_availability(custom_metric_regions) + aks_region = ENV['AKS_REGION'] + aks_resource_id = ENV['AKS_RESOURCE_ID'] + if aks_region.to_s.empty? && aks_resource_id.to_s.empty? + false # This will also take care of AKS-Engine Scenario. AKS_REGION/AKS_RESOURCE_ID is not set for AKS-Engine. 
Only ACS_RESOURCE_NAME is set + end + + custom_metrics_regions_arr = custom_metric_regions.split(',') + custom_metrics_regions_hash = custom_metrics_regions_arr.map {|x| [x.downcase,true]}.to_h + + if custom_metrics_regions_hash.key?(aks_region.downcase) + true + else + false + end + end + end +end \ No newline at end of file diff --git a/source/code/plugin/filter_cadvisor2mdm.rb b/source/code/plugin/filter_cadvisor2mdm.rb new file mode 100644 index 000000000..85f9f688e --- /dev/null +++ b/source/code/plugin/filter_cadvisor2mdm.rb @@ -0,0 +1,215 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. + +# frozen_string_literal: true + +module Fluent + require 'logger' + require 'json' + require_relative 'oms_common' + require_relative 'CustomMetricsUtils' + + class CAdvisor2MdmFilter < Filter + Fluent::Plugin.register_filter('filter_cadvisor2mdm', self) + + config_param :enable_log, :integer, :default => 0 + config_param :log_path, :string, :default => '/var/opt/microsoft/docker-cimprov/log/filter_cadvisor2mdm.log' + config_param :custom_metrics_azure_regions, :string + config_param :metrics_to_collect, :string, :default => 'cpuUsageNanoCores,memoryWorkingSetBytes,memoryRssBytes' + + @@cpu_usage_milli_cores = 'cpuUsageMillicores' + @@cpu_usage_nano_cores = 'cpuusagenanocores' + @@object_name_k8s_node = 'K8SNode' + @@hostName = (OMS::Common.get_hostname) + @@custom_metrics_template = ' + { + "time": "%{timestamp}", + "data": { + "baseData": { + "metric": "%{metricName}", + "namespace": "Insights.Container/nodes", + "dimNames": [ + "host" + ], + "series": [ + { + "dimValues": [ + "%{hostvalue}" + ], + "min": %{metricminvalue}, + "max": %{metricmaxvalue}, + "sum": %{metricsumvalue}, + "count": 1 + } + ] + } + } + }' + + @@metric_name_metric_percentage_name_hash = { + @@cpu_usage_milli_cores => "cpuUsagePercentage", + "memoryRssBytes" => "memoryRssPercentage", + "memoryWorkingSetBytes" => "memoryWorkingSetPercentage" + } + + @process_incoming_stream = true + 
@metrics_to_collect_hash = {} + + def initialize + super + end + + def configure(conf) + super + @log = nil + + if @enable_log + @log = Logger.new(@log_path, 'weekly') + @log.debug {'Starting filter_cadvisor2mdm plugin'} + end + end + + def start + super + @process_incoming_stream = CustomMetricsUtils.check_custom_metrics_availability(@custom_metrics_azure_regions) + @metrics_to_collect_hash = build_metrics_hash + @log.debug "After check_custom_metrics_availability process_incoming_stream #{@process_incoming_stream}" + + # initialize cpu and memory limit + if @process_incoming_stream + @cpu_capacity = 0.0 + @memory_capacity = 0.0 + ensure_cpu_memory_capacity_set + end + end + + def build_metrics_hash + @log.debug "Building Hash of Metrics to Collect" + metrics_to_collect_arr = @metrics_to_collect.split(',').map(&:strip) + metrics_hash = metrics_to_collect_arr.map {|x| [x.downcase,true]}.to_h + @log.info "Metrics Collected : #{metrics_hash}" + return metrics_hash + end + + def shutdown + super + end + + def filter(tag, time, record) + begin + if @process_incoming_stream + object_name = record['DataItems'][0]['ObjectName'] + counter_name = record['DataItems'][0]['Collections'][0]['CounterName'] + if object_name == @@object_name_k8s_node && @metrics_to_collect_hash.key?(counter_name.downcase) + percentage_metric_value = 0.0 + + # Compute and send % CPU and Memory + metric_value = record['DataItems'][0]['Collections'][0]['Value'] + if counter_name.downcase == @@cpu_usage_nano_cores + metric_name = @@cpu_usage_milli_cores + metric_value = metric_value/1000000 + if @cpu_capacity != 0.0 + percentage_metric_value = (metric_value*1000000)*100/@cpu_capacity + end + end + + if counter_name.start_with?("memory") + metric_name = counter_name + if @memory_capacity != 0.0 + percentage_metric_value = metric_value*100/@memory_capacity + end + end + return get_metric_records(record, metric_name, metric_value, percentage_metric_value) + else + return [] + end + else + return [] + end 
+ rescue Exception => e + @log.info "Error processing cadvisor record Exception: #{e.class} Message: #{e.message}" + ApplicationInsightsUtility.sendExceptionTelemetry(e.backtrace) + return [] + end + end + + def ensure_cpu_memory_capacity_set + + @log.info "ensure_cpu_memory_capacity_set @cpu_capacity #{@cpu_capacity} @memory_capacity #{@memory_capacity}" + if @cpu_capacity != 0.0 && @memory_capacity != 0.0 + @log.info "CPU And Memory Capacity are already set" + return + end + + begin + nodeInventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo("nodes?fieldSelector=metadata.name%3D#{@@hostName}").body) + rescue Exception => e + @log.info "Error when getting nodeInventory from kube API. Exception: #{e.class} Message: #{e.message} " + ApplicationInsightsUtility.sendExceptionTelemetry(e.backtrace) + end + if !nodeInventory.nil? + cpu_capacity_json = KubernetesApiClient.parseNodeLimits(nodeInventory, "capacity", "cpu", "cpuCapacityNanoCores") + if !cpu_capacity_json.nil? && !cpu_capacity_json[0]['DataItems'][0]['Collections'][0]['Value'].to_s.nil? + @cpu_capacity = cpu_capacity_json[0]['DataItems'][0]['Collections'][0]['Value'] + @log.info "CPU Limit #{@cpu_capacity}" + else + @log.info "Error getting cpu_capacity" + end + memory_capacity_json = KubernetesApiClient.parseNodeLimits(nodeInventory, "capacity", "memory", "memoryCapacityBytes") + if !memory_capacity_json.nil? && !memory_capacity_json[0]['DataItems'][0]['Collections'][0]['Value'].to_s.nil? 
+ @memory_capacity = memory_capacity_json[0]['DataItems'][0]['Collections'][0]['Value'] + @log.info "Memory Limit #{@memory_capacity}" + else + @log.info "Error getting memory_capacity" + end + end + end + + def get_metric_records(record, metric_name, metric_value, percentage_metric_value) + records = [] + custommetricrecord = @@custom_metrics_template % { + timestamp: record['DataItems'][0]['Timestamp'], + metricName: metric_name, + hostvalue: record['DataItems'][0]['Host'], + objectnamevalue: record['DataItems'][0]['ObjectName'], + instancenamevalue: record['DataItems'][0]['InstanceName'], + metricminvalue: metric_value, + metricmaxvalue: metric_value, + metricsumvalue: metric_value + } + records.push(JSON.parse(custommetricrecord)) + + if !percentage_metric_value.nil? + additional_record = @@custom_metrics_template % { + timestamp: record['DataItems'][0]['Timestamp'], + metricName: @@metric_name_metric_percentage_name_hash[metric_name], + hostvalue: record['DataItems'][0]['Host'], + objectnamevalue: record['DataItems'][0]['ObjectName'], + instancenamevalue: record['DataItems'][0]['InstanceName'], + metricminvalue: percentage_metric_value, + metricmaxvalue: percentage_metric_value, + metricsumvalue: percentage_metric_value + } + records.push(JSON.parse(additional_record)) + end + @log.info "Metric Name: #{metric_name} Metric Value: #{metric_value} Percentage Metric Value: #{percentage_metric_value}" + return records + end + + + def filter_stream(tag, es) + new_es = MultiEventStream.new + ensure_cpu_memory_capacity_set + es.each { |time, record| + begin + filtered_records = filter(tag, time, record) + filtered_records.each {|filtered_record| + new_es.add(time, filtered_record) if filtered_record + } if filtered_records + rescue => e + router.emit_error_event(tag, time, record, e) + end + } + new_es + end + end +end diff --git a/source/code/plugin/filter_inventory2mdm.rb b/source/code/plugin/filter_inventory2mdm.rb new file mode 100644 index 000000000..d9864bc1a 
--- /dev/null +++ b/source/code/plugin/filter_inventory2mdm.rb @@ -0,0 +1,235 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. + +# frozen_string_literal: true + +module Fluent + require 'logger' + require 'json' + require_relative 'oms_common' + require_relative 'CustomMetricsUtils' + + class Inventory2MdmFilter < Filter + Fluent::Plugin.register_filter('filter_inventory2mdm', self) + + config_param :enable_log, :integer, :default => 0 + config_param :log_path, :string, :default => '/var/opt/microsoft/docker-cimprov/log/filter_inventory2mdm.log' + config_param :custom_metrics_azure_regions, :string + + @@node_count_metric_name = 'nodesCount' + @@pod_count_metric_name = 'podCount' + @@pod_inventory_tag = 'mdm.kubepodinventory' + @@node_inventory_tag = 'mdm.kubenodeinventory' + @@node_status_ready = 'Ready' + @@node_status_not_ready = 'NotReady' + + @@node_inventory_custom_metrics_template = ' + { + "time": "%{timestamp}", + "data": { + "baseData": { + "metric": "%{metricName}", + "namespace": "insights.container/nodes", + "dimNames": [ + "status" + ], + "series": [ + { + "dimValues": [ + "%{statusValue}" + ], + "min": %{node_status_count}, + "max": %{node_status_count}, + "sum": %{node_status_count}, + "count": 1 + } + ] + } + } + }' + + @@pod_inventory_custom_metrics_template = ' + { + "time": "%{timestamp}", + "data": { + "baseData": { + "metric": "%{metricName}", + "namespace": "insights.container/pods", + "dimNames": [ + "phase", + "namespace", + "node", + "controllerName" + ], + "series": [ + { + "dimValues": [ + "%{phaseDimValue}", + "%{namespaceDimValue}", + "%{nodeDimValue}", + "%{controllerNameDimValue}" + ], + "min": %{podCountMetricValue}, + "max": %{podCountMetricValue}, + "sum": %{podCountMetricValue}, + "count": 1 + } + ] + } + } + }' + + @process_incoming_stream = true + + def initialize + super + end + + def configure(conf) + super + @log = nil + + if @enable_log + @log = Logger.new(@log_path, 'weekly') + @log.debug {'Starting 
filter_inventory2mdm plugin'} + end + end + + def start + super + @process_incoming_stream = CustomMetricsUtils.check_custom_metrics_availability(@custom_metrics_azure_regions) + @log.debug "After check_custom_metrics_availability process_incoming_stream #{@process_incoming_stream}" + end + + def shutdown + super + end + + def process_node_inventory_records(es) + timestamp = DateTime.now + + begin + node_ready_count = 0 + node_not_ready_count = 0 + records = [] + + es.each{|time,record| + begin + timestamp = record['DataItems'][0]['CollectionTime'] + node_status = record['DataItems'][0]['Status'] + if node_status.downcase == @@node_status_ready.downcase + node_ready_count = node_ready_count+1 + else + node_not_ready_count = node_not_ready_count + 1 + end + rescue => e + end + } + + ready_record = @@node_inventory_custom_metrics_template % { + timestamp: timestamp, + metricName: @@node_count_metric_name, + statusValue: @@node_status_ready, + node_status_count: node_ready_count + } + records.push(JSON.parse(ready_record)) + + not_ready_record = @@node_inventory_custom_metrics_template % { + timestamp: timestamp, + metricName: @@node_count_metric_name, + statusValue: @@node_status_not_ready, + node_status_count: node_not_ready_count + } + records.push(JSON.parse(not_ready_record)) + rescue Exception => e + @log.info "Error processing node inventory records Exception: #{e.class} Message: #{e.message}" + ApplicationInsightsUtility.sendExceptionTelemetry(e.backtrace) + return [],timestamp + end + return records,timestamp + end + + def process_pod_inventory_records(es) + timestamp = DateTime.now + pod_count_hash = Hash.new + + begin + records = [] + es.each{|time,record| + + timestamp = record['DataItems'][0]['CollectionTime'] + podPhaseDimValue = record['DataItems'][0]['PodStatus'] + podNamespaceDimValue = record['DataItems'][0]['Namespace'] + podControllerNameDimValue = record['DataItems'][0]['ControllerName'] + podNodeDimValue = record['DataItems'][0]['Computer'] + + # 
group by distinct dimension values + pod_key = [podNodeDimValue, podNamespaceDimValue, podControllerNameDimValue, podPhaseDimValue].join('~~') + + if pod_count_hash.key?(pod_key) + pod_count = pod_count_hash[pod_key] + pod_count = pod_count + 1 + pod_count_hash[pod_key] = pod_count + else + pod_count = 1 + pod_count_hash[pod_key] = pod_count + end + } + + pod_count_hash.each {|key, value| + + key_elements = key.split('~~') + if key_elements.length != 4 + next + end + + # get dimension values by key + podNodeDimValue = key_elements[0] + podNamespaceDimValue = key_elements[1] + podControllerNameDimValue = key_elements[2] + podPhaseDimValue = key_elements[3] + + record = @@pod_inventory_custom_metrics_template % { + timestamp: timestamp, + metricName: @@pod_count_metric_name, + phaseDimValue: podPhaseDimValue, + namespaceDimValue: podNamespaceDimValue, + nodeDimValue: podNodeDimValue, + controllerNameDimValue: podControllerNameDimValue, + podCountMetricValue: value + } + records.push(JSON.parse(record)) + } + rescue Exception => e + @log.info "Error processing pod inventory record Exception: #{e.class} Message: #{e.message}" + ApplicationInsightsUtility.sendExceptionTelemetry(e.backtrace) + return [],timestamp + end + return records, timestamp + end + + def filter_stream(tag, es) + new_es = MultiEventStream.new + filtered_records = [] + time = DateTime.now + begin + if @process_incoming_stream + @log.info 'Processing NODE inventory records in filter plugin to send to MDM' + if tag.downcase.start_with?(@@node_inventory_tag) + filtered_records, time = process_node_inventory_records(es) + elsif tag.downcase.start_with?(@@pod_inventory_tag) + @log.info 'Processing POD inventory records in filter plugin to send to MDM' + filtered_records, time = process_pod_inventory_records(es) + else + filtered_records = [] + end + end + filtered_records.each {|filtered_record| + new_es.add(time, filtered_record) if filtered_record + } if filtered_records + rescue => e + @log.info 
"Exception in filter_stream #{e}" + end + new_es + end + end +end diff --git a/source/code/plugin/in_cadvisor_perf.rb b/source/code/plugin/in_cadvisor_perf.rb index 5b551f74e..a857aa6b9 100644 --- a/source/code/plugin/in_cadvisor_perf.rb +++ b/source/code/plugin/in_cadvisor_perf.rb @@ -18,6 +18,7 @@ def initialize config_param :run_interval, :time, :default => '1m' config_param :tag, :string, :default => "oms.api.cadvisorperf" + config_param :mdmtag, :string, :default => "mdm.cadvisorperf" def configure (conf) super @@ -55,6 +56,7 @@ def enumerate() end router.emit_stream(@tag, eventStream) if eventStream + router.emit_stream(@mdmtag, eventStream) if eventStream @@istestvar = ENV['ISTEST'] if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp('true') == 0 && eventStream.count > 0) $log.info("cAdvisorPerfEmitStreamSuccess @ #{Time.now.utc.iso8601}") diff --git a/source/code/plugin/in_kube_nodes.rb b/source/code/plugin/in_kube_nodes.rb index 2e48e3f1f..ba1dacbe0 100644 --- a/source/code/plugin/in_kube_nodes.rb +++ b/source/code/plugin/in_kube_nodes.rb @@ -7,6 +7,7 @@ class Kube_nodeInventory_Input < Input Plugin.register_input('kubenodeinventory', self) @@ContainerNodeInventoryTag = 'oms.containerinsights.ContainerNodeInventory' + @@MDMKubeNodeInventoryTag = 'mdm.kubenodeinventory' def initialize super @@ -136,6 +137,7 @@ def enumerate end end router.emit_stream(@tag, eventStream) if eventStream + router.emit_stream(@@MDMKubeNodeInventoryTag, eventStream) if eventStream router.emit_stream(@@ContainerNodeInventoryTag, containerNodeInventoryEventStream) if containerNodeInventoryEventStream if telemetrySent == true @@nodeTelemetryTimeTracker = DateTime.now.to_time.to_i diff --git a/source/code/plugin/in_kube_podinventory.rb b/source/code/plugin/in_kube_podinventory.rb index eaf14b035..dee3df30b 100644 --- a/source/code/plugin/in_kube_podinventory.rb +++ b/source/code/plugin/in_kube_podinventory.rb @@ -6,6 +6,8 @@ module Fluent class Kube_PodInventory_Input 
< Input Plugin.register_input('kubepodinventory', self) + @@MDMKubePodInventoryTag = 'mdm.kubepodinventory' + def initialize super require 'yaml' @@ -208,6 +210,7 @@ def parse_and_emit_records(podInventory, serviceList) end end #podInventory block end router.emit_stream(@tag, eventStream) if eventStream + router.emit_stream(@@MDMKubePodInventoryTag, eventStream) if eventStream if telemetryFlush == true ApplicationInsightsUtility.sendHeartBeatEvent("KubePodInventory") ApplicationInsightsUtility.sendMetricTelemetry("PodCount", podInventory['items'].length , {}) diff --git a/source/code/plugin/out_mdm.rb b/source/code/plugin/out_mdm.rb new file mode 100644 index 000000000..2f36ea7d5 --- /dev/null +++ b/source/code/plugin/out_mdm.rb @@ -0,0 +1,239 @@ +module Fluent + + class OutputMDM < BufferedOutput + + config_param :retry_mdm_post_wait_minutes, :integer + + Plugin.register_output('out_mdm', self) + + def initialize + super + require 'net/http' + require 'net/https' + require 'uri' + require 'json' + require_relative 'KubernetesApiClient' + require_relative 'ApplicationInsightsUtility' + + @@token_resource_url = 'https://monitoring.azure.com/' + @@grant_type = 'client_credentials' + @@azure_json_path = '/etc/kubernetes/host/azure.json' + @@post_request_url_template = "https://%{aks_region}.monitoring.azure.com%{aks_resource_id}/metrics" + @@token_url_template = "https://login.microsoftonline.com/%{tenant_id}/oauth2/token" + @@plugin_name = "AKSCustomMetricsMDM" + + @data_hash = {} + @token_url = nil + @http_client = nil + @token_expiry_time = Time.now + @cached_access_token = String.new + @last_post_attempt_time = Time.now + @first_post_attempt_made = false + end + + def configure(conf) + s = conf.add_element("secondary") + s["type"] = ChunkErrorHandler::SecondaryName + super + end + + def start + super + file = File.read(@@azure_json_path) + # Handle the case where the file read fails. Send Telemetry and exit the plugin? 
+ @data_hash = JSON.parse(file) + @token_url = @@token_url_template % {tenant_id: @data_hash['tenantId']} + @cached_access_token = get_access_token + aks_resource_id = ENV['AKS_RESOURCE_ID'] + aks_region = ENV['AKS_REGION'] + if aks_resource_id.to_s.empty? + @log.info "Environment Variable AKS_RESOURCE_ID is not set.. " + raise Exception.new "Environment Variable AKS_RESOURCE_ID is not set!!" + end + if aks_region.to_s.empty? + @log.info "Environment Variable AKS_REGION is not set.. " + raise Exception.new "Environment Variable AKS_REGION is not set!!" + end + + @@post_request_url = @@post_request_url_template % {aks_region: aks_region, aks_resource_id: aks_resource_id} + @post_request_uri = URI.parse(@@post_request_url) + @http_client = Net::HTTP.new(@post_request_uri.host, @post_request_uri.port) + @http_client.use_ssl = true + @log.info "POST Request url: #{@@post_request_url}" + ApplicationInsightsUtility.sendCustomEvent("AKSCustomMetricsMDMPluginStart", {}) + end + + # get the access token only if the time to expiry is less than 5 minutes + def get_access_token + if @cached_access_token.to_s.empty? || (Time.now + 5*60 > @token_expiry_time) # token is valid for 60 minutes. Refresh token 5 minutes from expiration + @log.info "Refreshing access token for out_mdm plugin.." 
+ token_uri = URI.parse(@token_url) + http_access_token = Net::HTTP.new(token_uri.host, token_uri.port) + http_access_token.use_ssl = true + token_request = Net::HTTP::Post.new(token_uri.request_uri) + token_request.set_form_data( + { + 'grant_type' => @@grant_type, + 'client_id' => @data_hash['aadClientId'], + 'client_secret' => @data_hash['aadClientSecret'], + 'resource' => @@token_resource_url + } + ) + + token_response = http_access_token.request(token_request) + # Handle the case where the response is not 200 + parsed_json = JSON.parse(token_response.body) + @token_expiry_time = Time.now + 59*60 # set the expiry time to be ~one hour from current time + @cached_access_token = parsed_json['access_token'] + end + @cached_access_token + end + + def write_status_file(success, message) + fn = '/var/opt/microsoft/omsagent/log/MDMIngestion.status' + status = '{ "operation": "MDMIngestion", "success": "%s", "message": "%s" }' % [success, message] + begin + File.open(fn,'w') { |file| file.write(status) } + rescue => e + @log.debug "Error:'#{e}'" + ApplicationInsightsUtility.sendExceptionTelemetry(e.backtrace) + end + end + + # This method is called when an event reaches to Fluentd. + # Convert the event to a raw string. + def format(tag, time, record) + if record != {} + @log.trace "Buffering #{tag}" + return [tag, record].to_msgpack + else + return "" + end + end + + # This method is called every flush interval. Send the buffer chunk to MDM. + # 'chunk' is a buffer chunk that includes multiple formatted records + def write(chunk) + begin + if !@first_post_attempt_made || (Time.now > @last_post_attempt_time + retry_mdm_post_wait_minutes*60) + post_body = [] + chunk.msgpack_each {|(tag, record)| + post_body.push(record.to_json) + } + send_to_mdm post_body + else + @log.info "Last Failed POST attempt to MDM was made #{((Time.now - @last_post_attempt_time)/60).round(1)} min ago. This is less than the current retry threshold of #{@retry_mdm_post_wait_minutes} min. 
NO-OP" + end + rescue Exception => e + @log.info "Exception when writing to MDM: #{e}" + end + end + + def send_to_mdm(post_body) + begin + access_token = get_access_token + request = Net::HTTP::Post.new(@post_request_uri.request_uri) + request['Content-Type'] = "application/x-ndjson" + request['Authorization'] = "Bearer #{access_token}" + request.body = post_body.join("\n") + response = @http_client.request(request) + response.value # this throws for non 200 HTTP response code + @log.info "HTTP Post Response Code : #{response.code}" + ApplicationInsightsUtility.sendCustomEvent("AKSCustomMetricsMDMSendSuccessful", {}) + rescue Net::HTTPServerException => e + @log.info "Failed to Post Metrics to MDM : #{e} Response: #{response}" + @log.debug_backtrace(e.backtrace) + if !response.code.empty? && response.code == 403.to_s + @log.info "Response Code #{response.code} Updating @last_post_attempt_time" + @last_post_attempt_time = Time.now + @first_post_attempt_made = true + ApplicationInsightsUtility.sendExceptionTelemetry(e.backtrace) + # Not raising exception, as that will cause retries to happen + else + @log.info "HTTPServerException when POSTing Metrics to MDM #{e} Response: #{response}" + raise e + end + rescue Errno::ETIMEDOUT => e + @log.info "Timed out when POSTing Metrics to MDM : #{e} Response: #{response}" + @log.debug_backtrace(e.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(e.backtrace) + raise e + rescue Exception => e + @log.info "Exception POSTing Metrics to MDM : #{e} Response: #{response}" + @log.debug_backtrace(e.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(e.backtrace) + raise e + end + end + private + + class ChunkErrorHandler + include Configurable + include PluginId + include PluginLoggerMixin + + SecondaryName = "__ChunkErrorHandler__" + + Plugin.register_output(SecondaryName, self) + + def initialize + @router = nil + end + + def secondary_init(primary) + @error_handlers = create_error_handlers @router + end + + 
def start + # NOP + end + + def shutdown + # NOP + end + + def router=(r) + @router = r + end + + def write(chunk) + chunk.msgpack_each {|(tag, record)| + @error_handlers[tag].emit(record) + } + end + + private + + def create_error_handlers(router) + nop_handler = NopErrorHandler.new + Hash.new() { |hash, tag| + etag = OMS::Common.create_error_tag tag + hash[tag] = router.match?(etag) ? + ErrorHandler.new(router, etag) : + nop_handler + } + end + + class ErrorHandler + def initialize(router, etag) + @router = router + @etag = etag + end + + def emit(record) + @router.emit(@etag, Fluent::Engine.now, record) + end + end + + class NopErrorHandler + def emit(record) + # NOP + end + end + + end + + end # class OutputMDM + +end # module Fluent + From f1b0cd2a1945057340dc48f85ea685b3a5a69b08 Mon Sep 17 00:00:00 2001 From: Kaveesh Dubey Date: Thu, 24 Jan 2019 12:12:01 -0800 Subject: [PATCH 058/160] add ContainerNotRunning column to KubePodInventory --- source/code/plugin/in_kube_podinventory.rb | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/source/code/plugin/in_kube_podinventory.rb b/source/code/plugin/in_kube_podinventory.rb index dee3df30b..9b8ee1fb8 100644 --- a/source/code/plugin/in_kube_podinventory.rb +++ b/source/code/plugin/in_kube_podinventory.rb @@ -173,6 +173,7 @@ def parse_and_emit_records(podInventory, serviceList) containerRestartCount = container['restartCount'] record['ContainerRestartCount'] = containerRestartCount containerStatus = container['state'] + record['ContainerNotRunningReason'] = '' # state is of the following form , so just picking up the first key name # "state": { # "waiting": { @@ -190,6 +191,10 @@ def parse_and_emit_records(podInventory, serviceList) #Picking up both container and node start time from cAdvisor to be consistent if containerStatus.keys[0] == "running" record['ContainerCreationTimeStamp'] = container['state']['running']['startedAt'] + else + if !containerStatus[containerStatus.keys[0]]['reason'].nil? 
&& !containerStatus[containerStatus.keys[0]]['reason'].empty? + record['ContainerNotRunningReason'] = containerStatus[containerStatus.keys[0]]['reason'] + end end podRestartCount += containerRestartCount records.push(record.dup) From 616a803a4c962511a2a27e3f8382b8b82c09362c Mon Sep 17 00:00:00 2001 From: Kaveesh Dubey Date: Thu, 24 Jan 2019 13:52:38 -0800 Subject: [PATCH 059/160] merge pr feedback: update name to ContainerStatusReason --- source/code/plugin/in_kube_podinventory.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/code/plugin/in_kube_podinventory.rb b/source/code/plugin/in_kube_podinventory.rb index 9b8ee1fb8..3d026b05f 100644 --- a/source/code/plugin/in_kube_podinventory.rb +++ b/source/code/plugin/in_kube_podinventory.rb @@ -173,7 +173,7 @@ def parse_and_emit_records(podInventory, serviceList) containerRestartCount = container['restartCount'] record['ContainerRestartCount'] = containerRestartCount containerStatus = container['state'] - record['ContainerNotRunningReason'] = '' + record['ContainerStatusReason'] = '' # state is of the following form , so just picking up the first key name # "state": { # "waiting": { @@ -193,7 +193,7 @@ def parse_and_emit_records(podInventory, serviceList) record['ContainerCreationTimeStamp'] = container['state']['running']['startedAt'] else if !containerStatus[containerStatus.keys[0]]['reason'].nil? && !containerStatus[containerStatus.keys[0]]['reason'].empty? 
- record['ContainerNotRunningReason'] = containerStatus[containerStatus.keys[0]]['reason'] + record['ContainerStatusReason'] = containerStatus[containerStatus.keys[0]]['reason'] end end podRestartCount += containerRestartCount From c33ca34233f9adbe02b55c36e7148258041f997d Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Tue, 19 Feb 2019 13:10:03 -0800 Subject: [PATCH 060/160] Zero Fill for Missing Pod Phases, Change Namespace Dimension to Kubernetes namespace, as it might be confused with metrics namespace in Metrics Explorer (#194) * Zero Fill for Pod Counts by Phase * Change namespace dimension to Kubernetes namespace --- source/code/plugin/filter_inventory2mdm.rb | 31 +++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/source/code/plugin/filter_inventory2mdm.rb b/source/code/plugin/filter_inventory2mdm.rb index d9864bc1a..8aaa5ff01 100644 --- a/source/code/plugin/filter_inventory2mdm.rb +++ b/source/code/plugin/filter_inventory2mdm.rb @@ -56,7 +56,7 @@ class Inventory2MdmFilter < Filter "namespace": "insights.container/pods", "dimNames": [ "phase", - "namespace", + "Kubernetes namespace", "node", "controllerName" ], @@ -77,7 +77,9 @@ class Inventory2MdmFilter < Filter } } }' - + + @@pod_phase_values = ['Running', 'Pending', 'Succeeded', 'Failed', 'Unknown'] + @process_incoming_stream = true def initialize @@ -151,7 +153,7 @@ def process_node_inventory_records(es) def process_pod_inventory_records(es) timestamp = DateTime.now pod_count_hash = Hash.new - + no_phase_dim_values_hash = Hash.new begin records = [] es.each{|time,record| @@ -173,6 +175,29 @@ def process_pod_inventory_records(es) pod_count = 1 pod_count_hash[pod_key] = pod_count end + + # Collect all possible combinations of dimension values other than pod phase + key_without_phase_dim_value = [podNodeDimValue, podNamespaceDimValue, podControllerNameDimValue].join('~~') + if no_phase_dim_values_hash.key?(key_without_phase_dim_value) + @log.info 
"#{key_without_phase_dim_value} already present in #{no_phase_dim_values_hash}" + next + else + @log.info "Adding #{key_without_phase_dim_value} to #{no_phase_dim_values_hash}" + no_phase_dim_values_hash[key_without_phase_dim_value] = true + end + } + + # generate all possible values of non_phase_dim_values X pod Phases and zero-fill the ones that are not already present + no_phase_dim_values_hash.each {|key, value| + @@pod_phase_values.each{|phase| + pod_key = [key, phase].join('~~') + if !pod_count_hash.key?(pod_key) + pod_count_hash[pod_key] = 0 + @log.info "Zero filled #{pod_key}" + else + next + end + } } pod_count_hash.each {|key, value| From 2651750f04932a808a214f84cc7a5742fd075591 Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Wed, 20 Feb 2019 13:31:23 -0800 Subject: [PATCH 061/160] No Retries for non 404 4xx errors (#196) --- source/code/plugin/out_mdm.rb | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/source/code/plugin/out_mdm.rb b/source/code/plugin/out_mdm.rb index 2f36ea7d5..6bde98534 100644 --- a/source/code/plugin/out_mdm.rb +++ b/source/code/plugin/out_mdm.rb @@ -126,6 +126,7 @@ def write(chunk) end rescue Exception => e @log.info "Exception when writing to MDM: #{e}" + raise e end end @@ -149,7 +150,11 @@ def send_to_mdm(post_body) @first_post_attempt_made = true ApplicationInsightsUtility.sendExceptionTelemetry(e.backtrace) # Not raising exception, as that will cause retries to happen - else + elsif !response.code.empty? 
&& response.code.start_with?('4') + # Log 400 errors and continue + @log.info "Non-retryable HTTPServerException when POSTing Metrics to MDM #{e} Response: #{response}" + else + # raise if the response code is non-400 @log.info "HTTPServerException when POSTing Metrics to MDM #{e} Response: #{response}" raise e end From 195bc3382342c2dfe1f7bd28e623486553b5d59f Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Wed, 20 Feb 2019 14:51:56 -0800 Subject: [PATCH 062/160] Update agent version for telemetry --- installer/conf/td-agent-bit.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/installer/conf/td-agent-bit.conf b/installer/conf/td-agent-bit.conf index 29c98bdf1..863e2d86a 100644 --- a/installer/conf/td-agent-bit.conf +++ b/installer/conf/td-agent-bit.conf @@ -28,5 +28,5 @@ EnableTelemetry true TelemetryPushIntervalSeconds 300 Match oms.container.log.* - AgentVersion ciprod01092019 + AgentVersion ciprod01202019 From 59d6c61e6a5d0841333dca6a685fd0e633b9b53c Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Wed, 20 Feb 2019 15:51:09 -0800 Subject: [PATCH 063/160] Update readme for upcoming (ciprod01202019) release --- README.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/README.md b/README.md index 099a065e8..8b5898e92 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,22 @@ additional questions or comments. 
Note : The agent version(s) below has dates (ciprod), which indicate the agent build dates (not release dates) +### 01/20/2019 - Version microsoft/oms:ciprod01202019 +- Container logs enrichment optimization +- Get container meta data only for containers in current node (vs cluster before) +- Update fluent bit 0.13.7 => 0.14.4 +- This fixes the escaping issue in the container logs +- Mooncake cloud support for agent +- Ability to disable agent telemetry +- Ability to onboard and ingest to mooncake cloud +- Add & populate 'ContainerStatusReason' column to KubePodInventory +- Alertable (custom) metrics (to AzureMonitor - only for AKS clusters) +- Cpuusagenanocores & % +- MemoryWorkingsetBytes & % +- MemoryRssBytes & % +- Podcount by node, phase & namespace +- Nodecount + ### 01/09/2018 - Version microsoft/oms:ciprod01092019 - Omsagent - 1.8.1.256 (nov 2018 release) - Persist fluentbit state between container restarts From 0189bc0a7a8cc5bd1f657baea8a12895e5861ffe Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Wed, 20 Feb 2019 15:53:35 -0800 Subject: [PATCH 064/160] fix readme formatting --- README.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 8b5898e92..14c07e948 100644 --- a/README.md +++ b/README.md @@ -13,19 +13,19 @@ Note : The agent version(s) below has dates (ciprod), which indicate t ### 01/20/2019 - Version microsoft/oms:ciprod01202019 - Container logs enrichment optimization -- Get container meta data only for containers in current node (vs cluster before) +..*Get container meta data only for containers in current node (vs cluster before) - Update fluent bit 0.13.7 => 0.14.4 -- This fixes the escaping issue in the container logs +..*This fixes the escaping issue in the container logs - Mooncake cloud support for agent -- Ability to disable agent telemetry -- Ability to onboard and ingest to mooncake cloud +..*Ability to disable agent telemetry +..*Ability to onboard and ingest to mooncake 
cloud - Add & populate 'ContainerStatusReason' column to KubePodInventory - Alertable (custom) metrics (to AzureMonitor - only for AKS clusters) -- Cpuusagenanocores & % -- MemoryWorkingsetBytes & % -- MemoryRssBytes & % -- Podcount by node, phase & namespace -- Nodecount +..*Cpuusagenanocores & % +..*MemoryWorkingsetBytes & % +..*MemoryRssBytes & % +..*Podcount by node, phase & namespace +..*Nodecount ### 01/09/2018 - Version microsoft/oms:ciprod01092019 - Omsagent - 1.8.1.256 (nov 2018 release) From 8221d2dd849427a08c0dcd6781cd050a8380c551 Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Wed, 20 Feb 2019 15:54:08 -0800 Subject: [PATCH 065/160] fix formatting for readme --- README.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 14c07e948..1a4506f1e 100644 --- a/README.md +++ b/README.md @@ -13,19 +13,19 @@ Note : The agent version(s) below has dates (ciprod), which indicate t ### 01/20/2019 - Version microsoft/oms:ciprod01202019 - Container logs enrichment optimization -..*Get container meta data only for containers in current node (vs cluster before) +..* Get container meta data only for containers in current node (vs cluster before) - Update fluent bit 0.13.7 => 0.14.4 -..*This fixes the escaping issue in the container logs +..* This fixes the escaping issue in the container logs - Mooncake cloud support for agent -..*Ability to disable agent telemetry -..*Ability to onboard and ingest to mooncake cloud +..* Ability to disable agent telemetry +..* Ability to onboard and ingest to mooncake cloud - Add & populate 'ContainerStatusReason' column to KubePodInventory - Alertable (custom) metrics (to AzureMonitor - only for AKS clusters) -..*Cpuusagenanocores & % -..*MemoryWorkingsetBytes & % -..*MemoryRssBytes & % -..*Podcount by node, phase & namespace -..*Nodecount +..* Cpuusagenanocores & % +..* MemoryWorkingsetBytes & % +..* MemoryRssBytes & % +..* Podcount by node, phase & namespace +..* Nodecount 
### 01/09/2018 - Version microsoft/oms:ciprod01092019 - Omsagent - 1.8.1.256 (nov 2018 release) From 30aa305a0546474d55889ea63c7ab8ef84ae9dca Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Wed, 20 Feb 2019 15:57:17 -0800 Subject: [PATCH 066/160] fix formatting for readme --- README.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 1a4506f1e..ab621104a 100644 --- a/README.md +++ b/README.md @@ -13,19 +13,19 @@ Note : The agent version(s) below has dates (ciprod), which indicate t ### 01/20/2019 - Version microsoft/oms:ciprod01202019 - Container logs enrichment optimization -..* Get container meta data only for containers in current node (vs cluster before) + * Get container meta data only for containers in current node (vs cluster before) - Update fluent bit 0.13.7 => 0.14.4 -..* This fixes the escaping issue in the container logs + * This fixes the escaping issue in the container logs - Mooncake cloud support for agent -..* Ability to disable agent telemetry -..* Ability to onboard and ingest to mooncake cloud + * Ability to disable agent telemetry + * Ability to onboard and ingest to mooncake cloud - Add & populate 'ContainerStatusReason' column to KubePodInventory - Alertable (custom) metrics (to AzureMonitor - only for AKS clusters) -..* Cpuusagenanocores & % -..* MemoryWorkingsetBytes & % -..* MemoryRssBytes & % -..* Podcount by node, phase & namespace -..* Nodecount + * Cpuusagenanocores & % + * MemoryWorkingsetBytes & % + * MemoryRssBytes & % + * Podcount by node, phase & namespace + * Nodecount ### 01/09/2018 - Version microsoft/oms:ciprod01092019 - Omsagent - 1.8.1.256 (nov 2018 release) From f401116124985b1c24f56557f957f00da423d6cd Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Wed, 20 Feb 2019 15:59:29 -0800 Subject: [PATCH 067/160] fix readme --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index ab621104a..125aec3bb 
100644 --- a/README.md +++ b/README.md @@ -16,16 +16,16 @@ Note : The agent version(s) below has dates (ciprod), which indicate t * Get container meta data only for containers in current node (vs cluster before) - Update fluent bit 0.13.7 => 0.14.4 * This fixes the escaping issue in the container logs -- Mooncake cloud support for agent +- Mooncake cloud support for agent (AKS only) * Ability to disable agent telemetry * Ability to onboard and ingest to mooncake cloud - Add & populate 'ContainerStatusReason' column to KubePodInventory - Alertable (custom) metrics (to AzureMonitor - only for AKS clusters) - * Cpuusagenanocores & % - * MemoryWorkingsetBytes & % - * MemoryRssBytes & % - * Podcount by node, phase & namespace - * Nodecount + * Cpuusagenanocores & % metric + * MemoryWorkingsetBytes & % metric + * MemoryRssBytes & % metric + * Podcount by node, phase & namespace metric + * Nodecount metric ### 01/09/2018 - Version microsoft/oms:ciprod01092019 - Omsagent - 1.8.1.256 (nov 2018 release) From a2f45afdac70173c994d73cd88ba34b20cd817d9 Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Wed, 20 Feb 2019 16:13:57 -0800 Subject: [PATCH 068/160] fix readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 125aec3bb..4313de5c0 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ additional questions or comments. 
Note : The agent version(s) below has dates (ciprod), which indicate the agent build dates (not release dates) -### 01/20/2019 - Version microsoft/oms:ciprod01202019 +### 01/20/2019 - Version microsoft/oms:ciprod02202019 - Container logs enrichment optimization * Get container meta data only for containers in current node (vs cluster before) - Update fluent bit 0.13.7 => 0.14.4 From 759dbb57e1472df8476ad7acfd8fbc9231207e3a Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Wed, 20 Feb 2019 16:14:48 -0800 Subject: [PATCH 069/160] fix agent version for telemetry --- installer/conf/td-agent-bit.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/installer/conf/td-agent-bit.conf b/installer/conf/td-agent-bit.conf index 863e2d86a..467489d1c 100644 --- a/installer/conf/td-agent-bit.conf +++ b/installer/conf/td-agent-bit.conf @@ -28,5 +28,5 @@ EnableTelemetry true TelemetryPushIntervalSeconds 300 Match oms.container.log.* - AgentVersion ciprod01202019 + AgentVersion ciprod02202019 From 7956f40d075476dc85633b53d72ed4eb8dfdc303 Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Wed, 20 Feb 2019 17:16:25 -0800 Subject: [PATCH 070/160] fix date in readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4313de5c0..59faf7e4d 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ additional questions or comments. 
Note : The agent version(s) below has dates (ciprod), which indicate the agent build dates (not release dates) -### 01/20/2019 - Version microsoft/oms:ciprod02202019 +### 02/20/2019 - Version microsoft/oms:ciprod02202019 - Container logs enrichment optimization * Get container meta data only for containers in current node (vs cluster before) - Update fluent bit 0.13.7 => 0.14.4 From ee056568eee328b2d37a0d7a75e1ccec370f1729 Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Thu, 21 Feb 2019 09:15:08 -0800 Subject: [PATCH 071/160] update readme --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 59faf7e4d..b8d08b05a 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,7 @@ Note : The agent version(s) below has dates (ciprod), which indicate t * MemoryRssBytes & % metric * Podcount by node, phase & namespace metric * Nodecount metric +- ContainerNodeInventory_CL to fixed type ### 01/09/2018 - Version microsoft/oms:ciprod01092019 - Omsagent - 1.8.1.256 (nov 2018 release) From 2abcf67413b7c3fcbc8d1cd80511e1566fc124ba Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Thu, 21 Feb 2019 12:56:09 -0800 Subject: [PATCH 072/160] Restart logs every 10MB instead of weekly (#198) * Rotate logs every 10MB instead of weekly * Removing some logging, fixed log rotation --- source/code/plugin/filter_cadvisor2mdm.rb | 3 +-- source/code/plugin/filter_inventory2mdm.rb | 4 +--- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/source/code/plugin/filter_cadvisor2mdm.rb b/source/code/plugin/filter_cadvisor2mdm.rb index 85f9f688e..94f2107cc 100644 --- a/source/code/plugin/filter_cadvisor2mdm.rb +++ b/source/code/plugin/filter_cadvisor2mdm.rb @@ -63,7 +63,7 @@ def configure(conf) @log = nil if @enable_log - @log = Logger.new(@log_path, 'weekly') + @log = Logger.new(@log_path, 1, 5000000) @log.debug {'Starting filter_cadvisor2mdm plugin'} end end @@ -191,7 +191,6 @@ def get_metric_records(record, metric_name, metric_value, 
percentage_metric_valu } records.push(JSON.parse(additional_record)) end - @log.info "Metric Name: #{metric_name} Metric Value: #{metric_value} Percentage Metric Value: #{percentage_metric_value}" return records end diff --git a/source/code/plugin/filter_inventory2mdm.rb b/source/code/plugin/filter_inventory2mdm.rb index 8aaa5ff01..84f12dd06 100644 --- a/source/code/plugin/filter_inventory2mdm.rb +++ b/source/code/plugin/filter_inventory2mdm.rb @@ -91,7 +91,7 @@ def configure(conf) @log = nil if @enable_log - @log = Logger.new(@log_path, 'weekly') + @log = Logger.new(@log_path, 1, 5000000) @log.debug {'Starting filter_inventory2mdm plugin'} end end @@ -179,10 +179,8 @@ def process_pod_inventory_records(es) # Collect all possible combinations of dimension values other than pod phase key_without_phase_dim_value = [podNodeDimValue, podNamespaceDimValue, podControllerNameDimValue].join('~~') if no_phase_dim_values_hash.key?(key_without_phase_dim_value) - @log.info "#{key_without_phase_dim_value} already present in #{no_phase_dim_values_hash}" next else - @log.info "Adding #{key_without_phase_dim_value} to #{no_phase_dim_values_hash}" no_phase_dim_values_hash[key_without_phase_dim_value] = true end } From 18c107c4678cbbc53f14829458e781cc3b07d2c3 Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Thu, 21 Feb 2019 13:30:42 -0800 Subject: [PATCH 073/160] update agent version for telemetry --- installer/conf/td-agent-bit.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/installer/conf/td-agent-bit.conf b/installer/conf/td-agent-bit.conf index 467489d1c..974e8564a 100644 --- a/installer/conf/td-agent-bit.conf +++ b/installer/conf/td-agent-bit.conf @@ -28,5 +28,5 @@ EnableTelemetry true TelemetryPushIntervalSeconds 300 Match oms.container.log.* - AgentVersion ciprod02202019 + AgentVersion ciprod02212019 From 14b2b87c15bd4d49e2e5982789a5ba2649b3fc32 Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Thu, 21 Feb 2019 13:33:02 -0800 Subject: [PATCH 074/160] update 
readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b8d08b05a..f72a16f1e 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ additional questions or comments. Note : The agent version(s) below has dates (ciprod), which indicate the agent build dates (not release dates) -### 02/20/2019 - Version microsoft/oms:ciprod02202019 +### 02/21/2019 - Version microsoft/oms:ciprod02212019 - Container logs enrichment optimization * Get container meta data only for containers in current node (vs cluster before) - Update fluent bit 0.13.7 => 0.14.4 From 5479dff7a93cc8f640412a90cac8523c283c201d Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Fri, 22 Feb 2019 11:44:15 -0800 Subject: [PATCH 075/160] Update kube.conf to use %STATE_DIR_WS% instead of hardcoded path --- installer/conf/kube.conf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/installer/conf/kube.conf b/installer/conf/kube.conf index 31a0778d3..454df6e91 100644 --- a/installer/conf/kube.conf +++ b/installer/conf/kube.conf @@ -159,7 +159,7 @@ num_threads 5 buffer_chunk_limit 20m buffer_type file - buffer_path /var/opt/microsoft/omsagent/6bb1e963-b08c-43a8-b708-1628305e964a/state/out_mdm_*.buffer + buffer_path %STATE_DIR_WS%/out_mdm_*.buffer buffer_queue_limit 20 buffer_queue_full_action drop_oldest_chunk flush_interval 20s @@ -167,4 +167,4 @@ retry_wait 30s max_retry_wait 9m retry_mdm_post_wait_minutes 60 - \ No newline at end of file + From cdded2ee004d2c72e09cb881448dfc4fde49332f Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Mon, 4 Mar 2019 15:38:18 -0800 Subject: [PATCH 076/160] Fix AKSEngine Crash (#200) --- source/code/plugin/CustomMetricsUtils.rb | 4 ++-- source/code/plugin/out_mdm.rb | 23 ++++++++++++++++++----- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/source/code/plugin/CustomMetricsUtils.rb b/source/code/plugin/CustomMetricsUtils.rb index d06c9ad91..a19580630 100644 --- 
a/source/code/plugin/CustomMetricsUtils.rb +++ b/source/code/plugin/CustomMetricsUtils.rb @@ -9,8 +9,8 @@ class << self def check_custom_metrics_availability(custom_metric_regions) aks_region = ENV['AKS_REGION'] aks_resource_id = ENV['AKS_RESOURCE_ID'] - if aks_region.to_s.empty? && aks_resource_id.to_s.empty? - false # This will also take care of AKS-Engine Scenario. AKS_REGION/AKS_RESOURCE_ID is not set for AKS-Engine. Only ACS_RESOURCE_NAME is set + if aks_region.to_s.empty? || aks_resource_id.to_s.empty? + return false # This will also take care of AKS-Engine Scenario. AKS_REGION/AKS_RESOURCE_ID is not set for AKS-Engine. Only ACS_RESOURCE_NAME is set end custom_metrics_regions_arr = custom_metric_regions.split(',') diff --git a/source/code/plugin/out_mdm.rb b/source/code/plugin/out_mdm.rb index 6bde98534..274f450fd 100644 --- a/source/code/plugin/out_mdm.rb +++ b/source/code/plugin/out_mdm.rb @@ -29,6 +29,7 @@ def initialize @cached_access_token = String.new @last_post_attempt_time = Time.now @first_post_attempt_made = false + @can_send_data_to_mdm = true end def configure(conf) @@ -39,7 +40,13 @@ def configure(conf) def start super - file = File.read(@@azure_json_path) + begin + file = File.read(@@azure_json_path) + rescue => e + @log.info "Unable to read file #{@@azure_json_path} #{e}" + @can_send_data_to_mdm = false + return + end # Handle the case where the file read fails. Send Telemetry and exit the plugin? @data_hash = JSON.parse(file) @token_url = @@token_url_template % {tenant_id: @data_hash['tenantId']} @@ -48,11 +55,13 @@ def start aks_region = ENV['AKS_REGION'] if aks_resource_id.to_s.empty? @log.info "Environment Variable AKS_RESOURCE_ID is not set.. " - raise Exception.new "Environment Variable AKS_RESOURCE_ID is not set!!" + @can_send_data_to_mdm = false + return end if aks_region.to_s.empty? @log.info "Environment Variable AKS_REGION is not set.. " - raise Exception.new "Environment Variable AKS_REGION is not set!!" 
+ @can_send_data_to_mdm = false + return end @@post_request_url = @@post_request_url_template % {aks_region: aks_region, aks_resource_id: aks_resource_id} @@ -115,14 +124,18 @@ def format(tag, time, record) # 'chunk' is a buffer chunk that includes multiple formatted records def write(chunk) begin - if !@first_post_attempt_made || (Time.now > @last_post_attempt_time + retry_mdm_post_wait_minutes*60) + if (!@first_post_attempt_made || (Time.now > @last_post_attempt_time + retry_mdm_post_wait_minutes*60)) && @can_send_data_to_mdm post_body = [] chunk.msgpack_each {|(tag, record)| post_body.push(record.to_json) } send_to_mdm post_body else - @log.info "Last Failed POST attempt to MDM was made #{((Time.now - @last_post_attempt_time)/60).round(1)} min ago. This is less than the current retry threshold of #{@retry_mdm_post_wait_minutes} min. NO-OP" + if !@can_send_data_to_mdm + @log.info "Cannot send data to MDM since all required conditions were not met" + else + @log.info "Last Failed POST attempt to MDM was made #{((Time.now - @last_post_attempt_time)/60).round(1)} min ago. This is less than the current retry threshold of #{@retry_mdm_post_wait_minutes} min. 
NO-OP" + end end rescue Exception => e @log.info "Exception when writing to MDM: #{e}" From 57be1c4be9f3a6234a9aff130da2ef327c958d1c Mon Sep 17 00:00:00 2001 From: Vishwanath Narasimhan Date: Tue, 12 Mar 2019 17:47:17 -0700 Subject: [PATCH 077/160] hotfix * close resp.Body * remove chatty logs * membuf=5m and ignore files not updated since 5 mins --- installer/conf/td-agent-bit.conf | 7 ++++--- source/code/go/src/plugins/oms.go | 11 ++++------- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/installer/conf/td-agent-bit.conf b/installer/conf/td-agent-bit.conf index f01857cd7..9175b68ce 100644 --- a/installer/conf/td-agent-bit.conf +++ b/installer/conf/td-agent-bit.conf @@ -10,16 +10,17 @@ Path /var/log/containers/*.log DB /var/log/omsagent-fblogs.db Parser docker - Mem_Buf_Limit 30m + Mem_Buf_Limit 5m Path_Key filepath Skip_Long_Lines On + Ignore_Older 5m [INPUT] Name tail Tag oms.container.log.flbplugin.* Path /var/log/containers/omsagent*.log DB /var/opt/microsoft/docker-cimprov/state/omsagent-ai.db - Mem_Buf_Limit 30m + Mem_Buf_Limit 2m Path_Key filepath Skip_Long_Lines On @@ -28,6 +29,6 @@ EnableTelemetry true TelemetryPushIntervalSeconds 300 Match oms.container.log.* - AgentVersion ciprod02212019 + AgentVersion ciprod03122019 diff --git a/source/code/go/src/plugins/oms.go b/source/code/go/src/plugins/oms.go index d913c6c32..36cf20273 100644 --- a/source/code/go/src/plugins/oms.go +++ b/source/code/go/src/plugins/oms.go @@ -246,16 +246,11 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { if val, ok := imageIDMap[containerID]; ok { stringMap["Image"] = val - } else { - Log("ContainerId %s not present in Name Map ", containerID) - } + } if val, ok := nameIDMap[containerID]; ok { stringMap["Name"] = val - } else { - Log("ContainerId %s not present in Image Map ", containerID) - } - + } dataItem := DataItem{ ID: stringMap["Id"], @@ -319,6 +314,8 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { 
return output.FLB_RETRY } + defer resp.Body.Close() + numRecords := len(dataItems) Log("Successfully flushed %d records in %s", numRecords, elapsed) ContainerLogTelemetryMutex.Lock() From 940a6eb2c1adc215e0dccdc33579159a961f4b9a Mon Sep 17 00:00:00 2001 From: Vishwanath Narasimhan Date: Tue, 12 Mar 2019 17:59:57 -0700 Subject: [PATCH 078/160] fix readme for new version --- README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/README.md b/README.md index f72a16f1e..0a0b9ce08 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,13 @@ additional questions or comments. ## Release History Note : The agent version(s) below has dates (ciprod), which indicate the agent build dates (not release dates) + +### 03/12/2019 - Version microsoft/oms:ciprod03122019 +- Fix for closing response.Body in outoms +- Update Mem_Buf_Limit to 5m for fluentbit +- Tail only files that were modified since 5 minutes +- Remove some unwanted logs that are chatty in outoms +- Fix for MDM disablement for AKS-Engine ### 02/21/2019 - Version microsoft/oms:ciprod02212019 - Container logs enrichment optimization From 411582432119d9d2ace3b8f3b9b0a2aad12089c5 Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Wed, 13 Mar 2019 11:25:12 -0700 Subject: [PATCH 079/160] Fix the pod count in mdm agent plugin (#203) --- source/code/plugin/filter_inventory2mdm.rb | 35 ++++++++++++++++++---- 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/source/code/plugin/filter_inventory2mdm.rb b/source/code/plugin/filter_inventory2mdm.rb index 84f12dd06..553c857b7 100644 --- a/source/code/plugin/filter_inventory2mdm.rb +++ b/source/code/plugin/filter_inventory2mdm.rb @@ -154,20 +154,42 @@ def process_pod_inventory_records(es) timestamp = DateTime.now pod_count_hash = Hash.new no_phase_dim_values_hash = Hash.new + total_pod_count = 0 + pod_count_by_phase = {} + podUids = {} + record_count = 0 begin records = [] es.each{|time,record| - + record_count += 1 timestamp = 
record['DataItems'][0]['CollectionTime'] + podUid = record['DataItems'][0]['PodUid'] + + if podUids.key?(podUid) + #@log.info "pod with #{podUid} already counted" + next + end + + podUids[podUid] = true podPhaseDimValue = record['DataItems'][0]['PodStatus'] podNamespaceDimValue = record['DataItems'][0]['Namespace'] podControllerNameDimValue = record['DataItems'][0]['ControllerName'] podNodeDimValue = record['DataItems'][0]['Computer'] - + # group by distinct dimension values pod_key = [podNodeDimValue, podNamespaceDimValue, podControllerNameDimValue, podPhaseDimValue].join('~~') - - if pod_count_hash.key?(pod_key) + + if pod_count_by_phase.key?(podPhaseDimValue) + phase_count = pod_count_by_phase[podPhaseDimValue] + phase_count += 1 + pod_count_by_phase[podPhaseDimValue] = phase_count + else + pod_count_by_phase[podPhaseDimValue] = 1 + end + + total_pod_count += 1 + + if pod_count_hash.key?(pod_key) pod_count = pod_count_hash[pod_key] pod_count = pod_count + 1 pod_count_hash[pod_key] = pod_count @@ -175,7 +197,7 @@ def process_pod_inventory_records(es) pod_count = 1 pod_count_hash[pod_key] = pod_count end - + # Collect all possible combinations of dimension values other than pod phase key_without_phase_dim_value = [podNodeDimValue, podNamespaceDimValue, podControllerNameDimValue].join('~~') if no_phase_dim_values_hash.key?(key_without_phase_dim_value) @@ -191,7 +213,7 @@ def process_pod_inventory_records(es) pod_key = [key, phase].join('~~') if !pod_count_hash.key?(pod_key) pod_count_hash[pod_key] = 0 - @log.info "Zero filled #{pod_key}" + #@log.info "Zero filled #{pod_key}" else next end @@ -227,6 +249,7 @@ def process_pod_inventory_records(es) ApplicationInsightsUtility.sendExceptionTelemetry(e.backtrace) return [],timestamp end + @log.info "Record Count #{record_count} pod count = #{total_pod_count} Pod Count To Phase #{pod_count_by_phase} " return records, timestamp end From df2e64c19bc9e427c72ffe492375b598a8933bfe Mon Sep 17 00:00:00 2001 From: Vishwanath 
Date: Wed, 13 Mar 2019 11:27:48 -0700 Subject: [PATCH 080/160] Update readme --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 0a0b9ce08..916863dbf 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,7 @@ Note : The agent version(s) below has dates (ciprod), which indicate t - Tail only files that were modified since 5 minutes - Remove some unwanted logs that are chatty in outoms - Fix for MDM disablement for AKS-Engine +- Fix for Pod count metric (same as container count) in MDM ### 02/21/2019 - Version microsoft/oms:ciprod02212019 - Container logs enrichment optimization From 19c2bc7864a4aabade944c327101ddc789850059 Mon Sep 17 00:00:00 2001 From: Vishwanath Narasimhan Date: Wed, 13 Mar 2019 12:13:12 -0700 Subject: [PATCH 081/160] string freeze for out_mdm plugin --- source/code/plugin/out_mdm.rb | 3 +++ 1 file changed, 3 insertions(+) diff --git a/source/code/plugin/out_mdm.rb b/source/code/plugin/out_mdm.rb index 274f450fd..93b32ef50 100644 --- a/source/code/plugin/out_mdm.rb +++ b/source/code/plugin/out_mdm.rb @@ -1,3 +1,6 @@ +#!/usr/local/bin/ruby +# frozen_string_literal: true + module Fluent class OutputMDM < BufferedOutput From 69935b305ab3552bc8626c8f81a802ec559a31e4 Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Mon, 1 Apr 2019 11:09:27 -0700 Subject: [PATCH 082/160] Vishwa/resourcecentric (#208) * resourceid fix (for AKS only) * fix name --- source/code/go/src/plugins/oms.go | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/source/code/go/src/plugins/oms.go b/source/code/go/src/plugins/oms.go index 36cf20273..a1ca3d6ee 100644 --- a/source/code/go/src/plugins/oms.go +++ b/source/code/go/src/plugins/oms.go @@ -44,6 +44,10 @@ var ( Computer string // WorkspaceID log analytics workspace id WorkspaceID string + // ResourceID for resource-centric log analytics data + ResourceID string + // Resource-centric flag (will be true if we determine if above RseourceID is non-empty - default is false) + 
ResourceCentric bool ) var ( @@ -294,6 +298,10 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { } req, _ := http.NewRequest("POST", OMSEndpoint, bytes.NewBuffer(marshalled)) req.Header.Set("Content-Type", "application/json") + //expensive to do string len for every request, so use a flag + if ResourceCentric == true { + req.Header.Set("x-ms-AzureResourceId", ResourceID) + } resp, err := HTTPClient.Do(req) elapsed := time.Since(start) @@ -377,6 +385,11 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { } OMSEndpoint = omsadminConf["OMS_ENDPOINT"] WorkspaceID = omsadminConf["WORKSPACE_ID"] + ResourceID = os.Getenv("customResourceId") + if len(ResourceID) > 0 { + ResourceCentric = true + Log("OMS ResourceId=%s",ResourceID) + } Log("OMSEndpoint %s", OMSEndpoint) // Initialize image,name map refresh ticker From 6953f50a62c7faade0db553e0839f137b252309b Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Mon, 1 Apr 2019 14:48:19 -0700 Subject: [PATCH 083/160] Rashmi/win nodepool - PR (#206) * changes for win nodes enumeration * changes * changes * changes * node cpu metric rate changes * container cpu rate * changes * changes * changes * changes * changes * changes to include in_win_cadvisor_perf.rb file * send containerinventoryheartbeatevent * changes * cahnges for mdm metrics * changes * cahnges * changes * container states * changes * changes * changes for env variables * changes * changes * changes * changes * delete comments * changes * mutex changes * changes * changes * changes * telemetry fix for docker version * removing hardcoded values for mdm * update docker version * telemetry for windows cadvisor timeouts * exeception key update to computer * PR comments --- installer/conf/kube.conf | 47 + installer/datafiles/base_container.data | 1 + .../code/plugin/ApplicationInsightsUtility.rb | 379 +++--- .../code/plugin/CAdvisorMetricsAPIClient.rb | 1020 ++++++++++------- source/code/plugin/KubernetesApiClient.rb | 
938 +++++++-------- source/code/plugin/in_cadvisor_perf.rb | 152 ++- source/code/plugin/in_containerinventory.rb | 179 ++- source/code/plugin/in_kube_nodes.rb | 319 +++--- source/code/plugin/in_kube_podinventory.rb | 397 ++++--- source/code/plugin/in_win_cadvisor_perf.rb | 120 ++ source/code/plugin/out_mdm.rb | 94 +- 11 files changed, 2096 insertions(+), 1550 deletions(-) create mode 100644 source/code/plugin/in_win_cadvisor_perf.rb diff --git a/installer/conf/kube.conf b/installer/conf/kube.conf index 454df6e91..0dfa3710e 100644 --- a/installer/conf/kube.conf +++ b/installer/conf/kube.conf @@ -47,12 +47,44 @@ log_level debug +#cadvisor perf- Windows nodes + + type wincadvisorperf + tag oms.api.wincadvisorperf + run_interval 60s + log_level debug + + type filter_inventory2mdm custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westEurope log_level info +#custom_metrics_mdm filter plugin for perf data from windows nodes + + type filter_cadvisor2mdm + custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westEurope + metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes + log_level info + + + + type out_mdm + log_level debug + num_threads 5 + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/out_mdm_cdvisorperf*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + max_retry_wait 9m + retry_mdm_post_wait_minutes 60 + + type out_oms log_level debug @@ -168,3 +200,18 @@ max_retry_wait 9m retry_mdm_post_wait_minutes 60 + + + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_api_wincadvisorperf*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + max_retry_wait 9m + \ No newline at end of file diff --git 
a/installer/datafiles/base_container.data b/installer/datafiles/base_container.data index c263aa505..9c4d563f8 100644 --- a/installer/datafiles/base_container.data +++ b/installer/datafiles/base_container.data @@ -34,6 +34,7 @@ MAINTAINER: 'Microsoft Corporation' /opt/microsoft/omsagent/plugin/CAdvisorMetricsAPIClient.rb; source/code/plugin/CAdvisorMetricsAPIClient.rb; 644; root; root /opt/microsoft/omsagent/plugin/in_kube_perf.rb; source/code/plugin/in_kube_perf.rb; 644; root; root /opt/microsoft/omsagent/plugin/in_cadvisor_perf.rb; source/code/plugin/in_cadvisor_perf.rb; 644; root; root +/opt/microsoft/omsagent/plugin/in_win_cadvisor_perf.rb; source/code/plugin/in_win_cadvisor_perf.rb; 644; root; root /opt/microsoft/omsagent/plugin/in_kube_services.rb; source/code/plugin/in_kube_services.rb; 644; root; root /opt/microsoft/omsagent/plugin/in_kube_nodes.rb; source/code/plugin/in_kube_nodes.rb; 644; root; root /opt/microsoft/omsagent/plugin/filter_inventory2mdm.rb; source/code/plugin/filter_inventory2mdm.rb; 644; root; root diff --git a/source/code/plugin/ApplicationInsightsUtility.rb b/source/code/plugin/ApplicationInsightsUtility.rb index 5c5e92a6c..5dc2bfab8 100644 --- a/source/code/plugin/ApplicationInsightsUtility.rb +++ b/source/code/plugin/ApplicationInsightsUtility.rb @@ -2,209 +2,222 @@ # frozen_string_literal: true class ApplicationInsightsUtility - require_relative 'lib/application_insights' - require_relative 'omslog' - require_relative 'DockerApiClient' - require_relative 'oms_common' - require 'json' - require 'base64' + require_relative "lib/application_insights" + require_relative "omslog" + require_relative "DockerApiClient" + require_relative "oms_common" + require "json" + require "base64" - @@HeartBeat = 'HeartBeatEvent' - @@Exception = 'ExceptionEvent' - @@AcsClusterType = 'ACS' - @@AksClusterType = 'AKS' - @OmsAdminFilePath = '/etc/opt/microsoft/omsagent/conf/omsadmin.conf' - @@EnvAcsResourceName = 'ACS_RESOURCE_NAME' - @@EnvAksRegion = 
'AKS_REGION' - @@EnvAgentVersion = 'AGENT_VERSION' - @@EnvApplicationInsightsKey = 'APPLICATIONINSIGHTS_AUTH' - @@EnvControllerType = 'CONTROLLER_TYPE' + @@HeartBeat = "HeartBeatEvent" + @@Exception = "ExceptionEvent" + @@AcsClusterType = "ACS" + @@AksClusterType = "AKS" + @OmsAdminFilePath = "/etc/opt/microsoft/omsagent/conf/omsadmin.conf" + @@EnvAcsResourceName = "ACS_RESOURCE_NAME" + @@EnvAksRegion = "AKS_REGION" + @@EnvAgentVersion = "AGENT_VERSION" + @@EnvApplicationInsightsKey = "APPLICATIONINSIGHTS_AUTH" + @@EnvControllerType = "CONTROLLER_TYPE" - @@CustomProperties = {} - @@Tc = nil - @@hostName = (OMS::Common.get_hostname) + @@CustomProperties = {} + @@Tc = nil + @@hostName = (OMS::Common.get_hostname) - def initialize - end + def initialize + end - class << self - #Set default properties for telemetry event - def initializeUtility() - begin - resourceInfo = ENV['AKS_RESOURCE_ID'] - if resourceInfo.nil? || resourceInfo.empty? - @@CustomProperties["ACSResourceName"] = ENV[@@EnvAcsResourceName] - @@CustomProperties["ClusterType"] = @@AcsClusterType - @@CustomProperties["SubscriptionID"] = "" - @@CustomProperties["ResourceGroupName"] = "" - @@CustomProperties["ClusterName"] = "" - @@CustomProperties["Region"] = "" - else - @@CustomProperties["AKS_RESOURCE_ID"] = resourceInfo - begin - splitStrings = resourceInfo.split('/') - subscriptionId = splitStrings[2] - resourceGroupName = splitStrings[4] - clusterName = splitStrings[8] - rescue => errorStr - $log.warn("Exception in AppInsightsUtility: parsing AKS resourceId: #{resourceInfo}, error: #{errorStr}") - end - @@CustomProperties["ClusterType"] = @@AksClusterType - @@CustomProperties["SubscriptionID"] = subscriptionId - @@CustomProperties["ResourceGroupName"] = resourceGroupName - @@CustomProperties["ClusterName"] = clusterName - @@CustomProperties["Region"] = ENV[@@EnvAksRegion] - end + class << self + #Set default properties for telemetry event + def initializeUtility() + begin + resourceInfo = 
ENV["AKS_RESOURCE_ID"] + if resourceInfo.nil? || resourceInfo.empty? + @@CustomProperties["ACSResourceName"] = ENV[@@EnvAcsResourceName] + @@CustomProperties["ClusterType"] = @@AcsClusterType + @@CustomProperties["SubscriptionID"] = "" + @@CustomProperties["ResourceGroupName"] = "" + @@CustomProperties["ClusterName"] = "" + @@CustomProperties["Region"] = "" + else + @@CustomProperties["AKS_RESOURCE_ID"] = resourceInfo + begin + splitStrings = resourceInfo.split("/") + subscriptionId = splitStrings[2] + resourceGroupName = splitStrings[4] + clusterName = splitStrings[8] + rescue => errorStr + $log.warn("Exception in AppInsightsUtility: parsing AKS resourceId: #{resourceInfo}, error: #{errorStr}") + end + @@CustomProperties["ClusterType"] = @@AksClusterType + @@CustomProperties["SubscriptionID"] = subscriptionId + @@CustomProperties["ResourceGroupName"] = resourceGroupName + @@CustomProperties["ClusterName"] = clusterName + @@CustomProperties["Region"] = ENV[@@EnvAksRegion] + end - getDockerInfo() - @@CustomProperties['WorkspaceID'] = getWorkspaceId - @@CustomProperties['AgentVersion'] = ENV[@@EnvAgentVersion] - @@CustomProperties['ControllerType'] = ENV[@@EnvControllerType] - encodedAppInsightsKey = ENV[@@EnvApplicationInsightsKey] + #Commenting it for now from initilize method, we need to pivot all telemetry off of kubenode docker version + #getDockerInfo() + @@CustomProperties["WorkspaceID"] = getWorkspaceId + @@CustomProperties["AgentVersion"] = ENV[@@EnvAgentVersion] + @@CustomProperties["ControllerType"] = ENV[@@EnvControllerType] + encodedAppInsightsKey = ENV[@@EnvApplicationInsightsKey] - #Check if telemetry is turned off - telemetryOffSwitch = ENV['DISABLE_TELEMETRY'] - if telemetryOffSwitch && !telemetryOffSwitch.nil? && !telemetryOffSwitch.empty? && telemetryOffSwitch.downcase == "true".downcase - $log.warn("AppInsightsUtility: Telemetry is disabled") - @@Tc = ApplicationInsights::TelemetryClient.new - elsif !encodedAppInsightsKey.nil? 
- decodedAppInsightsKey = Base64.decode64(encodedAppInsightsKey) - @@Tc = ApplicationInsights::TelemetryClient.new decodedAppInsightsKey - - end - rescue => errorStr - $log.warn("Exception in AppInsightsUtility: initilizeUtility - error: #{errorStr}") - end + #Check if telemetry is turned off + telemetryOffSwitch = ENV["DISABLE_TELEMETRY"] + if telemetryOffSwitch && !telemetryOffSwitch.nil? && !telemetryOffSwitch.empty? && telemetryOffSwitch.downcase == "true".downcase + $log.warn("AppInsightsUtility: Telemetry is disabled") + @@Tc = ApplicationInsights::TelemetryClient.new + elsif !encodedAppInsightsKey.nil? + decodedAppInsightsKey = Base64.decode64(encodedAppInsightsKey) + @@Tc = ApplicationInsights::TelemetryClient.new decodedAppInsightsKey end + rescue => errorStr + $log.warn("Exception in AppInsightsUtility: initilizeUtility - error: #{errorStr}") + end + end - def getDockerInfo() - dockerInfo = DockerApiClient.dockerInfo - if (!dockerInfo.nil? && !dockerInfo.empty?) - @@CustomProperties['DockerVersion'] = dockerInfo['Version'] - @@CustomProperties['DockerApiVersion'] = dockerInfo['ApiVersion'] - end - end + def getDockerInfo() + dockerInfo = DockerApiClient.dockerInfo + if (!dockerInfo.nil? && !dockerInfo.empty?) + @@CustomProperties["DockerVersion"] = dockerInfo["Version"] + #@@CustomProperties["DockerApiVersion"] = dockerInfo["ApiVersion"] + end + end - def sendHeartBeatEvent(pluginName) - begin - eventName = pluginName + @@HeartBeat - if !(@@Tc.nil?) - @@Tc.track_event eventName , :properties => @@CustomProperties - @@Tc.flush - $log.info("AppInsights Heartbeat Telemetry sent successfully") - end - rescue =>errorStr - $log.warn("Exception in AppInsightsUtility: sendHeartBeatEvent - error: #{errorStr}") - end + def sendHeartBeatEvent(pluginName) + begin + eventName = pluginName + @@HeartBeat + if !(@@Tc.nil?) 
+ @@Tc.track_event eventName, :properties => @@CustomProperties + @@Tc.flush + $log.info("AppInsights Heartbeat Telemetry sent successfully") end + rescue => errorStr + $log.warn("Exception in AppInsightsUtility: sendHeartBeatEvent - error: #{errorStr}") + end + end - def sendLastProcessedContainerInventoryCountMetric(pluginName, properties) - begin - if !(@@Tc.nil?) - @@Tc.track_metric 'LastProcessedContainerInventoryCount', properties['ContainerCount'], - :kind => ApplicationInsights::Channel::Contracts::DataPointType::MEASUREMENT, - :properties => @@CustomProperties - @@Tc.flush - $log.info("AppInsights Container Count Telemetry sent successfully") - end - rescue => errorStr - $log.warn("Exception in AppInsightsUtility: sendCustomMetric - error: #{errorStr}") - end + def sendLastProcessedContainerInventoryCountMetric(pluginName, properties) + begin + if !(@@Tc.nil?) + @@Tc.track_metric "LastProcessedContainerInventoryCount", properties["ContainerCount"], + :kind => ApplicationInsights::Channel::Contracts::DataPointType::MEASUREMENT, + :properties => @@CustomProperties + @@Tc.flush + $log.info("AppInsights Container Count Telemetry sent successfully") end + rescue => errorStr + $log.warn("Exception in AppInsightsUtility: sendCustomMetric - error: #{errorStr}") + end + end - def sendCustomEvent(eventName, properties) - begin - if @@CustomProperties.empty? || @@CustomProperties.nil? - initializeUtility() - end - if !(@@Tc.nil?) - @@Tc.track_event eventName, :properties => @@CustomProperties - @@Tc.flush - $log.info("AppInsights Custom Event #{eventName} sent successfully") - end - rescue => errorStr - $log.warn("Exception in AppInsightsUtility: sendCustomEvent - error: #{errorStr}") - end + def sendCustomEvent(eventName, properties) + begin + if @@CustomProperties.empty? || @@CustomProperties.nil? 
+ initializeUtility() + end + telemetryProps = {} + # add common dimensions + @@CustomProperties.each { |k, v| telemetryProps[k] = v } + # add passed-in dimensions if any + if (!properties.nil? && !properties.empty?) + properties.each { |k, v| telemetryProps[k] = v } + end + if !(@@Tc.nil?) + @@Tc.track_event eventName, :properties => telemetryProps + @@Tc.flush + $log.info("AppInsights Custom Event #{eventName} sent successfully") end + rescue => errorStr + $log.warn("Exception in AppInsightsUtility: sendCustomEvent - error: #{errorStr}") + end + end - def sendExceptionTelemetry(errorStr) - begin - if @@CustomProperties.empty? || @@CustomProperties.nil? - initializeUtility() - elsif @@CustomProperties['DockerVersion'].nil? - getDockerInfo() - end - if !(@@Tc.nil?) - @@Tc.track_exception errorStr , :properties => @@CustomProperties - @@Tc.flush - $log.info("AppInsights Exception Telemetry sent successfully") - end - rescue => errorStr - $log.warn("Exception in AppInsightsUtility: sendExceptionTelemetry - error: #{errorStr}") - end + def sendExceptionTelemetry(errorStr, properties = nil) + begin + if @@CustomProperties.empty? || @@CustomProperties.nil? + initializeUtility() + elsif @@CustomProperties["DockerVersion"].nil? + getDockerInfo() + end + telemetryProps = {} + # add common dimensions + @@CustomProperties.each { |k, v| telemetryProps[k] = v } + # add passed-in dimensions if any + if (!properties.nil? && !properties.empty?) + properties.each { |k, v| telemetryProps[k] = v } + end + if !(@@Tc.nil?) + @@Tc.track_exception errorStr, :properties => telemetryProps + @@Tc.flush + $log.info("AppInsights Exception Telemetry sent successfully") end + rescue => errorStr + $log.warn("Exception in AppInsightsUtility: sendExceptionTelemetry - error: #{errorStr}") + end + end - #Method to send heartbeat and container inventory count - def sendTelemetry(pluginName, properties) - begin - if @@CustomProperties.empty? || @@CustomProperties.nil? 
- initializeUtility() - elsif @@CustomProperties['DockerVersion'].nil? - getDockerInfo() - end - @@CustomProperties['Computer'] = properties['Computer'] - sendHeartBeatEvent(pluginName) - sendLastProcessedContainerInventoryCountMetric(pluginName, properties) - rescue => errorStr - $log.warn("Exception in AppInsightsUtility: sendTelemetry - error: #{errorStr}") - end + #Method to send heartbeat and container inventory count + def sendTelemetry(pluginName, properties) + begin + if @@CustomProperties.empty? || @@CustomProperties.nil? + initializeUtility() + elsif @@CustomProperties["DockerVersion"].nil? + getDockerInfo() end + @@CustomProperties["Computer"] = properties["Computer"] + sendHeartBeatEvent(pluginName) + sendLastProcessedContainerInventoryCountMetric(pluginName, properties) + rescue => errorStr + $log.warn("Exception in AppInsightsUtility: sendTelemetry - error: #{errorStr}") + end + end - #Method to send metric. It will merge passed-in properties with common custom properties - def sendMetricTelemetry(metricName, metricValue, properties) - begin - if (metricName.empty? || metricName.nil?) - $log.warn("SendMetricTelemetry: metricName is missing") - return - end - if @@CustomProperties.empty? || @@CustomProperties.nil? - initializeUtility() - elsif @@CustomProperties['DockerVersion'].nil? - getDockerInfo() - end - telemetryProps = {} - telemetryProps["Computer"] = @@hostName - # add common dimensions - @@CustomProperties.each{ |k,v| telemetryProps[k]=v} - # add passed-in dimensions if any - if (!properties.nil? && !properties.empty?) - properties.each{ |k,v| telemetryProps[k]=v} - end - if !(@@Tc.nil?) 
- @@Tc.track_metric metricName, metricValue, - :kind => ApplicationInsights::Channel::Contracts::DataPointType::MEASUREMENT, - :properties => telemetryProps - @@Tc.flush - $log.info("AppInsights metric Telemetry #{metricName} sent successfully") - end - rescue => errorStr - $log.warn("Exception in AppInsightsUtility: sendMetricTelemetry - error: #{errorStr}") - end + #Method to send metric. It will merge passed-in properties with common custom properties + def sendMetricTelemetry(metricName, metricValue, properties) + begin + if (metricName.empty? || metricName.nil?) + $log.warn("SendMetricTelemetry: metricName is missing") + return end + if @@CustomProperties.empty? || @@CustomProperties.nil? + initializeUtility() + elsif @@CustomProperties["DockerVersion"].nil? + getDockerInfo() + end + telemetryProps = {} + # add common dimensions + @@CustomProperties.each { |k, v| telemetryProps[k] = v } + # add passed-in dimensions if any + if (!properties.nil? && !properties.empty?) + properties.each { |k, v| telemetryProps[k] = v } + end + if !(@@Tc.nil?) 
+ @@Tc.track_metric metricName, metricValue, + :kind => ApplicationInsights::Channel::Contracts::DataPointType::MEASUREMENT, + :properties => telemetryProps + @@Tc.flush + $log.info("AppInsights metric Telemetry #{metricName} sent successfully") + end + rescue => errorStr + $log.warn("Exception in AppInsightsUtility: sendMetricTelemetry - error: #{errorStr}") + end + end - def getWorkspaceId() - begin - adminConf = {} - confFile = File.open(@OmsAdminFilePath, "r") - confFile.each_line do |line| - splitStrings = line.split('=') - adminConf[splitStrings[0]] = splitStrings[1] - end - workspaceId = adminConf['WORKSPACE_ID'] - return workspaceId - rescue => errorStr - $log.warn("Exception in AppInsightsUtility: getWorkspaceId - error: #{errorStr}") - end + def getWorkspaceId() + begin + adminConf = {} + confFile = File.open(@OmsAdminFilePath, "r") + confFile.each_line do |line| + splitStrings = line.split("=") + adminConf[splitStrings[0]] = splitStrings[1] end + workspaceId = adminConf["WORKSPACE_ID"] + return workspaceId + rescue => errorStr + $log.warn("Exception in AppInsightsUtility: getWorkspaceId - error: #{errorStr}") + end end -end \ No newline at end of file + end +end diff --git a/source/code/plugin/CAdvisorMetricsAPIClient.rb b/source/code/plugin/CAdvisorMetricsAPIClient.rb index 3c36775af..8b4fd9fcf 100644 --- a/source/code/plugin/CAdvisorMetricsAPIClient.rb +++ b/source/code/plugin/CAdvisorMetricsAPIClient.rb @@ -2,424 +2,628 @@ # frozen_string_literal: true class CAdvisorMetricsAPIClient - - require 'json' - require 'logger' - require 'net/http' - require 'net/https' - require 'uri' - require 'date' - - require_relative 'oms_common' - require_relative 'KubernetesApiClient' - require_relative 'ApplicationInsightsUtility' - - @LogPath = "/var/opt/microsoft/docker-cimprov/log/kubernetes_perf_log.txt" - @Log = Logger.new(@LogPath, 2, 10*1048576) #keep last 2 files, max log file size = 10M - @@rxBytesLast = nil - @@rxBytesTimeLast = nil - @@txBytesLast = nil - 
@@txBytesTimeLast = nil - @@nodeCpuUsageNanoSecondsLast = nil - @@nodeCpuUsageNanoSecondsTimeLast = nil - @@telemetryCpuMetricTimeTracker = DateTime.now.to_time.to_i - @@telemetryMemoryMetricTimeTracker = DateTime.now.to_time.to_i - - - def initialize + require "json" + require "logger" + require "net/http" + require "net/https" + require "uri" + require "date" + + require_relative "oms_common" + require_relative "KubernetesApiClient" + require_relative "ApplicationInsightsUtility" + + @LogPath = "/var/opt/microsoft/docker-cimprov/log/kubernetes_perf_log.txt" + @Log = Logger.new(@LogPath, 2, 10 * 1048576) #keep last 2 files, max log file size = 10M + # @@rxBytesLast = nil + # @@rxBytesTimeLast = nil + # @@txBytesLast = nil + # @@txBytesTimeLast = nil + @@nodeCpuUsageNanoSecondsLast = nil + @@nodeCpuUsageNanoSecondsTimeLast = nil + @@winNodeCpuUsageNanoSecondsLast = {} + @@winNodeCpuUsageNanoSecondsTimeLast = {} + @@winContainerCpuUsageNanoSecondsLast = {} + @@winContainerCpuUsageNanoSecondsTimeLast = {} + @@winContainerPrevMetricRate = {} + @@linuxNodePrevMetricRate = nil + @@winNodePrevMetricRate = {} + @@telemetryCpuMetricTimeTracker = DateTime.now.to_time.to_i + @@telemetryMemoryMetricTimeTracker = DateTime.now.to_time.to_i + + #Containers a hash of node name and the last time telemetry was sent for this node + @@nodeTelemetryTimeTracker = {} + + # Keeping track of containers so that can delete the container from the container cpu cache when the container is deleted + # as a part of the cleanup routine + @@winContainerIdCache = [] + + def initialize + end + + class << self + def getSummaryStatsFromCAdvisor(winNode) + headers = {} + response = nil + @Log.info "Getting CAdvisor Uri" + begin + cAdvisorUri = getCAdvisorUri(winNode) + if !cAdvisorUri.nil? 
+ uri = URI.parse(cAdvisorUri) + http = Net::HTTP.new(uri.host, uri.port) + http.use_ssl = false + + cAdvisorApiRequest = Net::HTTP::Get.new(uri.request_uri) + response = http.request(cAdvisorApiRequest) + @Log.info "Got response code #{response.code} from #{uri.request_uri}" + end + rescue => error + @Log.warn("CAdvisor api request failed: #{error}") + telemetryProps = {} + telemetryProps["Computer"] = winNode["Hostname"] + ApplicationInsightsUtility.sendExceptionTelemetry(error, telemetryProps) + end + return response + end + + def getCAdvisorUri(winNode) + begin + defaultHost = "http://localhost:10255" + relativeUri = "/stats/summary" + if !winNode.nil? + nodeIP = winNode["InternalIP"] + else + nodeIP = ENV["NODE_IP"] + end + if !nodeIP.nil? + @Log.info("Using #{nodeIP + relativeUri} for CAdvisor Uri") + return "http://#{nodeIP}:10255" + relativeUri + else + @Log.warn ("NODE_IP environment variable not set. Using default as : #{defaultHost + relativeUri} ") + if !winNode.nil? + return nil + else + return defaultHost + relativeUri + end + end + end + end + + def getMetrics(winNode = nil) + metricDataItems = [] + begin + if !winNode.nil? + hostName = winNode["Hostname"] + operatingSystem = "Windows" + else + hostName = (OMS::Common.get_hostname) + operatingSystem = "Linux" + end + cAdvisorStats = getSummaryStatsFromCAdvisor(winNode) + if !cAdvisorStats.nil? + metricInfo = JSON.parse(cAdvisorStats.body) + end + if !metricInfo.nil? 
+ metricDataItems.concat(getContainerMemoryMetricItems(metricInfo, hostName, "workingSetBytes", "memoryWorkingSetBytes")) + metricDataItems.concat(getContainerStartTimeMetricItems(metricInfo, hostName, "restartTimeEpoch")) + + if operatingSystem == "Linux" + metricDataItems.concat(getContainerCpuMetricItems(metricInfo, hostName, "usageNanoCores", "cpuUsageNanoCores")) + metricDataItems.concat(getContainerMemoryMetricItems(metricInfo, hostName, "rssBytes", "memoryRssBytes")) + metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "memory", "rssBytes", "memoryRssBytes")) + elsif operatingSystem == "Windows" + containerCpuUsageNanoSecondsRate = getContainerCpuMetricItemRate(metricInfo, hostName, "usageCoreNanoSeconds", "cpuUsageNanoCores") + if containerCpuUsageNanoSecondsRate && !containerCpuUsageNanoSecondsRate.empty? && !containerCpuUsageNanoSecondsRate.nil? + metricDataItems.concat(containerCpuUsageNanoSecondsRate) end - - class << self - def getSummaryStatsFromCAdvisor() - headers = {} - response = nil - @Log.info 'Getting CAdvisor Uri' - begin - cAdvisorUri = getCAdvisorUri() - if !cAdvisorUri.nil? - uri = URI.parse(cAdvisorUri) - http = Net::HTTP.new(uri.host, uri.port) - http.use_ssl = false - - cAdvisorApiRequest = Net::HTTP::Get.new(uri.request_uri) - response = http.request(cAdvisorApiRequest) - @Log.info "Got response code #{response.code} from #{uri.request_uri}" - end - rescue => error - @Log.warn("CAdvisor api request failed: #{error}") - end - return response - end - - def getCAdvisorUri() - begin - defaultHost = "http://localhost:10255" - relativeUri = "/stats/summary" - nodeIP = ENV['NODE_IP'] - if !nodeIP.nil? - @Log.info("Using #{nodeIP + relativeUri} for CAdvisor Uri") - return "http://#{nodeIP}:10255" + relativeUri - else - @Log.warn ("NODE_IP environment variable not set. 
Using default as : #{defaultHost + relativeUri} ") - return defaultHost + relativeUri - end - end - end - - def getMetrics() - metricDataItems = [] - begin - hostName = (OMS::Common.get_hostname) - metricInfo = JSON.parse(getSummaryStatsFromCAdvisor().body) - metricDataItems.concat(getContainerCpuMetricItems(metricInfo, hostName, "usageNanoCores","cpuUsageNanoCores")) - metricDataItems.concat(getContainerMemoryMetricItems(metricInfo, hostName, "workingSetBytes", "memoryWorkingSetBytes")) - metricDataItems.concat(getContainerMemoryMetricItems(metricInfo, hostName, "rssBytes", "memoryRssBytes")) - metricDataItems.concat(getContainerStartTimeMetricItems(metricInfo, hostName, "restartTimeEpoch")) - - cpuUsageNanoSecondsRate = getNodeMetricItemRate(metricInfo, hostName, "cpu", "usageCoreNanoSeconds", "cpuUsageNanoCores") - if cpuUsageNanoSecondsRate && !cpuUsageNanoSecondsRate.empty? && !cpuUsageNanoSecondsRate.nil? - metricDataItems.push(cpuUsageNanoSecondsRate) - end - metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "memory", "workingSetBytes", "memoryWorkingSetBytes")) - metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "memory", "rssBytes", "memoryRssBytes")) - metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "network", "rxBytes", "networkRxBytes")) - metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "network", "txBytes", "networkTxBytes")) - metricDataItems.push(getNodeLastRebootTimeMetric(metricInfo, hostName, "restartTimeEpoch")) - - networkRxRate = getNodeMetricItemRate(metricInfo, hostName, "network", "rxBytes", "networkRxBytesPerSec") - if networkRxRate && !networkRxRate.empty? && !networkRxRate.nil? - metricDataItems.push(networkRxRate) - end - networkTxRate = getNodeMetricItemRate(metricInfo, hostName, "network", "txBytes", "networkTxBytesPerSec") - if networkTxRate && !networkTxRate.empty? && !networkTxRate.nil? 
- metricDataItems.push(networkTxRate) - end - - - rescue => error - @Log.warn("getContainerMetrics failed: #{error}") - return metricDataItems - end - return metricDataItems - end + end - def getContainerCpuMetricItems(metricJSON, hostName, cpuMetricNameToCollect, metricNametoReturn) - metricItems = [] - clusterId = KubernetesApiClient.getClusterId - timeDifference = (DateTime.now.to_time.to_i - @@telemetryCpuMetricTimeTracker).abs - timeDifferenceInMinutes = timeDifference/60 - begin - metricInfo = metricJSON - metricInfo['pods'].each do |pod| - podUid = pod['podRef']['uid'] - podName = pod['podRef']['name'] - podNamespace = pod['podRef']['namespace'] - - if (!pod['containers'].nil?) - pod['containers'].each do |container| - #cpu metric - containerName = container['name'] - metricValue = container['cpu'][cpuMetricNameToCollect] - metricTime = container['cpu']['time'] - metricItem = {} - metricItem['DataItems'] = [] - - metricProps = {} - metricProps['Timestamp'] = metricTime - metricProps['Host'] = hostName - metricProps['ObjectName'] = "K8SContainer" - metricProps['InstanceName'] = clusterId + "/" + podUid + "/" + containerName - - metricProps['Collections'] = [] - metricCollections = {} - metricCollections['CounterName'] = metricNametoReturn - metricCollections['Value'] = metricValue - - metricProps['Collections'].push(metricCollections) - metricItem['DataItems'].push(metricProps) - metricItems.push(metricItem) - #Telemetry about agent performance - begin - # we can only do this much now. Ideally would like to use the docker image repository to find our pods/containers - # cadvisor does not have pod/container metadata. 
so would need more work to cache as pv & use - if (podName.downcase.start_with?('omsagent-') && podNamespace.eql?("kube-system") && containerName.downcase.start_with?('omsagent') && metricNametoReturn.eql?("cpuUsageNanoCores")) - - if (timeDifferenceInMinutes >= 10) - telemetryProps = {} - telemetryProps['PodName'] = podName - telemetryProps['ContainerName'] = containerName - ApplicationInsightsUtility.sendMetricTelemetry(metricNametoReturn, metricValue, telemetryProps) - end - end - rescue => errorStr - $log.warn("Exception while generating Telemetry from getcontainerCpuMetricItems failed: #{errorStr} for metric #{cpuMetricNameToCollect}") - end - end - end - end - # reset time outside pod iterator as we use one timer per metric for 2 pods (ds & rs) - if (timeDifferenceInMinutes >= 10 && metricNametoReturn.eql?("cpuUsageNanoCores")) - @@telemetryCpuMetricTimeTracker = DateTime.now.to_time.to_i - end - rescue => error - @Log.warn("getcontainerCpuMetricItems failed: #{error} for metric #{cpuMetricNameToCollect}") - return metricItems - end - return metricItems - end + cpuUsageNanoSecondsRate = getNodeMetricItemRate(metricInfo, hostName, "cpu", "usageCoreNanoSeconds", "cpuUsageNanoCores", operatingSystem) + if cpuUsageNanoSecondsRate && !cpuUsageNanoSecondsRate.empty? && !cpuUsageNanoSecondsRate.nil? 
+ metricDataItems.push(cpuUsageNanoSecondsRate) + end + metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "memory", "workingSetBytes", "memoryWorkingSetBytes")) - def getContainerMemoryMetricItems(metricJSON, hostName, memoryMetricNameToCollect, metricNametoReturn) - metricItems = [] - clusterId = KubernetesApiClient.getClusterId - timeDifference = (DateTime.now.to_time.to_i - @@telemetryMemoryMetricTimeTracker).abs - timeDifferenceInMinutes = timeDifference/60 - begin - metricInfo = metricJSON - metricInfo['pods'].each do |pod| - podUid = pod['podRef']['uid'] - podName = pod['podRef']['name'] - podNamespace = pod['podRef']['namespace'] - if (!pod['containers'].nil?) - pod['containers'].each do |container| - containerName = container['name'] - metricValue = container['memory'][memoryMetricNameToCollect] - metricTime = container['memory']['time'] - - metricItem = {} - metricItem['DataItems'] = [] - - metricProps = {} - metricProps['Timestamp'] = metricTime - metricProps['Host'] = hostName - metricProps['ObjectName'] = "K8SContainer" - metricProps['InstanceName'] = clusterId + "/" + podUid + "/" + containerName - - metricProps['Collections'] = [] - metricCollections = {} - metricCollections['CounterName'] = metricNametoReturn - metricCollections['Value'] = metricValue - - metricProps['Collections'].push(metricCollections) - metricItem['DataItems'].push(metricProps) - metricItems.push(metricItem) - #Telemetry about agent performance - begin - # we can only do this much now. Ideally would like to use the docker image repository to find our pods/containers - # cadvisor does not have pod/container metadata. 
so would need more work to cache as pv & use - if (podName.downcase.start_with?('omsagent-') && podNamespace.eql?("kube-system") && containerName.downcase.start_with?('omsagent') && metricNametoReturn.eql?("memoryRssBytes")) - if (timeDifferenceInMinutes >= 10) - telemetryProps = {} - telemetryProps['PodName'] = podName - telemetryProps['ContainerName'] = containerName - ApplicationInsightsUtility.sendMetricTelemetry(metricNametoReturn, metricValue, telemetryProps) - end - end - rescue => errorStr - $log.warn("Exception while generating Telemetry from getcontainerMemoryMetricItems failed: #{errorStr} for metric #{memoryMetricNameToCollect}") - end - end - end - end - # reset time outside pod iterator as we use one timer per metric for 2 pods (ds & rs) - if (timeDifferenceInMinutes >= 10 && metricNametoReturn.eql?("memoryRssBytes")) - @@telemetryMemoryMetricTimeTracker = DateTime.now.to_time.to_i - end - rescue => error - @Log.warn("getcontainerMemoryMetricItems failed: #{error} for metric #{memoryMetricNameToCollect}") - @Log.warn metricJSON - return metricItems - end - return metricItems - end + metricDataItems.push(getNodeLastRebootTimeMetric(metricInfo, hostName, "restartTimeEpoch")) + + # Disabling networkRxRate and networkTxRate since we dont use it as of now. + #metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "network", "rxBytes", "networkRxBytes")) + #metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "network", "txBytes", "networkTxBytes")) + # networkRxRate = getNodeMetricItemRate(metricInfo, hostName, "network", "rxBytes", "networkRxBytesPerSec") + # if networkRxRate && !networkRxRate.empty? && !networkRxRate.nil? + # metricDataItems.push(networkRxRate) + # end + # networkTxRate = getNodeMetricItemRate(metricInfo, hostName, "network", "txBytes", "networkTxBytesPerSec") + # if networkTxRate && !networkTxRate.empty? && !networkTxRate.nil? 
+ # metricDataItems.push(networkTxRate) + # end + else + @Log.warn("Couldn't get metric information for host: #{hostName}") + end + rescue => error + @Log.warn("getContainerMetrics failed: #{error}") + return metricDataItems + end + return metricDataItems + end + + def getContainerCpuMetricItems(metricJSON, hostName, cpuMetricNameToCollect, metricNametoReturn) + metricItems = [] + clusterId = KubernetesApiClient.getClusterId + timeDifference = (DateTime.now.to_time.to_i - @@telemetryCpuMetricTimeTracker).abs + timeDifferenceInMinutes = timeDifference / 60 + begin + metricInfo = metricJSON + metricInfo["pods"].each do |pod| + podUid = pod["podRef"]["uid"] + podName = pod["podRef"]["name"] + podNamespace = pod["podRef"]["namespace"] + + if (!pod["containers"].nil?) + pod["containers"].each do |container| + #cpu metric + containerName = container["name"] + metricValue = container["cpu"][cpuMetricNameToCollect] + metricTime = container["cpu"]["time"] + metricItem = {} + metricItem["DataItems"] = [] - def getNodeMetricItem(metricJSON, hostName, metricCategory, metricNameToCollect, metricNametoReturn) - metricItem = {} - clusterId = KubernetesApiClient.getClusterId - begin - metricInfo = metricJSON - node = metricInfo['node'] - nodeName = node['nodeName'] - - - metricValue = node[metricCategory][metricNameToCollect] - metricTime = node[metricCategory]['time'] - - metricItem['DataItems'] = [] - - metricProps = {} - metricProps['Timestamp'] = metricTime - metricProps['Host'] = hostName - metricProps['ObjectName'] = "K8SNode" - metricProps['InstanceName'] = clusterId + "/" + nodeName - - metricProps['Collections'] = [] - metricCollections = {} - metricCollections['CounterName'] = metricNametoReturn - metricCollections['Value'] = metricValue - - metricProps['Collections'].push(metricCollections) - metricItem['DataItems'].push(metricProps) - - rescue => error - @Log.warn("getNodeMetricItem failed: #{error} for metric #{metricNameToCollect}") - @Log.warn metricJSON - return 
metricItem - end - return metricItem + metricProps = {} + metricProps["Timestamp"] = metricTime + metricProps["Host"] = hostName + metricProps["ObjectName"] = "K8SContainer" + metricProps["InstanceName"] = clusterId + "/" + podUid + "/" + containerName + + metricProps["Collections"] = [] + metricCollections = {} + metricCollections["CounterName"] = metricNametoReturn + metricCollections["Value"] = metricValue + + metricProps["Collections"].push(metricCollections) + metricItem["DataItems"].push(metricProps) + metricItems.push(metricItem) + #Telemetry about agent performance + begin + # we can only do this much now. Ideally would like to use the docker image repository to find our pods/containers + # cadvisor does not have pod/container metadata. so would need more work to cache as pv & use + if (podName.downcase.start_with?("omsagent-") && podNamespace.eql?("kube-system") && containerName.downcase.start_with?("omsagent") && metricNametoReturn.eql?("cpuUsageNanoCores")) + if (timeDifferenceInMinutes >= 10) + telemetryProps = {} + telemetryProps["PodName"] = podName + telemetryProps["ContainerName"] = containerName + telemetryProps["Computer"] = hostName + ApplicationInsightsUtility.sendMetricTelemetry(metricNametoReturn, metricValue, telemetryProps) + end end + rescue => errorStr + $log.warn("Exception while generating Telemetry from getcontainerCpuMetricItems failed: #{errorStr} for metric #{cpuMetricNameToCollect}") + end + end + end + end + # reset time outside pod iterator as we use one timer per metric for 2 pods (ds & rs) + if (timeDifferenceInMinutes >= 10 && metricNametoReturn.eql?("cpuUsageNanoCores")) + @@telemetryCpuMetricTimeTracker = DateTime.now.to_time.to_i + end + rescue => error + @Log.warn("getcontainerCpuMetricItems failed: #{error} for metric #{cpuMetricNameToCollect}") + return metricItems + end + return metricItems + end + + def clearDeletedWinContainersFromCache() + begin + winCpuUsageNanoSecondsKeys = @@winContainerCpuUsageNanoSecondsLast.keys 
+ winCpuUsageNanoSecondsTimeKeys = @@winContainerCpuUsageNanoSecondsTimeLast.keys + + # Find the container ids to be deleted from cache + winContainersToBeCleared = winCpuUsageNanoSecondsKeys - @@winContainerIdCache + if winContainersToBeCleared.length > 0 + @Log.warn "Stale containers found in cache, clearing...: #{winContainersToBeCleared}" + end + winContainersToBeCleared.each do |containerId| + @@winContainerCpuUsageNanoSecondsLast.delete(containerId) + @@winContainerCpuUsageNanoSecondsTimeLast.delete(containerId) + end + rescue => errorStr + @Log.warn("clearDeletedWinContainersFromCache failed: #{errorStr}") + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + end + + def resetWinContainerIdCache + @@winContainerIdCache = [] + end + + # usageNanoCores doesnt exist for windows nodes. Hence need to compute this from usageCoreNanoSeconds + def getContainerCpuMetricItemRate(metricJSON, hostName, cpuMetricNameToCollect, metricNametoReturn) + metricItems = [] + clusterId = KubernetesApiClient.getClusterId + timeDifference = (DateTime.now.to_time.to_i - @@telemetryCpuMetricTimeTracker).abs + timeDifferenceInMinutes = timeDifference / 60 + @Log.warn "in host: #{hostName}" + begin + metricInfo = metricJSON + containerCount = 0 + metricInfo["pods"].each do |pod| + podUid = pod["podRef"]["uid"] + podName = pod["podRef"]["name"] + podNamespace = pod["podRef"]["namespace"] + + if (!pod["containers"].nil?) 
+ pod["containers"].each do |container| + #cpu metric + containerCount += 1 + containerName = container["name"] + metricValue = container["cpu"][cpuMetricNameToCollect] + metricTime = container["cpu"]["time"] + metricItem = {} + metricItem["DataItems"] = [] + + metricProps = {} + metricProps["Timestamp"] = metricTime + metricProps["Host"] = hostName + metricProps["ObjectName"] = "K8SContainer" + metricProps["InstanceName"] = clusterId + "/" + podUid + "/" + containerName + + metricProps["Collections"] = [] + metricCollections = {} + metricCollections["CounterName"] = metricNametoReturn - def getNodeMetricItemRate(metricJSON, hostName, metricCategory, metricNameToCollect, metricNametoReturn) - metricItem = {} - clusterId = KubernetesApiClient.getClusterId - begin - - metricInfo = metricJSON - node = metricInfo['node'] - nodeName = node['nodeName'] - - metricValue = node[metricCategory][metricNameToCollect] - metricTime = node[metricCategory]['time'] - - if !(metricNameToCollect == "rxBytes" || metricNameToCollect == "txBytes" || metricNameToCollect == "usageCoreNanoSeconds" ) - @Log.warn("getNodeMetricItemRate : rateMetric is supported only for rxBytes, txBytes & usageCoreNanoSeconds and not for #{metricNameToCollect}") - return nil - elsif metricNameToCollect == "rxBytes" - if @@rxBytesLast.nil? || @@rxBytesTimeLast.nil? || @@rxBytesLast > metricValue #when kubelet is restarted the last condition will be true - @@rxBytesLast = metricValue - @@rxBytesTimeLast = metricTime - return nil - else - metricRateValue = ((metricValue - @@rxBytesLast) * 1.0)/(DateTime.parse(metricTime).to_time - DateTime.parse(@@rxBytesTimeLast).to_time) - @@rxBytesLast = metricValue - @@rxBytesTimeLast = metricTime - metricValue = metricRateValue - end - elsif metricNameToCollect == "txBytes" - if @@txBytesLast.nil? || @@txBytesTimeLast.nil? 
|| @@txBytesLast > metricValue #when kubelet is restarted the last condition will be true - @@txBytesLast = metricValue - @@txBytesTimeLast = metricTime - return nil - else - metricRateValue = ((metricValue - @@txBytesLast) * 1.0)/(DateTime.parse(metricTime).to_time - DateTime.parse(@@txBytesTimeLast).to_time) - @@txBytesLast = metricValue - @@txBytesTimeLast = metricTime - metricValue = metricRateValue - end - else - if @@nodeCpuUsageNanoSecondsLast.nil? || @@nodeCpuUsageNanoSecondsTimeLast.nil? || @@nodeCpuUsageNanoSecondsLast > metricValue #when kubelet is restarted the last condition will be true - @@nodeCpuUsageNanoSecondsLast = metricValue - @@nodeCpuUsageNanoSecondsTimeLast = metricTime - return nil - else - metricRateValue = ((metricValue - @@nodeCpuUsageNanoSecondsLast) * 1.0)/(DateTime.parse(metricTime).to_time - DateTime.parse(@@nodeCpuUsageNanoSecondsTimeLast).to_time) - @@nodeCpuUsageNanoSecondsLast = metricValue - @@nodeCpuUsageNanoSecondsTimeLast = metricTime - metricValue = metricRateValue - end - end - - metricItem['DataItems'] = [] - - metricProps = {} - metricProps['Timestamp'] = metricTime - metricProps['Host'] = hostName - metricProps['ObjectName'] = "K8SNode" - metricProps['InstanceName'] = clusterId + "/" + nodeName - - metricProps['Collections'] = [] - metricCollections = {} - metricCollections['CounterName'] = metricNametoReturn - metricCollections['Value'] = metricValue - - metricProps['Collections'].push(metricCollections) - metricItem['DataItems'].push(metricProps) - - rescue => error - @Log.warn("getNodeMetricItemRate failed: #{error} for metric #{metricNameToCollect}") - @Log.warn metricJSON - return nil - end - return metricItem + containerId = podUid + "/" + containerName + # Adding the containers to the winContainerIdCache so that it can be used by the cleanup routine + # to clear the delted containers every 5 minutes + @@winContainerIdCache.push(containerId) + if @@winContainerCpuUsageNanoSecondsLast[containerId].nil? 
|| @@winContainerCpuUsageNanoSecondsTimeLast[containerId].nil? || @@winContainerCpuUsageNanoSecondsLast[containerId] > metricValue #when kubelet is restarted the last condition will be true + @@winContainerCpuUsageNanoSecondsLast[containerId] = metricValue + @@winContainerCpuUsageNanoSecondsTimeLast[containerId] = metricTime + next + else + timeDifference = DateTime.parse(metricTime).to_time - DateTime.parse(@@winContainerCpuUsageNanoSecondsTimeLast[containerId]).to_time + containerCpuUsageDifference = metricValue - @@winContainerCpuUsageNanoSecondsLast[containerId] + # containerCpuUsageDifference check is added to make sure we report non zero values when cadvisor returns same values for subsequent calls + if timeDifference != 0 && containerCpuUsageDifference != 0 + metricRateValue = (containerCpuUsageDifference * 1.0) / timeDifference + else + @Log.info "container - cpu usage difference / time difference is 0, hence using previous cached value" + if !@@winContainerPrevMetricRate[containerId].nil? 
+ metricRateValue = @@winContainerPrevMetricRate[containerId] + else + # This can happen when the metric value returns same values for subsequent calls when the plugin first starts + metricRateValue = 0 + end end + @@winContainerCpuUsageNanoSecondsLast[containerId] = metricValue + @@winContainerCpuUsageNanoSecondsTimeLast[containerId] = metricTime + metricValue = metricRateValue + @@winContainerPrevMetricRate[containerId] = metricRateValue + end - def getNodeLastRebootTimeMetric(metricJSON, hostName, metricNametoReturn) - metricItem = {} - clusterId = KubernetesApiClient.getClusterId - - begin - metricInfo = metricJSON - node = metricInfo['node'] - nodeName = node['nodeName'] - - - metricValue = node['startTime'] - metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z - - metricItem['DataItems'] = [] - - metricProps = {} - metricProps['Timestamp'] = metricTime - metricProps['Host'] = hostName - metricProps['ObjectName'] = "K8SNode" - metricProps['InstanceName'] = clusterId + "/" + nodeName - - metricProps['Collections'] = [] - metricCollections = {} - metricCollections['CounterName'] = metricNametoReturn - #Read it from /proc/uptime - metricCollections['Value'] = DateTime.parse(metricTime).to_time.to_i - IO.read("/proc/uptime").split[0].to_f - - metricProps['Collections'].push(metricCollections) - metricItem['DataItems'].push(metricProps) - - rescue => error - @Log.warn("getNodeLastRebootTimeMetric failed: #{error} ") - @Log.warn metricJSON - return metricItem - end - return metricItem + metricCollections["Value"] = metricValue + metricProps["Collections"].push(metricCollections) + metricItem["DataItems"].push(metricProps) + metricItems.push(metricItem) + end + end + end + #Sending ContainerInventoryTelemetry from replicaset for telemetry purposes + if @@nodeTelemetryTimeTracker[hostName].nil? 
+ @@nodeTelemetryTimeTracker[hostName] = DateTime.now.to_time.to_i + else + timeDifference = (DateTime.now.to_time.to_i - @@nodeTelemetryTimeTracker[hostName]).abs + timeDifferenceInMinutes = timeDifference / 60 + if (timeDifferenceInMinutes >= 5) + @@nodeTelemetryTimeTracker[hostName] = DateTime.now.to_time.to_i + telemetryProperties = {} + telemetryProperties["Computer"] = hostName + telemetryProperties["ContainerCount"] = containerCount + # Hardcoding the event to ContainerInventory hearbeat event since the telemetry is pivoted off of this event. + @Log.info "sending container inventory heartbeat telemetry" + ApplicationInsightsUtility.sendCustomEvent("ContainerInventoryHeartBeatEvent", telemetryProperties) + end + end + rescue => error + @Log.warn("getcontainerCpuMetricItemRate failed: #{error} for metric #{cpuMetricNameToCollect}") + return metricItems + end + return metricItems + end + + def getContainerMemoryMetricItems(metricJSON, hostName, memoryMetricNameToCollect, metricNametoReturn) + metricItems = [] + clusterId = KubernetesApiClient.getClusterId + timeDifference = (DateTime.now.to_time.to_i - @@telemetryMemoryMetricTimeTracker).abs + timeDifferenceInMinutes = timeDifference / 60 + begin + metricInfo = metricJSON + metricInfo["pods"].each do |pod| + podUid = pod["podRef"]["uid"] + podName = pod["podRef"]["name"] + podNamespace = pod["podRef"]["namespace"] + if (!pod["containers"].nil?) 
+ pod["containers"].each do |container| + containerName = container["name"] + metricValue = container["memory"][memoryMetricNameToCollect] + metricTime = container["memory"]["time"] + + metricItem = {} + metricItem["DataItems"] = [] + + metricProps = {} + metricProps["Timestamp"] = metricTime + metricProps["Host"] = hostName + metricProps["ObjectName"] = "K8SContainer" + metricProps["InstanceName"] = clusterId + "/" + podUid + "/" + containerName + + metricProps["Collections"] = [] + metricCollections = {} + metricCollections["CounterName"] = metricNametoReturn + metricCollections["Value"] = metricValue + + metricProps["Collections"].push(metricCollections) + metricItem["DataItems"].push(metricProps) + metricItems.push(metricItem) + #Telemetry about agent performance + begin + # we can only do this much now. Ideally would like to use the docker image repository to find our pods/containers + # cadvisor does not have pod/container metadata. so would need more work to cache as pv & use + if (podName.downcase.start_with?("omsagent-") && podNamespace.eql?("kube-system") && containerName.downcase.start_with?("omsagent") && metricNametoReturn.eql?("memoryRssBytes")) + if (timeDifferenceInMinutes >= 10) + telemetryProps = {} + telemetryProps["PodName"] = podName + telemetryProps["ContainerName"] = containerName + telemetryProps["Computer"] = hostName + ApplicationInsightsUtility.sendMetricTelemetry(metricNametoReturn, metricValue, telemetryProps) + end end + rescue => errorStr + $log.warn("Exception while generating Telemetry from getcontainerMemoryMetricItems failed: #{errorStr} for metric #{memoryMetricNameToCollect}") + end + end + end + end + # reset time outside pod iterator as we use one timer per metric for 2 pods (ds & rs) + if (timeDifferenceInMinutes >= 10 && metricNametoReturn.eql?("memoryRssBytes")) + @@telemetryMemoryMetricTimeTracker = DateTime.now.to_time.to_i + end + rescue => error + @Log.warn("getcontainerMemoryMetricItems failed: #{error} for metric 
#{memoryMetricNameToCollect}") + @Log.warn metricJSON + return metricItems + end + return metricItems + end + + def getNodeMetricItem(metricJSON, hostName, metricCategory, metricNameToCollect, metricNametoReturn) + metricItem = {} + clusterId = KubernetesApiClient.getClusterId + begin + metricInfo = metricJSON + node = metricInfo["node"] + nodeName = node["nodeName"] + + if !node[metricCategory].nil? + metricValue = node[metricCategory][metricNameToCollect] + metricTime = node[metricCategory]["time"] + + metricItem["DataItems"] = [] + + metricProps = {} + metricProps["Timestamp"] = metricTime + metricProps["Host"] = hostName + metricProps["ObjectName"] = "K8SNode" + metricProps["InstanceName"] = clusterId + "/" + nodeName + + metricProps["Collections"] = [] + metricCollections = {} + metricCollections["CounterName"] = metricNametoReturn + metricCollections["Value"] = metricValue + + metricProps["Collections"].push(metricCollections) + metricItem["DataItems"].push(metricProps) + end + rescue => error + @Log.warn("getNodeMetricItem failed: #{error} for metric #{metricNameToCollect}") + @Log.warn metricJSON + return metricItem + end + return metricItem + end + + def getNodeMetricItemRate(metricJSON, hostName, metricCategory, metricNameToCollect, metricNametoReturn, operatingSystem) + metricItem = {} + clusterId = KubernetesApiClient.getClusterId + begin + metricInfo = metricJSON + node = metricInfo["node"] + nodeName = node["nodeName"] - def getContainerStartTimeMetricItems(metricJSON, hostName, metricNametoReturn) - metricItems = [] - clusterId = KubernetesApiClient.getClusterId - currentTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z - begin - metricInfo = metricJSON - metricInfo['pods'].each do |pod| - podUid = pod['podRef']['uid'] - if (!pod['containers'].nil?) 
- pod['containers'].each do |container| - containerName = container['name'] - metricValue = container['startTime'] - metricTime = currentTime - - metricItem = {} - metricItem['DataItems'] = [] - - metricProps = {} - metricProps['Timestamp'] = metricTime - metricProps['Host'] = hostName - metricProps['ObjectName'] = "K8SContainer" - metricProps['InstanceName'] = clusterId + "/" + podUid + "/" + containerName - - metricProps['Collections'] = [] - metricCollections = {} - metricCollections['CounterName'] = metricNametoReturn - metricCollections['Value'] = DateTime.parse(metricValue).to_time.to_i - - metricProps['Collections'].push(metricCollections) - metricItem['DataItems'].push(metricProps) - metricItems.push(metricItem) - end - end - end - rescue => error - @Log.warn("getContainerStartTimeMetric failed: #{error} for metric #{metricNametoReturn}") - @Log.warn metricJSON - return metricItems - end - return metricItems + if !node[metricCategory].nil? + metricValue = node[metricCategory][metricNameToCollect] + metricTime = node[metricCategory]["time"] + + # if !(metricNameToCollect == "rxBytes" || metricNameToCollect == "txBytes" || metricNameToCollect == "usageCoreNanoSeconds") + # @Log.warn("getNodeMetricItemRate : rateMetric is supported only for rxBytes, txBytes & usageCoreNanoSeconds and not for #{metricNameToCollect}") + if !(metricNameToCollect == "usageCoreNanoSeconds") + @Log.warn("getNodeMetricItemRate : rateMetric is supported only for usageCoreNanoSeconds and not for #{metricNameToCollect}") + return nil + # elsif metricNameToCollect == "rxBytes" + # if @@rxBytesLast.nil? || @@rxBytesTimeLast.nil? 
|| @@rxBytesLast > metricValue #when kubelet is restarted the last condition will be true + # @@rxBytesLast = metricValue + # @@rxBytesTimeLast = metricTime + # return nil + # else + # metricRateValue = ((metricValue - @@rxBytesLast) * 1.0) / (DateTime.parse(metricTime).to_time - DateTime.parse(@@rxBytesTimeLast).to_time) + # @@rxBytesLast = metricValue + # @@rxBytesTimeLast = metricTime + # metricValue = metricRateValue + # end + # elsif metricNameToCollect == "txBytes" + # if @@txBytesLast.nil? || @@txBytesTimeLast.nil? || @@txBytesLast > metricValue #when kubelet is restarted the last condition will be true + # @@txBytesLast = metricValue + # @@txBytesTimeLast = metricTime + # return nil + # else + # metricRateValue = ((metricValue - @@txBytesLast) * 1.0) / (DateTime.parse(metricTime).to_time - DateTime.parse(@@txBytesTimeLast).to_time) + # @@txBytesLast = metricValue + # @@txBytesTimeLast = metricTime + # metricValue = metricRateValue + # end + else + if operatingSystem == "Linux" + if @@nodeCpuUsageNanoSecondsLast.nil? || @@nodeCpuUsageNanoSecondsTimeLast.nil? || @@nodeCpuUsageNanoSecondsLast > metricValue #when kubelet is restarted the last condition will be true + @@nodeCpuUsageNanoSecondsLast = metricValue + @@nodeCpuUsageNanoSecondsTimeLast = metricTime + return nil + else + timeDifference = DateTime.parse(metricTime).to_time - DateTime.parse(@@nodeCpuUsageNanoSecondsTimeLast).to_time + nodeCpuUsageDifference = metricValue - @@nodeCpuUsageNanoSecondsLast + # nodeCpuUsageDifference check is added to make sure we report non zero values when cadvisor returns same values for subsequent calls + if timeDifference != 0 && nodeCpuUsageDifference != 0 + metricRateValue = (nodeCpuUsageDifference * 1.0) / timeDifference + else + @Log.info "linux node - cpu usage difference / time difference is 0, hence using previous cached value" + if !@@linuxNodePrevMetricRate.nil? 
+ metricRateValue = @@linuxNodePrevMetricRate + else + # This can happen when the metric value returns same values for subsequent calls when the plugin first starts + metricRateValue = 0 + end + end + @@nodeCpuUsageNanoSecondsLast = metricValue + @@nodeCpuUsageNanoSecondsTimeLast = metricTime + @@linuxNodePrevMetricRate = metricRateValue + metricValue = metricRateValue + end + elsif operatingSystem == "Windows" + # Using the hash for windows nodes since this is running in replica set and there can be multiple nodes + if @@winNodeCpuUsageNanoSecondsLast[hostName].nil? || @@winNodeCpuUsageNanoSecondsTimeLast[hostName].nil? || @@winNodeCpuUsageNanoSecondsLast[hostName] > metricValue #when kubelet is restarted the last condition will be true + @@winNodeCpuUsageNanoSecondsLast[hostName] = metricValue + @@winNodeCpuUsageNanoSecondsTimeLast[hostName] = metricTime + return nil + else + timeDifference = DateTime.parse(metricTime).to_time - DateTime.parse(@@winNodeCpuUsageNanoSecondsTimeLast[hostName]).to_time + nodeCpuUsageDifference = metricValue - @@winNodeCpuUsageNanoSecondsLast[hostName] + # nodeCpuUsageDifference check is added to make sure we report non zero values when cadvisor returns same values for subsequent calls + if timeDifference != 0 && nodeCpuUsageDifference != 0 + metricRateValue = (nodeCpuUsageDifference * 1.0) / timeDifference + else + @Log.info "windows node - cpu usage difference / time difference is 0, hence using previous cached value" + if !@@winNodePrevMetricRate[hostName].nil? 
+ metricRateValue = @@winNodePrevMetricRate[hostName] + else + # This can happen when the metric value returns same values for subsequent calls when the plugin first starts + metricRateValue = 0 + end end + @@winNodeCpuUsageNanoSecondsLast[hostName] = metricValue + @@winNodeCpuUsageNanoSecondsTimeLast[hostName] = metricTime + @@winNodePrevMetricRate[hostName] = metricRateValue + metricValue = metricRateValue + end + end + end + metricItem["DataItems"] = [] + + metricProps = {} + metricProps["Timestamp"] = metricTime + metricProps["Host"] = hostName + metricProps["ObjectName"] = "K8SNode" + metricProps["InstanceName"] = clusterId + "/" + nodeName + + metricProps["Collections"] = [] + metricCollections = {} + metricCollections["CounterName"] = metricNametoReturn + metricCollections["Value"] = metricValue + + metricProps["Collections"].push(metricCollections) + metricItem["DataItems"].push(metricProps) + end + rescue => error + @Log.warn("getNodeMetricItemRate failed: #{error} for metric #{metricNameToCollect}") + @Log.warn metricJSON + return nil + end + return metricItem + end + + def getNodeLastRebootTimeMetric(metricJSON, hostName, metricNametoReturn) + metricItem = {} + clusterId = KubernetesApiClient.getClusterId + + begin + metricInfo = metricJSON + node = metricInfo["node"] + nodeName = node["nodeName"] + + metricValue = node["startTime"] + metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z + + metricItem["DataItems"] = [] + + metricProps = {} + metricProps["Timestamp"] = metricTime + metricProps["Host"] = hostName + metricProps["ObjectName"] = "K8SNode" + metricProps["InstanceName"] = clusterId + "/" + nodeName + + metricProps["Collections"] = [] + metricCollections = {} + metricCollections["CounterName"] = metricNametoReturn + #Read it from /proc/uptime + metricCollections["Value"] = DateTime.parse(metricTime).to_time.to_i - IO.read("/proc/uptime").split[0].to_f + + metricProps["Collections"].push(metricCollections) + 
metricItem["DataItems"].push(metricProps) + rescue => error + @Log.warn("getNodeLastRebootTimeMetric failed: #{error} ") + @Log.warn metricJSON + return metricItem + end + return metricItem + end + + def getContainerStartTimeMetricItems(metricJSON, hostName, metricNametoReturn) + metricItems = [] + clusterId = KubernetesApiClient.getClusterId + currentTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z + begin + metricInfo = metricJSON + metricInfo["pods"].each do |pod| + podUid = pod["podRef"]["uid"] + if (!pod["containers"].nil?) + pod["containers"].each do |container| + containerName = container["name"] + metricValue = container["startTime"] + metricTime = currentTime + + metricItem = {} + metricItem["DataItems"] = [] + + metricProps = {} + metricProps["Timestamp"] = metricTime + metricProps["Host"] = hostName + metricProps["ObjectName"] = "K8SContainer" + metricProps["InstanceName"] = clusterId + "/" + podUid + "/" + containerName + + metricProps["Collections"] = [] + metricCollections = {} + metricCollections["CounterName"] = metricNametoReturn + metricCollections["Value"] = DateTime.parse(metricValue).to_time.to_i + + metricProps["Collections"].push(metricCollections) + metricItem["DataItems"].push(metricProps) + metricItems.push(metricItem) end + end end + rescue => error + @Log.warn("getContainerStartTimeMetric failed: #{error} for metric #{metricNametoReturn}") + @Log.warn metricJSON + return metricItems + end + return metricItems + end + end +end diff --git a/source/code/plugin/KubernetesApiClient.rb b/source/code/plugin/KubernetesApiClient.rb index a1e143b15..4ed85025f 100644 --- a/source/code/plugin/KubernetesApiClient.rb +++ b/source/code/plugin/KubernetesApiClient.rb @@ -2,474 +2,516 @@ # frozen_string_literal: true class KubernetesApiClient + require "json" + require "logger" + require "net/http" + require "net/https" + require "uri" + require "time" - require 'json' - require 'logger' - require 'net/http' - require 'net/https' - require 'uri' - 
require 'time' - - require_relative 'oms_common' - - @@ApiVersion = "v1" - @@CaFile = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" - @@ClusterName = nil - @@ClusterId = nil - @@IsNodeMaster = nil - #@@IsValidRunningNode = nil - #@@IsLinuxCluster = nil - @@KubeSystemNamespace = "kube-system" - @LogPath = "/var/opt/microsoft/docker-cimprov/log/kubernetes_client_log.txt" - @Log = Logger.new(@LogPath, 2, 10*1048576) #keep last 2 files, max log file size = 10M - @@TokenFileName = "/var/run/secrets/kubernetes.io/serviceaccount/token" - @@TokenStr = nil - @@NodeMetrics = Hash.new - - def initialize + require_relative "oms_common" + + @@ApiVersion = "v1" + @@CaFile = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + @@ClusterName = nil + @@ClusterId = nil + @@IsNodeMaster = nil + #@@IsValidRunningNode = nil + #@@IsLinuxCluster = nil + @@KubeSystemNamespace = "kube-system" + @LogPath = "/var/opt/microsoft/docker-cimprov/log/kubernetes_client_log.txt" + @Log = Logger.new(@LogPath, 2, 10 * 1048576) #keep last 2 files, max log file size = 10M + @@TokenFileName = "/var/run/secrets/kubernetes.io/serviceaccount/token" + @@TokenStr = nil + @@NodeMetrics = Hash.new + @@WinNodeArray = [] + + def initialize + end + + class << self + def getKubeResourceInfo(resource) + headers = {} + response = nil + @Log.info "Getting Kube resource" + @Log.info resource + begin + resourceUri = getResourceUri(resource) + if !resourceUri.nil? 
+ uri = URI.parse(resourceUri) + http = Net::HTTP.new(uri.host, uri.port) + http.use_ssl = true + if !File.exist?(@@CaFile) + raise "#{@@CaFile} doesnt exist" + else + http.ca_file = @@CaFile if File.exist?(@@CaFile) + end + http.verify_mode = OpenSSL::SSL::VERIFY_PEER + + kubeApiRequest = Net::HTTP::Get.new(uri.request_uri) + kubeApiRequest["Authorization"] = "Bearer " + getTokenStr + @Log.info "KubernetesAPIClient::getKubeResourceInfo : Making request to #{uri.request_uri} @ #{Time.now.utc.iso8601}" + response = http.request(kubeApiRequest) + @Log.info "KubernetesAPIClient::getKubeResourceInfo : Got response of #{response.code} for #{uri.request_uri} @ #{Time.now.utc.iso8601}" end + rescue => error + @Log.warn("kubernetes api request failed: #{error} for #{resource} @ #{Time.now.utc.iso8601}") + end + if (response.body.empty?) + @Log.warn("KubernetesAPIClient::getKubeResourceInfo : Got empty response from Kube API for #{resource} @ #{Time.now.utc.iso8601}") + end + return response + end - class << self - def getKubeResourceInfo(resource) - headers = {} - response = nil - @Log.info 'Getting Kube resource' - @Log.info resource - begin - resourceUri = getResourceUri(resource) - if !resourceUri.nil? 
- uri = URI.parse(resourceUri) - http = Net::HTTP.new(uri.host, uri.port) - http.use_ssl = true - if !File.exist?(@@CaFile) - raise "#{@@CaFile} doesnt exist" - else - http.ca_file = @@CaFile if File.exist?(@@CaFile) - end - http.verify_mode = OpenSSL::SSL::VERIFY_PEER - - kubeApiRequest = Net::HTTP::Get.new(uri.request_uri) - kubeApiRequest['Authorization'] = "Bearer " + getTokenStr - @Log.info "KubernetesAPIClient::getKubeResourceInfo : Making request to #{uri.request_uri} @ #{Time.now.utc.iso8601}" - response = http.request(kubeApiRequest) - @Log.info "KubernetesAPIClient::getKubeResourceInfo : Got response of #{response.code} for #{uri.request_uri} @ #{Time.now.utc.iso8601}" - end - rescue => error - @Log.warn("kubernetes api request failed: #{error} for #{resource} @ #{Time.now.utc.iso8601}") - end - if (response.body.empty?) - @Log.warn("KubernetesAPIClient::getKubeResourceInfo : Got empty response from Kube API for #{resource} @ #{Time.now.utc.iso8601}") - end - return response - end + def getTokenStr + return @@TokenStr if !@@TokenStr.nil? + begin + if File.exist?(@@TokenFileName) && File.readable?(@@TokenFileName) + @@TokenStr = File.read(@@TokenFileName).strip + return @@TokenStr + else + @Log.warn("Unable to read token string from #{@@TokenFileName}: #{error}") + return nil + end + end + end - def getTokenStr - return @@TokenStr if !@@TokenStr.nil? 
- begin - if File.exist?(@@TokenFileName) && File.readable?(@@TokenFileName) - @@TokenStr = File.read(@@TokenFileName).strip - return @@TokenStr - else - @Log.warn("Unable to read token string from #{@@TokenFileName}: #{error}") - return nil - end - end - end + def getResourceUri(resource) + begin + if ENV["KUBERNETES_SERVICE_HOST"] && ENV["KUBERNETES_PORT_443_TCP_PORT"] + return "https://#{ENV["KUBERNETES_SERVICE_HOST"]}:#{ENV["KUBERNETES_PORT_443_TCP_PORT"]}/api/" + @@ApiVersion + "/" + resource + else + @Log.warn ("Kubernetes environment variable not set KUBERNETES_SERVICE_HOST: #{ENV["KUBERNETES_SERVICE_HOST"]} KUBERNETES_PORT_443_TCP_PORT: #{ENV["KUBERNETES_PORT_443_TCP_PORT"]}. Unable to form resourceUri") + return nil + end + end + end - def getResourceUri(resource) - begin - if ENV['KUBERNETES_SERVICE_HOST'] && ENV['KUBERNETES_PORT_443_TCP_PORT'] - return "https://#{ENV['KUBERNETES_SERVICE_HOST']}:#{ENV['KUBERNETES_PORT_443_TCP_PORT']}/api/" + @@ApiVersion + "/" + resource - else - @Log.warn ("Kubernetes environment variable not set KUBERNETES_SERVICE_HOST: #{ENV['KUBERNETES_SERVICE_HOST']} KUBERNETES_PORT_443_TCP_PORT: #{ENV['KUBERNETES_PORT_443_TCP_PORT']}. Unable to form resourceUri") - return nil - end + def getClusterName + return @@ClusterName if !@@ClusterName.nil? + @@ClusterName = "None" + begin + #try getting resource ID for aks + cluster = ENV["AKS_RESOURCE_ID"] + if cluster && !cluster.nil? && !cluster.empty? + @@ClusterName = cluster.split("/").last + else + cluster = ENV["ACS_RESOURCE_NAME"] + if cluster && !cluster.nil? && !cluster.empty? 
+ @@ClusterName = cluster + else + kubesystemResourceUri = "namespaces/" + @@KubeSystemNamespace + "/pods" + @Log.info("KubernetesApiClient::getClusterName : Getting pods from Kube API @ #{Time.now.utc.iso8601}") + podInfo = JSON.parse(getKubeResourceInfo(kubesystemResourceUri).body) + @Log.info("KubernetesApiClient::getClusterName : Done getting pods from Kube API @ #{Time.now.utc.iso8601}") + podInfo["items"].each do |items| + if items["metadata"]["name"].include? "kube-controller-manager" + items["spec"]["containers"][0]["command"].each do |command| + if command.include? "--cluster-name" + @@ClusterName = command.split("=")[1] + end end + end end + end + end + rescue => error + @Log.warn("getClusterName failed: #{error}") + end + return @@ClusterName + end - def getClusterName - return @@ClusterName if !@@ClusterName.nil? - @@ClusterName = "None" - begin - #try getting resource ID for aks - cluster = ENV['AKS_RESOURCE_ID'] - if cluster && !cluster.nil? && !cluster.empty? - @@ClusterName = cluster.split("/").last - else - cluster = ENV['ACS_RESOURCE_NAME'] - if cluster && !cluster.nil? && !cluster.empty? - @@ClusterName = cluster - else - kubesystemResourceUri = "namespaces/" + @@KubeSystemNamespace + "/pods" - @Log.info("KubernetesApiClient::getClusterName : Getting pods from Kube API @ #{Time.now.utc.iso8601}") - podInfo = JSON.parse(getKubeResourceInfo(kubesystemResourceUri).body) - @Log.info("KubernetesApiClient::getClusterName : Done getting pods from Kube API @ #{Time.now.utc.iso8601}") - podInfo['items'].each do |items| - if items['metadata']['name'].include? "kube-controller-manager" - items['spec']['containers'][0]['command'].each do |command| - if command.include? "--cluster-name" - @@ClusterName = command.split('=')[1] - end - end - end - end - end - end - rescue => error - @Log.warn("getClusterName failed: #{error}") - end - return @@ClusterName - end + def getClusterId + return @@ClusterId if !@@ClusterId.nil? 
+ #By default initialize ClusterId to ClusterName. + # In ACS/On-prem, we need to figure out how we can generate ClusterId + @@ClusterId = getClusterName + begin + cluster = ENV["AKS_RESOURCE_ID"] + if cluster && !cluster.nil? && !cluster.empty? + @@ClusterId = cluster + end + rescue => error + @Log.warn("getClusterId failed: #{error}") + end + return @@ClusterId + end - def getClusterId - return @@ClusterId if !@@ClusterId.nil? - #By default initialize ClusterId to ClusterName. - # In ACS/On-prem, we need to figure out how we can generate ClusterId - @@ClusterId = getClusterName - begin - cluster = ENV['AKS_RESOURCE_ID'] - if cluster && !cluster.nil? && !cluster.empty? - @@ClusterId = cluster - end - rescue => error - @Log.warn("getClusterId failed: #{error}") - end - return @@ClusterId + def isNodeMaster + return @@IsNodeMaster if !@@IsNodeMaster.nil? + @@IsNodeMaster = false + begin + @Log.info("KubernetesApiClient::isNodeMaster : Getting nodes from Kube API @ #{Time.now.utc.iso8601}") + allNodesInfo = JSON.parse(getKubeResourceInfo("nodes").body) + @Log.info("KubernetesApiClient::isNodeMaster : Done getting nodes from Kube API @ #{Time.now.utc.iso8601}") + if !allNodesInfo.nil? && !allNodesInfo.empty? + thisNodeName = OMS::Common.get_hostname + allNodesInfo["items"].each do |item| + if item["metadata"]["name"].casecmp(thisNodeName) == 0 + if item["metadata"]["labels"]["kubernetes.io/role"].to_s.include?("master") || item["metadata"]["labels"]["role"].to_s.include?("master") + @@IsNodeMaster = true + end + break end + end + end + rescue => error + @Log.warn("KubernetesApiClient::isNodeMaster : node role request failed: #{error}") + end - def isNodeMaster - return @@IsNodeMaster if !@@IsNodeMaster.nil? 
- @@IsNodeMaster = false - begin - @Log.info("KubernetesApiClient::isNodeMaster : Getting nodes from Kube API @ #{Time.now.utc.iso8601}") - allNodesInfo = JSON.parse(getKubeResourceInfo('nodes').body) - @Log.info("KubernetesApiClient::isNodeMaster : Done getting nodes from Kube API @ #{Time.now.utc.iso8601}") - if !allNodesInfo.nil? && !allNodesInfo.empty? - thisNodeName = OMS::Common.get_hostname - allNodesInfo['items'].each do |item| - if item['metadata']['name'].casecmp(thisNodeName) == 0 - if item['metadata']['labels']["kubernetes.io/role"].to_s.include?("master") || item['metadata']['labels']["role"].to_s.include?("master") - @@IsNodeMaster = true - end - break - end - end - end - rescue => error - @Log.warn("KubernetesApiClient::isNodeMaster : node role request failed: #{error}") - end - - return @@IsNodeMaster - end + return @@IsNodeMaster + end - #def isValidRunningNode - # return @@IsValidRunningNode if !@@IsValidRunningNode.nil? - # @@IsValidRunningNode = false - # begin - # thisNodeName = OMS::Common.get_hostname - # if isLinuxCluster - # # Run on agent node [0] - # @@IsValidRunningNode = !isNodeMaster && thisNodeName.to_s.split('-').last == '0' - # else - # # Run on master node [0] - # @@IsValidRunningNode = isNodeMaster && thisNodeName.to_s.split('-').last == '0' - # end - # rescue => error - # @Log.warn("Checking Node Type failed: #{error}") - # end - # if(@@IsValidRunningNode == true) - # @Log.info("Electing current node to talk to k8 api") - # else - # @Log.info("Not Electing current node to talk to k8 api") - # end - # return @@IsValidRunningNode - #end - - #def isLinuxCluster - # return @@IsLinuxCluster if !@@IsLinuxCluster.nil? 
- # @@IsLinuxCluster = true - # begin - # @Log.info("KubernetesApiClient::isLinuxCluster : Getting nodes from Kube API @ #{Time.now.utc.iso8601}") - # allNodesInfo = JSON.parse(getKubeResourceInfo('nodes').body) - # @Log.info("KubernetesApiClient::isLinuxCluster : Done getting nodes from Kube API @ #{Time.now.utc.iso8601}") - # if !allNodesInfo.nil? && !allNodesInfo.empty? - # allNodesInfo['items'].each do |item| - # if !(item['status']['nodeInfo']['operatingSystem'].casecmp('linux') == 0) - # @@IsLinuxCluster = false - # break - # end - # end - # end - # rescue => error - # @Log.warn("KubernetesApiClient::isLinuxCluster : node role request failed: #{error}") - # end - # return @@IsLinuxCluster - #end - - # returns an arry of pods (json) - def getPods(namespace) - pods = [] - begin - kubesystemResourceUri = "namespaces/" + namespace + "/pods" - podInfo = JSON.parse(getKubeResourceInfo(kubesystemResourceUri).body) - podInfo['items'].each do |items| - pods.push items - end - rescue => error - @Log.warn("List pods request failed: #{error}") - end - return pods - end + #def isValidRunningNode + # return @@IsValidRunningNode if !@@IsValidRunningNode.nil? + # @@IsValidRunningNode = false + # begin + # thisNodeName = OMS::Common.get_hostname + # if isLinuxCluster + # # Run on agent node [0] + # @@IsValidRunningNode = !isNodeMaster && thisNodeName.to_s.split('-').last == '0' + # else + # # Run on master node [0] + # @@IsValidRunningNode = isNodeMaster && thisNodeName.to_s.split('-').last == '0' + # end + # rescue => error + # @Log.warn("Checking Node Type failed: #{error}") + # end + # if(@@IsValidRunningNode == true) + # @Log.info("Electing current node to talk to k8 api") + # else + # @Log.info("Not Electing current node to talk to k8 api") + # end + # return @@IsValidRunningNode + #end + + #def isLinuxCluster + # return @@IsLinuxCluster if !@@IsLinuxCluster.nil? 
+ # @@IsLinuxCluster = true + # begin + # @Log.info("KubernetesApiClient::isLinuxCluster : Getting nodes from Kube API @ #{Time.now.utc.iso8601}") + # allNodesInfo = JSON.parse(getKubeResourceInfo('nodes').body) + # @Log.info("KubernetesApiClient::isLinuxCluster : Done getting nodes from Kube API @ #{Time.now.utc.iso8601}") + # if !allNodesInfo.nil? && !allNodesInfo.empty? + # allNodesInfo['items'].each do |item| + # if !(item['status']['nodeInfo']['operatingSystem'].casecmp('linux') == 0) + # @@IsLinuxCluster = false + # break + # end + # end + # end + # rescue => error + # @Log.warn("KubernetesApiClient::isLinuxCluster : node role request failed: #{error}") + # end + # return @@IsLinuxCluster + #end + + # returns an arry of pods (json) + def getPods(namespace) + pods = [] + begin + kubesystemResourceUri = "namespaces/" + namespace + "/pods" + podInfo = JSON.parse(getKubeResourceInfo(kubesystemResourceUri).body) + podInfo["items"].each do |items| + pods.push items + end + rescue => error + @Log.warn("List pods request failed: #{error}") + end + return pods + end - def getContainerIDs(namespace) - containers = Hash.new - begin - kubesystemResourceUri = "namespaces/" + namespace + "/pods" - @Log.info("KubernetesApiClient::getContainerIDs : Getting pods from Kube API @ #{Time.now.utc.iso8601}") - podInfo = JSON.parse(getKubeResourceInfo(kubesystemResourceUri).body) - @Log.info("KubernetesApiClient::getContainerIDs : Done getting pods from Kube API @ #{Time.now.utc.iso8601}") - podInfo['items'].each do |item| - if (!item['status'].nil? && !item['status'].empty? && !item['status']['containerStatuses'].nil? && !item['status']['containerStatuses'].empty?) 
- item['status']['containerStatuses'].each do |cntr| - containers[cntr['containerID']] = "kube-system" - end - end - end - rescue => error - @Log.warn("KubernetesApiClient::getContainerIDs : List ContainerIDs request failed: #{error}") + # returns a hash of windows node names and their internal IPs + def getWindowsNodes + winNodes = [] + begin + nodeInventory = JSON.parse(getKubeResourceInfo("nodes").body) + @Log.info "KubernetesAPIClient::getWindowsNodes : Got nodes from kube api" + # Resetting the windows node cache + @@WinNodeArray.clear + if (!nodeInventory.empty?) + nodeInventory["items"].each do |item| + # check for windows operating system in node metadata + winNode = {} + nodeStatus = item["status"] + nodeMetadata = item["metadata"] + if !nodeStatus.nil? && !nodeStatus["nodeInfo"].nil? && !nodeStatus["nodeInfo"]["operatingSystem"].nil? + operatingSystem = nodeStatus["nodeInfo"]["operatingSystem"] + if (operatingSystem.is_a?(String) && operatingSystem.casecmp("windows") == 0) + # Adding windows nodes to winNodeArray so that it can be used in kubepodinventory to send ContainerInventory data + # to get images and image tags for containers in windows nodes + if !nodeMetadata.nil? && !nodeMetadata["name"].nil? + @@WinNodeArray.push(nodeMetadata["name"]) end - return containers + nodeStatusAddresses = nodeStatus["addresses"] + if !nodeStatusAddresses.nil? 
+ nodeStatusAddresses.each do |address| + winNode[address["type"]] = address["address"] + end + winNodes.push(winNode) + end + end end + end + end + return winNodes + rescue => error + @Log.warn("Error in get windows nodes: #{error}") + return nil + end + end - def getContainerLogs(namespace, pod, container, showTimeStamp) - containerLogs = "" - begin - kubesystemResourceUri = "namespaces/" + namespace + "/pods/" + pod + "/log" + "?container=" + container - if showTimeStamp - kubesystemResourceUri += "×tamps=true" - end - @Log.info("KubernetesApiClient::getContainerLogs : Getting logs from Kube API @ #{Time.now.utc.iso8601}") - containerLogs = getKubeResourceInfo(kubesystemResourceUri).body - @Log.info("KubernetesApiClient::getContainerLogs : Done getting logs from Kube API @ #{Time.now.utc.iso8601}") - rescue => error - @Log.warn("Pod logs request failed: #{error}") - end - return containerLogs + def getWindowsNodesArray + return @@WinNodeArray + end + + def getContainerIDs(namespace) + containers = Hash.new + begin + kubesystemResourceUri = "namespaces/" + namespace + "/pods" + @Log.info("KubernetesApiClient::getContainerIDs : Getting pods from Kube API @ #{Time.now.utc.iso8601}") + podInfo = JSON.parse(getKubeResourceInfo(kubesystemResourceUri).body) + @Log.info("KubernetesApiClient::getContainerIDs : Done getting pods from Kube API @ #{Time.now.utc.iso8601}") + podInfo["items"].each do |item| + if (!item["status"].nil? && !item["status"].empty? && !item["status"]["containerStatuses"].nil? && !item["status"]["containerStatuses"].empty?) 
+ item["status"]["containerStatuses"].each do |cntr| + containers[cntr["containerID"]] = "kube-system" end + end + end + rescue => error + @Log.warn("KubernetesApiClient::getContainerIDs : List ContainerIDs request failed: #{error}") + end + return containers + end + + def getContainerLogs(namespace, pod, container, showTimeStamp) + containerLogs = "" + begin + kubesystemResourceUri = "namespaces/" + namespace + "/pods/" + pod + "/log" + "?container=" + container + if showTimeStamp + kubesystemResourceUri += "×tamps=true" + end + @Log.info("KubernetesApiClient::getContainerLogs : Getting logs from Kube API @ #{Time.now.utc.iso8601}") + containerLogs = getKubeResourceInfo(kubesystemResourceUri).body + @Log.info("KubernetesApiClient::getContainerLogs : Done getting logs from Kube API @ #{Time.now.utc.iso8601}") + rescue => error + @Log.warn("Pod logs request failed: #{error}") + end + return containerLogs + end + + def getContainerLogsSinceTime(namespace, pod, container, since, showTimeStamp) + containerLogs = "" + begin + kubesystemResourceUri = "namespaces/" + namespace + "/pods/" + pod + "/log" + "?container=" + container + "&sinceTime=" + since + kubesystemResourceUri = URI.escape(kubesystemResourceUri, ":.+") # HTML URL Encoding for date + + if showTimeStamp + kubesystemResourceUri += "×tamps=true" + end + @Log.info("calling #{kubesystemResourceUri}") + @Log.info("KubernetesApiClient::getContainerLogsSinceTime : Getting logs from Kube API @ #{Time.now.utc.iso8601}") + containerLogs = getKubeResourceInfo(kubesystemResourceUri).body + @Log.info("KubernetesApiClient::getContainerLogsSinceTime : Done getting logs from Kube API @ #{Time.now.utc.iso8601}") + rescue => error + @Log.warn("Pod logs request failed: #{error}") + end + return containerLogs + end - def getContainerLogsSinceTime(namespace, pod, container, since, showTimeStamp) - containerLogs = "" - begin - kubesystemResourceUri = "namespaces/" + namespace + "/pods/" + pod + "/log" + "?container=" + container 
+ "&sinceTime=" + since - kubesystemResourceUri = URI.escape(kubesystemResourceUri, ":.+") # HTML URL Encoding for date - - if showTimeStamp - kubesystemResourceUri += "×tamps=true" - end - @Log.info("calling #{kubesystemResourceUri}") - @Log.info("KubernetesApiClient::getContainerLogsSinceTime : Getting logs from Kube API @ #{Time.now.utc.iso8601}") - containerLogs = getKubeResourceInfo(kubesystemResourceUri).body - @Log.info("KubernetesApiClient::getContainerLogsSinceTime : Done getting logs from Kube API @ #{Time.now.utc.iso8601}") - rescue => error - @Log.warn("Pod logs request failed: #{error}") + def getContainerResourceRequestsAndLimits(metricJSON, metricCategory, metricNameToCollect, metricNametoReturn) + metricItems = [] + begin + clusterId = getClusterId + metricInfo = metricJSON + metricInfo["items"].each do |pod| + podNameSpace = pod["metadata"]["namespace"] + if podNameSpace.eql?("kube-system") && !pod["metadata"].key?("ownerReferences") + # The above case seems to be the only case where you have horizontal scaling of pods + # but no controller, in which case cAdvisor picks up kubernetes.io/config.hash + # instead of the actual poduid. Since this uid is not being surface into the UX + # its ok to use this. + # Use kubernetes.io/config.hash to be able to correlate with cadvisor data + podUid = pod["metadata"]["annotations"]["kubernetes.io/config.hash"] + else + podUid = pod["metadata"]["uid"] + end + if (!pod["spec"]["containers"].nil? && !pod["spec"]["nodeName"].nil?) + nodeName = pod["spec"]["nodeName"] + pod["spec"]["containers"].each do |container| + containerName = container["name"] + metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z + if (!container["resources"].nil? && !container["resources"].empty? && !container["resources"][metricCategory].nil? && !container["resources"][metricCategory][metricNameToCollect].nil?) 
+ metricValue = getMetricNumericValue(metricNameToCollect, container["resources"][metricCategory][metricNameToCollect]) + + metricItem = {} + metricItem["DataItems"] = [] + + metricProps = {} + metricProps["Timestamp"] = metricTime + metricProps["Host"] = nodeName + metricProps["ObjectName"] = "K8SContainer" + metricProps["InstanceName"] = clusterId + "/" + podUid + "/" + containerName + + metricProps["Collections"] = [] + metricCollections = {} + metricCollections["CounterName"] = metricNametoReturn + metricCollections["Value"] = metricValue + + metricProps["Collections"].push(metricCollections) + metricItem["DataItems"].push(metricProps) + metricItems.push(metricItem) + #No container level limit for the given metric, so default to node level limit + else + nodeMetricsHashKey = clusterId + "/" + nodeName + "_" + "allocatable" + "_" + metricNameToCollect + if (metricCategory == "limits" && @@NodeMetrics.has_key?(nodeMetricsHashKey)) + metricValue = @@NodeMetrics[nodeMetricsHashKey] + #@Log.info("Limits not set for container #{clusterId + "/" + podUid + "/" + containerName} using node level limits: #{nodeMetricsHashKey}=#{metricValue} ") + metricItem = {} + metricItem["DataItems"] = [] + + metricProps = {} + metricProps["Timestamp"] = metricTime + metricProps["Host"] = nodeName + metricProps["ObjectName"] = "K8SContainer" + metricProps["InstanceName"] = clusterId + "/" + podUid + "/" + containerName + + metricProps["Collections"] = [] + metricCollections = {} + metricCollections["CounterName"] = metricNametoReturn + metricCollections["Value"] = metricValue + + metricProps["Collections"].push(metricCollections) + metricItem["DataItems"].push(metricProps) + metricItems.push(metricItem) end - return containerLogs + end end + end + end + rescue => error + @Log.warn("getcontainerResourceRequestsAndLimits failed: #{error} for metric #{metricCategory} #{metricNameToCollect}") + return metricItems + end + return metricItems + end #getContainerResourceRequestAndLimits - def 
getContainerResourceRequestsAndLimits(metricJSON, metricCategory, metricNameToCollect, metricNametoReturn) - metricItems = [] - begin - clusterId = getClusterId - metricInfo = metricJSON - metricInfo['items'].each do |pod| - podNameSpace = pod['metadata']['namespace'] - if podNameSpace.eql?("kube-system") && !pod['metadata'].key?("ownerReferences") - # The above case seems to be the only case where you have horizontal scaling of pods - # but no controller, in which case cAdvisor picks up kubernetes.io/config.hash - # instead of the actual poduid. Since this uid is not being surface into the UX - # its ok to use this. - # Use kubernetes.io/config.hash to be able to correlate with cadvisor data - podUid = pod['metadata']['annotations']['kubernetes.io/config.hash'] - else - podUid = pod['metadata']['uid'] - end - if (!pod['spec']['containers'].nil? && !pod['spec']['nodeName'].nil?) - nodeName = pod['spec']['nodeName'] - pod['spec']['containers'].each do |container| - containerName = container['name'] - metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z - if (!container['resources'].nil? && !container['resources'].empty? && !container['resources'][metricCategory].nil? && !container['resources'][metricCategory][metricNameToCollect].nil?) 
- metricValue = getMetricNumericValue(metricNameToCollect, container['resources'][metricCategory][metricNameToCollect]) - - metricItem = {} - metricItem['DataItems'] = [] - - metricProps = {} - metricProps['Timestamp'] = metricTime - metricProps['Host'] = nodeName - metricProps['ObjectName'] = "K8SContainer" - metricProps['InstanceName'] = clusterId + "/" + podUid + "/" + containerName - - metricProps['Collections'] = [] - metricCollections = {} - metricCollections['CounterName'] = metricNametoReturn - metricCollections['Value'] = metricValue - - metricProps['Collections'].push(metricCollections) - metricItem['DataItems'].push(metricProps) - metricItems.push(metricItem) - #No container level limit for the given metric, so default to node level limit - else - nodeMetricsHashKey = clusterId + "/" + nodeName + "_" + "allocatable" + "_" + metricNameToCollect - if (metricCategory == "limits" && @@NodeMetrics.has_key?(nodeMetricsHashKey)) - - metricValue = @@NodeMetrics[nodeMetricsHashKey] - #@Log.info("Limits not set for container #{clusterId + "/" + podUid + "/" + containerName} using node level limits: #{nodeMetricsHashKey}=#{metricValue} ") - metricItem = {} - metricItem['DataItems'] = [] - - metricProps = {} - metricProps['Timestamp'] = metricTime - metricProps['Host'] = nodeName - metricProps['ObjectName'] = "K8SContainer" - metricProps['InstanceName'] = clusterId + "/" + podUid + "/" + containerName - - metricProps['Collections'] = [] - metricCollections = {} - metricCollections['CounterName'] = metricNametoReturn - metricCollections['Value'] = metricValue - - metricProps['Collections'].push(metricCollections) - metricItem['DataItems'].push(metricProps) - metricItems.push(metricItem) - end - end - end - end - end - rescue => error - @Log.warn("getcontainerResourceRequestsAndLimits failed: #{error} for metric #{metricCategory} #{metricNameToCollect}") - return metricItems - end - return metricItems - end #getContainerResourceRequestAndLimits - - def 
parseNodeLimits(metricJSON, metricCategory, metricNameToCollect, metricNametoReturn) - metricItems = [] - begin - metricInfo = metricJSON - clusterId = getClusterId - #Since we are getting all node data at the same time and kubernetes doesnt specify a timestamp for the capacity and allocation metrics, - #if we are coming up with the time it should be same for all nodes - metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z - metricInfo['items'].each do |node| - if (!node['status'][metricCategory].nil?) - - # metricCategory can be "capacity" or "allocatable" and metricNameToCollect can be "cpu" or "memory" - metricValue = getMetricNumericValue(metricNameToCollect, node['status'][metricCategory][metricNameToCollect]) - - metricItem = {} - metricItem['DataItems'] = [] - metricProps = {} - metricProps['Timestamp'] = metricTime - metricProps['Host'] = node['metadata']['name'] - metricProps['ObjectName'] = "K8SNode" - metricProps['InstanceName'] = clusterId + "/" + node['metadata']['name'] - metricProps['Collections'] = [] - metricCollections = {} - metricCollections['CounterName'] = metricNametoReturn - metricCollections['Value'] = metricValue - - metricProps['Collections'].push(metricCollections) - metricItem['DataItems'].push(metricProps) - metricItems.push(metricItem) - #push node level metrics to a inmem hash so that we can use it looking up at container level. 
- #Currently if container level cpu & memory limits are not defined we default to node level limits - @@NodeMetrics[clusterId + "/" + node['metadata']['name'] + "_" + metricCategory + "_" + metricNameToCollect] = metricValue - #@Log.info ("Node metric hash: #{@@NodeMetrics}") - end - end - rescue => error - @Log.warn("parseNodeLimits failed: #{error} for metric #{metricCategory} #{metricNameToCollect}") - end - return metricItems - end #parseNodeLimits - - def getMetricNumericValue(metricName, metricVal) - metricValue = metricVal - begin - case metricName - when "memory" #convert to bytes for memory - #https://kubernetes.io/docs/tasks/configure-pod-container/assign-memory-resource/ - if (metricValue.end_with?("Ki")) - metricValue.chomp!("Ki") - metricValue = Float(metricValue) * 1024.0 ** 1 - elsif (metricValue.end_with?("Mi")) - metricValue.chomp!("Mi") - metricValue = Float(metricValue) * 1024.0 ** 2 - elsif (metricValue.end_with?("Gi")) - metricValue.chomp!("Gi") - metricValue = Float(metricValue) * 1024.0 ** 3 - elsif (metricValue.end_with?("Ti")) - metricValue.chomp!("Ti") - metricValue = Float(metricValue) * 1024.0 ** 4 - elsif (metricValue.end_with?("Pi")) - metricValue.chomp!("Pi") - metricValue = Float(metricValue) * 1024.0 ** 5 - elsif (metricValue.end_with?("Ei")) - metricValue.chomp!("Ei") - metricValue = Float(metricValue) * 1024.0 ** 6 - elsif (metricValue.end_with?("Zi")) - metricValue.chomp!("Zi") - metricValue = Float(metricValue) * 1024.0 ** 7 - elsif (metricValue.end_with?("Yi")) - metricValue.chomp!("Yi") - metricValue = Float(metricValue) * 1024.0 ** 8 - elsif (metricValue.end_with?("K")) - metricValue.chomp!("K") - metricValue = Float(metricValue) * 1000.0 ** 1 - elsif (metricValue.end_with?("M")) - metricValue.chomp!("M") - metricValue = Float(metricValue) * 1000.0 ** 2 - elsif (metricValue.end_with?("G")) - metricValue.chomp!("G") - metricValue = Float(metricValue) * 1000.0 ** 3 - elsif (metricValue.end_with?("T")) - metricValue.chomp!("T") 
- metricValue = Float(metricValue) * 1000.0 ** 4 - elsif (metricValue.end_with?("P")) - metricValue.chomp!("P") - metricValue = Float(metricValue) * 1000.0 ** 5 - elsif (metricValue.end_with?("E")) - metricValue.chomp!("E") - metricValue = Float(metricValue) * 1000.0 ** 6 - elsif (metricValue.end_with?("Z")) - metricValue.chomp!("Z") - metricValue = Float(metricValue) * 1000.0 ** 7 - elsif (metricValue.end_with?("Y")) - metricValue.chomp!("Y") - metricValue = Float(metricValue) * 1000.0 ** 8 - else #assuming there are no units specified, it is bytes (the below conversion will fail for other unsupported 'units') - metricValue = Float(metricValue) - end - when "cpu" #convert to nanocores for cpu - #https://kubernetes.io/docs/tasks/configure-pod-container/assign-cpu-resource/ - if (metricValue.end_with?("m")) - metricValue.chomp!("m") - metricValue = Float(metricValue) * 1000.0 ** 2 - else #assuming no units specified, it is cores that we are converting to nanocores (the below conversion will fail for other unsupported 'units') - metricValue = Float(metricValue) * 1000.0 ** 3 - end - else - @Log.warn("getMetricNumericValue: Unsupported metric #{metricName}. Returning 0 for metric value") - metricValue = 0 - end #case statement - rescue => error - @Log.warn("getMetricNumericValue failed: #{error} for metric #{metricName} with value #{metricVal}. Returning 0 formetric value") - return 0 - end - return metricValue - end # getMetricNumericValue + def parseNodeLimits(metricJSON, metricCategory, metricNameToCollect, metricNametoReturn) + metricItems = [] + begin + metricInfo = metricJSON + clusterId = getClusterId + #Since we are getting all node data at the same time and kubernetes doesnt specify a timestamp for the capacity and allocation metrics, + #if we are coming up with the time it should be same for all nodes + metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z + metricInfo["items"].each do |node| + if (!node["status"][metricCategory].nil?) 
+ + # metricCategory can be "capacity" or "allocatable" and metricNameToCollect can be "cpu" or "memory" + metricValue = getMetricNumericValue(metricNameToCollect, node["status"][metricCategory][metricNameToCollect]) + + metricItem = {} + metricItem["DataItems"] = [] + metricProps = {} + metricProps["Timestamp"] = metricTime + metricProps["Host"] = node["metadata"]["name"] + metricProps["ObjectName"] = "K8SNode" + metricProps["InstanceName"] = clusterId + "/" + node["metadata"]["name"] + metricProps["Collections"] = [] + metricCollections = {} + metricCollections["CounterName"] = metricNametoReturn + metricCollections["Value"] = metricValue + + metricProps["Collections"].push(metricCollections) + metricItem["DataItems"].push(metricProps) + metricItems.push(metricItem) + #push node level metrics to a inmem hash so that we can use it looking up at container level. + #Currently if container level cpu & memory limits are not defined we default to node level limits + @@NodeMetrics[clusterId + "/" + node["metadata"]["name"] + "_" + metricCategory + "_" + metricNameToCollect] = metricValue + #@Log.info ("Node metric hash: #{@@NodeMetrics}") + end end - end + rescue => error + @Log.warn("parseNodeLimits failed: #{error} for metric #{metricCategory} #{metricNameToCollect}") + end + return metricItems + end #parseNodeLimits + def getMetricNumericValue(metricName, metricVal) + metricValue = metricVal + begin + case metricName + when "memory" #convert to bytes for memory + #https://kubernetes.io/docs/tasks/configure-pod-container/assign-memory-resource/ + if (metricValue.end_with?("Ki")) + metricValue.chomp!("Ki") + metricValue = Float(metricValue) * 1024.0 ** 1 + elsif (metricValue.end_with?("Mi")) + metricValue.chomp!("Mi") + metricValue = Float(metricValue) * 1024.0 ** 2 + elsif (metricValue.end_with?("Gi")) + metricValue.chomp!("Gi") + metricValue = Float(metricValue) * 1024.0 ** 3 + elsif (metricValue.end_with?("Ti")) + metricValue.chomp!("Ti") + metricValue = 
Float(metricValue) * 1024.0 ** 4 + elsif (metricValue.end_with?("Pi")) + metricValue.chomp!("Pi") + metricValue = Float(metricValue) * 1024.0 ** 5 + elsif (metricValue.end_with?("Ei")) + metricValue.chomp!("Ei") + metricValue = Float(metricValue) * 1024.0 ** 6 + elsif (metricValue.end_with?("Zi")) + metricValue.chomp!("Zi") + metricValue = Float(metricValue) * 1024.0 ** 7 + elsif (metricValue.end_with?("Yi")) + metricValue.chomp!("Yi") + metricValue = Float(metricValue) * 1024.0 ** 8 + elsif (metricValue.end_with?("K")) + metricValue.chomp!("K") + metricValue = Float(metricValue) * 1000.0 ** 1 + elsif (metricValue.end_with?("M")) + metricValue.chomp!("M") + metricValue = Float(metricValue) * 1000.0 ** 2 + elsif (metricValue.end_with?("G")) + metricValue.chomp!("G") + metricValue = Float(metricValue) * 1000.0 ** 3 + elsif (metricValue.end_with?("T")) + metricValue.chomp!("T") + metricValue = Float(metricValue) * 1000.0 ** 4 + elsif (metricValue.end_with?("P")) + metricValue.chomp!("P") + metricValue = Float(metricValue) * 1000.0 ** 5 + elsif (metricValue.end_with?("E")) + metricValue.chomp!("E") + metricValue = Float(metricValue) * 1000.0 ** 6 + elsif (metricValue.end_with?("Z")) + metricValue.chomp!("Z") + metricValue = Float(metricValue) * 1000.0 ** 7 + elsif (metricValue.end_with?("Y")) + metricValue.chomp!("Y") + metricValue = Float(metricValue) * 1000.0 ** 8 + else #assuming there are no units specified, it is bytes (the below conversion will fail for other unsupported 'units') + metricValue = Float(metricValue) + end + when "cpu" #convert to nanocores for cpu + #https://kubernetes.io/docs/tasks/configure-pod-container/assign-cpu-resource/ + if (metricValue.end_with?("m")) + metricValue.chomp!("m") + metricValue = Float(metricValue) * 1000.0 ** 2 + else #assuming no units specified, it is cores that we are converting to nanocores (the below conversion will fail for other unsupported 'units') + metricValue = Float(metricValue) * 1000.0 ** 3 + end + else + 
@Log.warn("getMetricNumericValue: Unsupported metric #{metricName}. Returning 0 for metric value") + metricValue = 0 + end #case statement + rescue => error + @Log.warn("getMetricNumericValue failed: #{error} for metric #{metricName} with value #{metricVal}. Returning 0 formetric value") + return 0 + end + return metricValue + end # getMetricNumericValue + end +end diff --git a/source/code/plugin/in_cadvisor_perf.rb b/source/code/plugin/in_cadvisor_perf.rb index a857aa6b9..f5f65f01b 100644 --- a/source/code/plugin/in_cadvisor_perf.rb +++ b/source/code/plugin/in_cadvisor_perf.rb @@ -2,90 +2,88 @@ # frozen_string_literal: true module Fluent - - class CAdvisor_Perf_Input < Input - Plugin.register_input('cadvisorperf', self) - - def initialize - super - require 'yaml' - require 'json' - - require_relative 'CAdvisorMetricsAPIClient' - require_relative 'oms_common' - require_relative 'omslog' - end - - config_param :run_interval, :time, :default => '1m' - config_param :tag, :string, :default => "oms.api.cadvisorperf" - config_param :mdmtag, :string, :default => "mdm.cadvisorperf" - - def configure (conf) - super + class CAdvisor_Perf_Input < Input + Plugin.register_input("cadvisorperf", self) + + def initialize + super + require "yaml" + require "json" + + require_relative "CAdvisorMetricsAPIClient" + require_relative "oms_common" + require_relative "omslog" + end + + config_param :run_interval, :time, :default => "1m" + config_param :tag, :string, :default => "oms.api.cadvisorperf" + config_param :mdmtag, :string, :default => "mdm.cadvisorperf" + + def configure(conf) + super + end + + def start + if @run_interval + @finished = false + @condition = ConditionVariable.new + @mutex = Mutex.new + @thread = Thread.new(&method(:run_periodic)) end - - def start - if @run_interval - @finished = false - @condition = ConditionVariable.new - @mutex = Mutex.new - @thread = Thread.new(&method(:run_periodic)) - end + end + + def shutdown + if @run_interval + @mutex.synchronize { + 
@finished = true + @condition.signal + } + @thread.join end - - def shutdown - if @run_interval - @mutex.synchronize { - @finished = true - @condition.signal - } - @thread.join + end + + def enumerate() + time = Time.now.to_f + begin + eventStream = MultiEventStream.new + metricData = CAdvisorMetricsAPIClient.getMetrics() + metricData.each do |record| + record["DataType"] = "LINUX_PERF_BLOB" + record["IPName"] = "LogManagement" + eventStream.add(time, record) if record + #router.emit(@tag, time, record) if record end - end - - def enumerate() - time = Time.now.to_f - begin - eventStream = MultiEventStream.new - metricData = CAdvisorMetricsAPIClient.getMetrics() - metricData.each do |record| - record['DataType'] = "LINUX_PERF_BLOB" - record['IPName'] = "LogManagement" - eventStream.add(time, record) if record - #router.emit(@tag, time, record) if record - end - - router.emit_stream(@tag, eventStream) if eventStream - router.emit_stream(@mdmtag, eventStream) if eventStream - @@istestvar = ENV['ISTEST'] - if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp('true') == 0 && eventStream.count > 0) - $log.info("cAdvisorPerfEmitStreamSuccess @ #{Time.now.utc.iso8601}") - end - rescue => errorStr - $log.warn "Failed to retrieve cadvisor metric data: #{errorStr}" - $log.debug_backtrace(errorStr.backtrace) + + router.emit_stream(@tag, eventStream) if eventStream + router.emit_stream(@mdmtag, eventStream) if eventStream + @@istestvar = ENV["ISTEST"] + if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0 && eventStream.count > 0) + $log.info("cAdvisorPerfEmitStreamSuccess @ #{Time.now.utc.iso8601}") end + rescue => errorStr + $log.warn "Failed to retrieve cadvisor metric data: #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) end - - def run_periodic - @mutex.lock + end + + def run_periodic + @mutex.lock + done = @finished + until done + @condition.wait(@mutex, @run_interval) done = @finished - until done - @condition.wait(@mutex, @run_interval) - done = @finished - @mutex.unlock - if !done - begin - $log.info("in_cadvisor_perf::run_periodic @ #{Time.now.utc.iso8601}") - enumerate - rescue => errorStr - $log.warn "in_cadvisor_perf::run_periodic: enumerate Failed to retrieve cadvisor perf metrics: #{errorStr}" - end + @mutex.unlock + if !done + begin + $log.info("in_cadvisor_perf::run_periodic @ #{Time.now.utc.iso8601}") + enumerate + rescue => errorStr + $log.warn "in_cadvisor_perf::run_periodic: enumerate Failed to retrieve cadvisor perf metrics: #{errorStr}" end - @mutex.lock end - @mutex.unlock + @mutex.lock end - end # CAdvisor_Perf_Input + @mutex.unlock + end + end # CAdvisor_Perf_Input end # module - diff --git a/source/code/plugin/in_containerinventory.rb b/source/code/plugin/in_containerinventory.rb index f501421a2..4d83278a9 100644 --- a/source/code/plugin/in_containerinventory.rb +++ b/source/code/plugin/in_containerinventory.rb @@ -2,29 +2,28 @@ # frozen_string_literal: true module Fluent - class Container_Inventory_Input < Input - Plugin.register_input('containerinventory', self) + Plugin.register_input("containerinventory", self) - @@PluginName = 'ContainerInventory' - @@RunningState = 'Running' - @@FailedState = 'Failed' - @@StoppedState = 'Stopped' - @@PausedState = 'Paused' + @@PluginName = "ContainerInventory" + @@RunningState = "Running" + @@FailedState = "Failed" + @@StoppedState = "Stopped" + @@PausedState = "Paused" def initialize super - require 'json' - require_relative 'DockerApiClient' - 
require_relative 'ContainerInventoryState' - require_relative 'ApplicationInsightsUtility' - require_relative 'omslog' + require "json" + require_relative "DockerApiClient" + require_relative "ContainerInventoryState" + require_relative "ApplicationInsightsUtility" + require_relative "omslog" end - config_param :run_interval, :time, :default => '1m' + config_param :run_interval, :time, :default => "1m" config_param :tag, :string, :default => "oms.containerinsights.containerinventory" - - def configure (conf) + + def configure(conf) super end @@ -50,16 +49,16 @@ def shutdown def obtainContainerConfig(instance, container) begin - configValue = container['Config'] + configValue = container["Config"] if !configValue.nil? - instance['ContainerHostname'] = configValue['Hostname'] + instance["ContainerHostname"] = configValue["Hostname"] - envValue = configValue['Env'] + envValue = configValue["Env"] envValueString = (envValue.nil?) ? "" : envValue.to_s # Skip environment variable processing if it contains the flag AZMON_COLLECT_ENV=FALSE if /AZMON_COLLECT_ENV=FALSE/i.match(envValueString) envValueString = ["AZMON_COLLECT_ENV=FALSE"] - $log.warn("Environment Variable collection for container: #{container['Id']} skipped because AZMON_COLLECT_ENV is set to false") + $log.warn("Environment Variable collection for container: #{container["Id"]} skipped because AZMON_COLLECT_ENV is set to false") end # Restricting the ENV string value to 200kb since the size of this string can go very high if envValueString.length > 200000 @@ -68,88 +67,88 @@ def obtainContainerConfig(instance, container) if !lastIndex.nil? 
envValueStringTruncated = envValueStringTruncated.slice(0..lastIndex) + "]" end - instance['EnvironmentVar'] = envValueStringTruncated + instance["EnvironmentVar"] = envValueStringTruncated else - instance['EnvironmentVar'] = envValueString + instance["EnvironmentVar"] = envValueString end - cmdValue = configValue['Cmd'] + cmdValue = configValue["Cmd"] cmdValueString = (cmdValue.nil?) ? "" : cmdValue.to_s - instance['Command'] = cmdValueString + instance["Command"] = cmdValueString - instance['ComposeGroup'] = "" - labelsValue = configValue['Labels'] + instance["ComposeGroup"] = "" + labelsValue = configValue["Labels"] if !labelsValue.nil? && !labelsValue.empty? - instance['ComposeGroup'] = labelsValue['com.docker.compose.project'] + instance["ComposeGroup"] = labelsValue["com.docker.compose.project"] end else - $log.warn("Attempt in ObtainContainerConfig to get container: #{container['Id']} config information returned null") - end - rescue => errorStr - $log.warn("Exception in obtainContainerConfig: #{errorStr}") + $log.warn("Attempt in ObtainContainerConfig to get container: #{container["Id"]} config information returned null") end + rescue => errorStr + $log.warn("Exception in obtainContainerConfig: #{errorStr}") + end end def obtainContainerState(instance, container) begin - stateValue = container['State'] + stateValue = container["State"] if !stateValue.nil? 
- exitCodeValue = stateValue['ExitCode'] + exitCodeValue = stateValue["ExitCode"] # Exit codes less than 0 are not supported by the engine if exitCodeValue < 0 - exitCodeValue = 128 - $log.info("obtainContainerState::Container: #{container['Id']} returned negative exit code") + exitCodeValue = 128 + $log.info("obtainContainerState::Container: #{container["Id"]} returned negative exit code") end - instance['ExitCode'] = exitCodeValue + instance["ExitCode"] = exitCodeValue if exitCodeValue > 0 - instance['State'] = @@FailedState + instance["State"] = @@FailedState else # Set the Container status : Running/Paused/Stopped - runningValue = stateValue['Running'] + runningValue = stateValue["Running"] if runningValue - pausedValue = stateValue['Paused'] + pausedValue = stateValue["Paused"] # Checking for paused within running is true state because docker returns true for both Running and Paused fields when the container is paused if pausedValue - instance['State'] = @@PausedState + instance["State"] = @@PausedState else - instance['State'] = @@RunningState + instance["State"] = @@RunningState end else - instance['State'] = @@StoppedState + instance["State"] = @@StoppedState end end - instance['StartedTime'] = stateValue['StartedAt'] - instance['FinishedTime'] = stateValue['FinishedAt'] + instance["StartedTime"] = stateValue["StartedAt"] + instance["FinishedTime"] = stateValue["FinishedAt"] else - $log.info("Attempt in ObtainContainerState to get container: #{container['Id']} state information returned null") + $log.info("Attempt in ObtainContainerState to get container: #{container["Id"]} state information returned null") end - rescue => errorStr - $log.warn("Exception in obtainContainerState: #{errorStr}") + rescue => errorStr + $log.warn("Exception in obtainContainerState: #{errorStr}") end end def obtainContainerHostConfig(instance, container) begin - hostConfig = container['HostConfig'] + hostConfig = container["HostConfig"] if !hostConfig.nil? 
- links = hostConfig['Links'] - instance['Links'] = "" + links = hostConfig["Links"] + instance["Links"] = "" if !links.nil? linksString = links.to_s - instance['Links'] = (linksString == "null")? "" : linksString + instance["Links"] = (linksString == "null") ? "" : linksString end - portBindings = hostConfig['PortBindings'] - instance['Ports'] = "" + portBindings = hostConfig["PortBindings"] + instance["Ports"] = "" if !portBindings.nil? portBindingsString = portBindings.to_s - instance['Ports'] = (portBindingsString == "null")? "" : portBindingsString + instance["Ports"] = (portBindingsString == "null") ? "" : portBindingsString end else - $log.info("Attempt in ObtainContainerHostConfig to get container: #{container['Id']} host config information returned null") - end - rescue => errorStr - $log.warn("Exception in obtainContainerHostConfig: #{errorStr}") + $log.info("Attempt in ObtainContainerHostConfig to get container: #{container["Id"]} host config information returned null") end + rescue => errorStr + $log.warn("Exception in obtainContainerHostConfig: #{errorStr}") + end end def inspectContainer(id, nameMap) @@ -157,29 +156,29 @@ def inspectContainer(id, nameMap) begin container = DockerApiClient.dockerInspectContainer(id) if !container.nil? && !container.empty? - containerInstance['InstanceID'] = container['Id'] - containerInstance['CreatedTime'] = container['Created'] - containerName = container['Name'] + containerInstance["InstanceID"] = container["Id"] + containerInstance["CreatedTime"] = container["Created"] + containerName = container["Name"] if !containerName.nil? && !containerName.empty? # Remove the leading / from the name if it exists (this is an API issue) - containerInstance['ElementName'] = (containerName[0] == '/') ? containerName[1..-1] : containerName + containerInstance["ElementName"] = (containerName[0] == "/") ? containerName[1..-1] : containerName end - imageValue = container['Image'] + imageValue = container["Image"] if !imageValue.nil? 
&& !imageValue.empty? - containerInstance['ImageId'] = imageValue + containerInstance["ImageId"] = imageValue repoImageTagArray = nameMap[imageValue] if nameMap.has_key? imageValue - containerInstance['Repository'] = repoImageTagArray[0] - containerInstance['Image'] = repoImageTagArray[1] - containerInstance['ImageTag'] = repoImageTagArray[2] + containerInstance["Repository"] = repoImageTagArray[0] + containerInstance["Image"] = repoImageTagArray[1] + containerInstance["ImageTag"] = repoImageTagArray[2] end end - obtainContainerConfig(containerInstance, container); - obtainContainerState(containerInstance, container); - obtainContainerHostConfig(containerInstance, container); + obtainContainerConfig(containerInstance, container) + obtainContainerState(containerInstance, container) + obtainContainerHostConfig(containerInstance, container) end rescue => errorStr - $log.warn("Exception in inspectContainer: #{errorStr} for container: #{id}") + $log.warn("Exception in inspectContainer: #{errorStr} for container: #{id}") end return containerInstance end @@ -199,8 +198,8 @@ def enumerate containerIds.each do |containerId| inspectedContainer = {} inspectedContainer = inspectContainer(containerId, nameMap) - inspectedContainer['Computer'] = hostname - inspectedContainer['CollectionTime'] = batchTime #This is the time that is mapped to become TimeGenerated + inspectedContainer["Computer"] = hostname + inspectedContainer["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated containerInventory.push inspectedContainer ContainerInventoryState.writeContainerState(inspectedContainer) end @@ -210,8 +209,8 @@ def enumerate deletedContainers.each do |deletedContainer| container = ContainerInventoryState.readContainerState(deletedContainer) if !container.nil? 
- container.each{|k,v| container[k]=v} - container['State'] = "Deleted" + container.each { |k, v| container[k] = v } + container["State"] = "Deleted" containerInventory.push container end end @@ -219,28 +218,28 @@ def enumerate containerInventory.each do |record| wrapper = { - "DataType"=>"CONTAINER_INVENTORY_BLOB", - "IPName"=>"ContainerInsights", - "DataItems"=>[record.each{|k,v| record[k]=v}] + "DataType" => "CONTAINER_INVENTORY_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [record.each { |k, v| record[k] = v }], } eventStream.add(emitTime, wrapper) if wrapper end router.emit_stream(@tag, eventStream) if eventStream - @@istestvar = ENV['ISTEST'] - if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp('true') == 0 && eventStream.count > 0) + @@istestvar = ENV["ISTEST"] + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && eventStream.count > 0) $log.info("containerInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") end - timeDifference = (DateTime.now.to_time.to_i - @@telemetryTimeTracker).abs - timeDifferenceInMinutes = timeDifference/60 - if (timeDifferenceInMinutes >= 5) - @@telemetryTimeTracker = DateTime.now.to_time.to_i - telemetryProperties = {} - telemetryProperties['Computer'] = hostname - telemetryProperties['ContainerCount'] = containerInventory.length - ApplicationInsightsUtility.sendTelemetry(@@PluginName, telemetryProperties) - end $log.info("in_container_inventory::enumerate : Processing complete - emitted stream @ #{Time.now.utc.iso8601}") end + timeDifference = (DateTime.now.to_time.to_i - @@telemetryTimeTracker).abs + timeDifferenceInMinutes = timeDifference / 60 + if (timeDifferenceInMinutes >= 5) + @@telemetryTimeTracker = DateTime.now.to_time.to_i + telemetryProperties = {} + telemetryProperties["Computer"] = hostname + telemetryProperties["ContainerCount"] = containerInventory.length + ApplicationInsightsUtility.sendTelemetry(@@PluginName, telemetryProperties) + end rescue => 
errorStr $log.warn("Exception in enumerate container inventory: #{errorStr}") end @@ -265,7 +264,5 @@ def run_periodic end @mutex.unlock end - end # Container_Inventory_Input - -end # module \ No newline at end of file +end # module diff --git a/source/code/plugin/in_kube_nodes.rb b/source/code/plugin/in_kube_nodes.rb index ba1dacbe0..aabda441e 100644 --- a/source/code/plugin/in_kube_nodes.rb +++ b/source/code/plugin/in_kube_nodes.rb @@ -2,181 +2,176 @@ # frozen_string_literal: true module Fluent + class Kube_nodeInventory_Input < Input + Plugin.register_input("kubenodeinventory", self) - class Kube_nodeInventory_Input < Input - Plugin.register_input('kubenodeinventory', self) - - @@ContainerNodeInventoryTag = 'oms.containerinsights.ContainerNodeInventory' - @@MDMKubeNodeInventoryTag = 'mdm.kubenodeinventory' + @@ContainerNodeInventoryTag = "oms.containerinsights.ContainerNodeInventory" + @@MDMKubeNodeInventoryTag = "mdm.kubenodeinventory" - def initialize - super - require 'yaml' - require 'json' - - require_relative 'KubernetesApiClient' - require_relative 'ApplicationInsightsUtility' - require_relative 'oms_common' - require_relative 'omslog' + def initialize + super + require "yaml" + require "json" + require_relative "KubernetesApiClient" + require_relative "ApplicationInsightsUtility" + require_relative "oms_common" + require_relative "omslog" + end + + config_param :run_interval, :time, :default => "1m" + config_param :tag, :string, :default => "oms.containerinsights.KubeNodeInventory" + + def configure(conf) + super + end + + def start + if @run_interval + @finished = false + @condition = ConditionVariable.new + @mutex = Mutex.new + @thread = Thread.new(&method(:run_periodic)) + @@nodeTelemetryTimeTracker = DateTime.now.to_time.to_i end - - config_param :run_interval, :time, :default => '1m' - config_param :tag, :string, :default => "oms.containerinsights.KubeNodeInventory" - - def configure (conf) - super - end - - def start - if @run_interval - @finished 
= false - @condition = ConditionVariable.new - @mutex = Mutex.new - @thread = Thread.new(&method(:run_periodic)) - @@nodeTelemetryTimeTracker = DateTime.now.to_time.to_i - end - end - - def shutdown - if @run_interval - @mutex.synchronize { - @finished = true - @condition.signal - } - @thread.join - end - end - - def enumerate - currentTime = Time.now - emitTime = currentTime.to_f - batchTime = currentTime.utc.iso8601 - telemetrySent = false - $log.info("in_kube_nodes::enumerate : Getting nodes from Kube API @ #{Time.now.utc.iso8601}") - nodeInventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo('nodes').body) - $log.info("in_kube_nodes::enumerate : Done getting nodes from Kube API @ #{Time.now.utc.iso8601}") - begin - if(!nodeInventory.empty?) - eventStream = MultiEventStream.new - containerNodeInventoryEventStream = MultiEventStream.new - #get node inventory - nodeInventory['items'].each do |items| - record = {} - # Sending records for ContainerNodeInventory - containerNodeInventoryRecord = {} - containerNodeInventoryRecord['CollectionTime'] = batchTime #This is the time that is mapped to become TimeGenerated - containerNodeInventoryRecord['Computer'] = items['metadata']['name'] + end - record['CollectionTime'] = batchTime #This is the time that is mapped to become TimeGenerated - record['Computer'] = items['metadata']['name'] - record['ClusterName'] = KubernetesApiClient.getClusterName - record['ClusterId'] = KubernetesApiClient.getClusterId - record['CreationTimeStamp'] = items['metadata']['creationTimestamp'] - record['Labels'] = [items['metadata']['labels']] - record['Status'] = "" + def shutdown + if @run_interval + @mutex.synchronize { + @finished = true + @condition.signal + } + @thread.join + end + end - # Refer to https://kubernetes.io/docs/concepts/architecture/nodes/#condition for possible node conditions. - # We check the status of each condition e.g. {"type": "OutOfDisk","status": "False"} . 
Based on this we - # populate the KubeNodeInventory Status field. A possible value for this field could be "Ready OutofDisk" - # implying that the node is ready for hosting pods, however its out of disk. - - if items['status'].key?("conditions") && !items['status']['conditions'].empty? - allNodeConditions="" - items['status']['conditions'].each do |condition| - if condition['status'] == "True" - if !allNodeConditions.empty? - allNodeConditions = allNodeConditions + "," + condition['type'] - else - allNodeConditions = condition['type'] - end - end - #collect last transition to/from ready (no matter ready is true/false) - if condition['type'] == "Ready" && !condition['lastTransitionTime'].nil? - record['LastTransitionTimeReady'] = condition['lastTransitionTime'] - end - end - if !allNodeConditions.empty? - record['Status'] = allNodeConditions - end + def enumerate + currentTime = Time.now + emitTime = currentTime.to_f + batchTime = currentTime.utc.iso8601 + telemetrySent = false + $log.info("in_kube_nodes::enumerate : Getting nodes from Kube API @ #{Time.now.utc.iso8601}") + nodeInventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo("nodes").body) + $log.info("in_kube_nodes::enumerate : Done getting nodes from Kube API @ #{Time.now.utc.iso8601}") + begin + if (!nodeInventory.empty?) 
+ eventStream = MultiEventStream.new + containerNodeInventoryEventStream = MultiEventStream.new + #get node inventory + nodeInventory["items"].each do |items| + record = {} + # Sending records for ContainerNodeInventory + containerNodeInventoryRecord = {} + containerNodeInventoryRecord["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated + containerNodeInventoryRecord["Computer"] = items["metadata"]["name"] - end + record["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated + record["Computer"] = items["metadata"]["name"] + record["ClusterName"] = KubernetesApiClient.getClusterName + record["ClusterId"] = KubernetesApiClient.getClusterId + record["CreationTimeStamp"] = items["metadata"]["creationTimestamp"] + record["Labels"] = [items["metadata"]["labels"]] + record["Status"] = "" - nodeInfo = items['status']['nodeInfo'] - record['KubeletVersion'] = nodeInfo['kubeletVersion'] - record['KubeProxyVersion'] = nodeInfo['kubeProxyVersion'] - containerNodeInventoryRecord['OperatingSystem'] = nodeInfo['osImage'] - dockerVersion = nodeInfo['containerRuntimeVersion'] - dockerVersion.slice! "docker://" - containerNodeInventoryRecord['DockerVersion'] = dockerVersion - # ContainerNodeInventory data for docker version and operating system. - containerNodeInventoryWrapper = { - "DataType"=>"CONTAINER_NODE_INVENTORY_BLOB", - "IPName"=>"ContainerInsights", - "DataItems"=>[containerNodeInventoryRecord.each{|k,v| containerNodeInventoryRecord[k]=v}] - } - containerNodeInventoryEventStream.add(emitTime, containerNodeInventoryWrapper) if containerNodeInventoryWrapper + # Refer to https://kubernetes.io/docs/concepts/architecture/nodes/#condition for possible node conditions. + # We check the status of each condition e.g. {"type": "OutOfDisk","status": "False"} . Based on this we + # populate the KubeNodeInventory Status field. 
A possible value for this field could be "Ready OutofDisk" + # implying that the node is ready for hosting pods, however its out of disk. - wrapper = { - "DataType"=>"KUBE_NODE_INVENTORY_BLOB", - "IPName"=>"ContainerInsights", - "DataItems"=>[record.each{|k,v| record[k]=v}] - } - eventStream.add(emitTime, wrapper) if wrapper - # Adding telemetry to send node telemetry every 5 minutes - timeDifference = (DateTime.now.to_time.to_i - @@nodeTelemetryTimeTracker).abs - timeDifferenceInMinutes = timeDifference/60 - if (timeDifferenceInMinutes >= 5) - properties = {} - properties["Computer"] = record["Computer"] - properties["KubeletVersion"] = record["KubeletVersion"] - capacityInfo = items['status']['capacity'] - ApplicationInsightsUtility.sendMetricTelemetry("NodeCoreCapacity", capacityInfo["cpu"] , properties) - ApplicationInsightsUtility.sendMetricTelemetry("NodeMemory", capacityInfo["memory"] , properties) - telemetrySent = true - end - end - router.emit_stream(@tag, eventStream) if eventStream - router.emit_stream(@@MDMKubeNodeInventoryTag, eventStream) if eventStream - router.emit_stream(@@ContainerNodeInventoryTag, containerNodeInventoryEventStream) if containerNodeInventoryEventStream - if telemetrySent == true - @@nodeTelemetryTimeTracker = DateTime.now.to_time.to_i + if items["status"].key?("conditions") && !items["status"]["conditions"].empty? + allNodeConditions = "" + items["status"]["conditions"].each do |condition| + if condition["status"] == "True" + if !allNodeConditions.empty? + allNodeConditions = allNodeConditions + "," + condition["type"] + else + allNodeConditions = condition["type"] + end end - @@istestvar = ENV['ISTEST'] - if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp('true') == 0 && eventStream.count > 0) - $log.info("kubeNodeInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") + #collect last transition to/from ready (no matter ready is true/false) + if condition["type"] == "Ready" && !condition["lastTransitionTime"].nil? + record["LastTransitionTimeReady"] = condition["lastTransitionTime"] end - end - rescue => errorStr - $log.warn "Failed to retrieve node inventory: #{errorStr}" - $log.debug_backtrace(errorStr.backtrace) - ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) - end - end - - def run_periodic - @mutex.lock - done = @finished - until done - @condition.wait(@mutex, @run_interval) - done = @finished - @mutex.unlock - if !done - begin - $log.info("in_kube_nodes::run_periodic @ #{Time.now.utc.iso8601}") - enumerate - rescue => errorStr - $log.warn "in_kube_nodes::run_periodic: enumerate Failed to retrieve node inventory: #{errorStr}" - ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + if !allNodeConditions.empty? + record["Status"] = allNodeConditions + end end + + nodeInfo = items["status"]["nodeInfo"] + record["KubeletVersion"] = nodeInfo["kubeletVersion"] + record["KubeProxyVersion"] = nodeInfo["kubeProxyVersion"] + containerNodeInventoryRecord["OperatingSystem"] = nodeInfo["osImage"] + dockerVersion = nodeInfo["containerRuntimeVersion"] + dockerVersion.slice! "docker://" + containerNodeInventoryRecord["DockerVersion"] = dockerVersion + # ContainerNodeInventory data for docker version and operating system. 
+ containerNodeInventoryWrapper = { + "DataType" => "CONTAINER_NODE_INVENTORY_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [containerNodeInventoryRecord.each { |k, v| containerNodeInventoryRecord[k] = v }], + } + containerNodeInventoryEventStream.add(emitTime, containerNodeInventoryWrapper) if containerNodeInventoryWrapper + + wrapper = { + "DataType" => "KUBE_NODE_INVENTORY_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [record.each { |k, v| record[k] = v }], + } + eventStream.add(emitTime, wrapper) if wrapper + # Adding telemetry to send node telemetry every 5 minutes + timeDifference = (DateTime.now.to_time.to_i - @@nodeTelemetryTimeTracker).abs + timeDifferenceInMinutes = timeDifference / 60 + if (timeDifferenceInMinutes >= 5) + properties = {} + properties["Computer"] = record["Computer"] + properties["KubeletVersion"] = record["KubeletVersion"] + properties["OperatingSystem"] = nodeInfo["operatingSystem"] + properties["DockerVersion"] = dockerVersion + capacityInfo = items["status"]["capacity"] + ApplicationInsightsUtility.sendMetricTelemetry("NodeCoreCapacity", capacityInfo["cpu"], properties) + ApplicationInsightsUtility.sendMetricTelemetry("NodeMemory", capacityInfo["memory"], properties) + telemetrySent = true + end + end + router.emit_stream(@tag, eventStream) if eventStream + router.emit_stream(@@MDMKubeNodeInventoryTag, eventStream) if eventStream + router.emit_stream(@@ContainerNodeInventoryTag, containerNodeInventoryEventStream) if containerNodeInventoryEventStream + if telemetrySent == true + @@nodeTelemetryTimeTracker = DateTime.now.to_time.to_i + end + @@istestvar = ENV["ISTEST"] + if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0 && eventStream.count > 0) + $log.info("kubeNodeInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") end - @mutex.lock end + rescue => errorStr + $log.warn "Failed to retrieve node inventory: #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + end + + def run_periodic + @mutex.lock + done = @finished + until done + @condition.wait(@mutex, @run_interval) + done = @finished @mutex.unlock + if !done + begin + $log.info("in_kube_nodes::run_periodic @ #{Time.now.utc.iso8601}") + enumerate + rescue => errorStr + $log.warn "in_kube_nodes::run_periodic: enumerate Failed to retrieve node inventory: #{errorStr}" + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + end + @mutex.lock end - - end # Kube_Node_Input - - end # module - - \ No newline at end of file + @mutex.unlock + end + end # Kube_Node_Input +end # module diff --git a/source/code/plugin/in_kube_podinventory.rb b/source/code/plugin/in_kube_podinventory.rb index 3d026b05f..65573673c 100644 --- a/source/code/plugin/in_kube_podinventory.rb +++ b/source/code/plugin/in_kube_podinventory.rb @@ -2,29 +2,28 @@ # frozen_string_literal: true module Fluent - class Kube_PodInventory_Input < Input - Plugin.register_input('kubepodinventory', self) + Plugin.register_input("kubepodinventory", self) - @@MDMKubePodInventoryTag = 'mdm.kubepodinventory' + @@MDMKubePodInventoryTag = "mdm.kubepodinventory" + @@hostName = (OMS::Common.get_hostname) def initialize super - require 'yaml' - require 'json' - require 'set' - - require_relative 'KubernetesApiClient' - require_relative 'ApplicationInsightsUtility' - require_relative 'oms_common' - require_relative 'omslog' + require "yaml" + require "json" + require "set" + require_relative "KubernetesApiClient" + require_relative "ApplicationInsightsUtility" + require_relative "oms_common" + require_relative "omslog" end - config_param :run_interval, :time, :default 
=> '1m' + config_param :run_interval, :time, :default => "1m" config_param :tag, :string, :default => "oms.containerinsights.KubePodInventory" - def configure (conf) + def configure(conf) super end @@ -48,29 +47,126 @@ def shutdown end end - def enumerate(podList = nil) - if podList.nil? - $log.info("in_kube_podinventory::enumerate : Getting pods from Kube API @ #{Time.now.utc.iso8601}") - podInventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo('pods').body) - $log.info("in_kube_podinventory::enumerate : Done getting pods from Kube API @ #{Time.now.utc.iso8601}") + def enumerate(podList = nil) + if podList.nil? + $log.info("in_kube_podinventory::enumerate : Getting pods from Kube API @ #{Time.now.utc.iso8601}") + podInventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo("pods").body) + $log.info("in_kube_podinventory::enumerate : Done getting pods from Kube API @ #{Time.now.utc.iso8601}") + else + podInventory = podList + end + begin + if (!podInventory.empty? && podInventory.key?("items") && !podInventory["items"].empty?) 
+ #get pod inventory & services + $log.info("in_kube_podinventory::enumerate : Getting services from Kube API @ #{Time.now.utc.iso8601}") + serviceList = JSON.parse(KubernetesApiClient.getKubeResourceInfo("services").body) + $log.info("in_kube_podinventory::enumerate : Done getting services from Kube API @ #{Time.now.utc.iso8601}") + parse_and_emit_records(podInventory, serviceList) else - podInventory = podList + $log.warn "Received empty podInventory" + end + rescue => errorStr + $log.warn "Failed in enumerate pod inventory: #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + end + + def populateWindowsContainerInventoryRecord(container, record, containerEnvVariableHash, batchTime) + begin + containerInventoryRecord = {} + containerName = container["name"] + containerInventoryRecord["InstanceID"] = record["ContainerID"] + containerInventoryRecord["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated + containerInventoryRecord["Computer"] = record["Computer"] + containerInventoryRecord["ContainerHostname"] = record["Computer"] + containerInventoryRecord["ElementName"] = containerName + image = container["image"] + repoInfo = image.split("/") + if !repoInfo.nil? + containerInventoryRecord["Repository"] = repoInfo[0] + if !repoInfo[1].nil? + imageInfo = repoInfo[1].split(":") + if !imageInfo.nil? + containerInventoryRecord["Image"] = imageInfo[0] + containerInventoryRecord["ImageTag"] = imageInfo[1] + end + end + end + imageIdInfo = container["imageID"] + imageIdSplitInfo = imageIdInfo.split("@") + if !imageIdSplitInfo.nil? 
+ containerInventoryRecord["ImageId"] = imageIdSplitInfo[1] + end + # Get container state + containerStatus = container["state"] + if containerStatus.keys[0] == "running" + containerInventoryRecord["State"] = "Running" + containerInventoryRecord["StartedTime"] = container["state"]["running"]["startedAt"] + elsif containerStatus.keys[0] == "terminated" + containerExitCode = container["state"]["terminated"]["exitCode"] + containerStartTime = container["state"]["terminated"]["startedAt"] + containerFinishTime = container["state"]["terminated"]["finishedAt"] + if containerExitCode < 0 + # Exit codes less than 0 are not supported by the engine + containerExitCode = 128 + end + if containerExitCode > 0 + containerInventoryRecord["State"] = "Failed" + else + containerInventoryRecord["State"] = "Stopped" + end + containerInventoryRecord["ExitCode"] = containerExitCode + containerInventoryRecord["StartedTime"] = containerStartTime + containerInventoryRecord["FinishedTime"] = containerFinishTime + elsif containerStatus.keys[0] == "waiting" + containerInventoryRecord["State"] = "Waiting" + end + if !containerEnvVariableHash.nil? && !containerEnvVariableHash.empty? + containerInventoryRecord["EnvironmentVar"] = containerEnvVariableHash[containerName] end - begin - if(!podInventory.empty? && podInventory.key?("items") && !podInventory['items'].empty?) 
- #get pod inventory & services - $log.info("in_kube_podinventory::enumerate : Getting services from Kube API @ #{Time.now.utc.iso8601}") - serviceList = JSON.parse(KubernetesApiClient.getKubeResourceInfo('services').body) - $log.info("in_kube_podinventory::enumerate : Done getting services from Kube API @ #{Time.now.utc.iso8601}") - parse_and_emit_records(podInventory, serviceList) - else - $log.warn "Received empty podInventory" - end - rescue => errorStr - $log.warn "Failed in enumerate pod inventory: #{errorStr}" - $log.debug_backtrace(errorStr.backtrace) - ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) - end + return containerInventoryRecord + rescue => errorStr + $log.warn "Failed in populateWindowsContainerInventoryRecord: #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + end + + def getContainerEnvironmentVariables(pod) + begin + podSpec = pod["spec"] + containerEnvHash = {} + if !podSpec.nil? && !podSpec["containers"].nil? + podSpec["containers"].each do |container| + envVarsArray = [] + containerEnvArray = container["env"] + # Parsing the environment variable array of hashes to a string value + # since that is format being sent by container inventory workflow in daemonset + # Keeping it in the same format because the workflow expects it in this format + # and the UX expects an array of string for environment variables + if !containerEnvArray.nil? && !containerEnvArray.empty? 
+ containerEnvArray.each do |envVarHash| + envName = envVarHash["name"] + envValue = envVarHash["value"] + envArrayElement = envName + "=" + envValue + envVarsArray.push(envArrayElement) + end + end + # Skip environment variable processing if it contains the flag AZMON_COLLECT_ENV=FALSE + envValueString = envVarsArray.to_s + if /AZMON_COLLECT_ENV=FALSE/i.match(envValueString) + envValueString = ["AZMON_COLLECT_ENV=FALSE"] + end + containerEnvHash[container["name"]] = envValueString + end + end + return containerEnvHash + rescue => errorStr + $log.warn "Failed in getContainerEnvironmentVariables: #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end end def parse_and_emit_records(podInventory, serviceList) @@ -80,100 +176,116 @@ def parse_and_emit_records(podInventory, serviceList) eventStream = MultiEventStream.new controllerSet = Set.new [] telemetryFlush = false + winContainerCount = 0 begin #begin block start - podInventory['items'].each do |items| #podInventory block start + # Getting windows nodes from kubeapi + winNodes = KubernetesApiClient.getWindowsNodesArray + + podInventory["items"].each do |items| #podInventory block start + sendWindowsContainerInventoryRecord = false + containerInventoryRecords = [] records = [] record = {} - record['CollectionTime'] = batchTime #This is the time that is mapped to become TimeGenerated - record['Name'] = items['metadata']['name'] - podNameSpace = items['metadata']['namespace'] - - if podNameSpace.eql?("kube-system") && !items['metadata'].key?("ownerReferences") + record["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated + record["Name"] = items["metadata"]["name"] + podNameSpace = items["metadata"]["namespace"] + + if podNameSpace.eql?("kube-system") && !items["metadata"].key?("ownerReferences") # The above case seems to be the only case where you have horizontal scaling of pods # but no controller, in which case 
cAdvisor picks up kubernetes.io/config.hash # instead of the actual poduid. Since this uid is not being surface into the UX # its ok to use this. # Use kubernetes.io/config.hash to be able to correlate with cadvisor data - podUid = items['metadata']['annotations']['kubernetes.io/config.hash'] + podUid = items["metadata"]["annotations"]["kubernetes.io/config.hash"] else - podUid = items['metadata']['uid'] + podUid = items["metadata"]["uid"] end - record['PodUid'] = podUid - record['PodLabel'] = [items['metadata']['labels']] - record['Namespace'] = podNameSpace - record['PodCreationTimeStamp'] = items['metadata']['creationTimestamp'] + record["PodUid"] = podUid + record["PodLabel"] = [items["metadata"]["labels"]] + record["Namespace"] = podNameSpace + record["PodCreationTimeStamp"] = items["metadata"]["creationTimestamp"] #for unscheduled (non-started) pods startTime does NOT exist - if !items['status']['startTime'].nil? - record['PodStartTime'] = items['status']['startTime'] + if !items["status"]["startTime"].nil? + record["PodStartTime"] = items["status"]["startTime"] else - record['PodStartTime'] = "" + record["PodStartTime"] = "" end #podStatus # the below is for accounting 'NodeLost' scenario, where-in the pod(s) in the lost node is still being reported as running podReadyCondition = true - if !items['status']['reason'].nil? && items['status']['reason'] == "NodeLost" && !items['status']['conditions'].nil? - items['status']['conditions'].each do |condition| - if condition['type'] == "Ready" && condition['status'] == "False" + if !items["status"]["reason"].nil? && items["status"]["reason"] == "NodeLost" && !items["status"]["conditions"].nil? 
+ items["status"]["conditions"].each do |condition| + if condition["type"] == "Ready" && condition["status"] == "False" podReadyCondition = false break end end end if podReadyCondition == false - record['PodStatus'] = "Unknown" + record["PodStatus"] = "Unknown" else - record['PodStatus'] = items['status']['phase'] + record["PodStatus"] = items["status"]["phase"] end #for unscheduled (non-started) pods podIP does NOT exist - if !items['status']['podIP'].nil? - record['PodIp'] =items['status']['podIP'] + if !items["status"]["podIP"].nil? + record["PodIp"] = items["status"]["podIP"] else - record['PodIp'] = "" + record["PodIp"] = "" end #for unscheduled (non-started) pods nodeName does NOT exist - if !items['spec']['nodeName'].nil? - record['Computer'] = items['spec']['nodeName'] + if !items["spec"]["nodeName"].nil? + record["Computer"] = items["spec"]["nodeName"] else - record['Computer'] = "" - end - record['ClusterId'] = KubernetesApiClient.getClusterId - record['ClusterName'] = KubernetesApiClient.getClusterName - record['ServiceName'] = getServiceNameFromLabels(items['metadata']['namespace'], items['metadata']['labels'], serviceList) - # Adding telemetry to send pod telemetry every 5 minutes - timeDifference = (DateTime.now.to_time.to_i - @@podTelemetryTimeTracker).abs - timeDifferenceInMinutes = timeDifference/60 - if (timeDifferenceInMinutes >= 5) - telemetryFlush = true - end - if !items['metadata']['ownerReferences'].nil? - record['ControllerKind'] = items['metadata']['ownerReferences'][0]['kind'] - record['ControllerName'] = items['metadata']['ownerReferences'][0]['name'] + record["Computer"] = "" + end + + # Setting this flag to true so that we can send ContainerInventory records for containers + # on windows nodes and parse environment variables for these containers + if winNodes.length > 0 + if (!record["Computer"].empty? && (winNodes.include? 
record["Computer"])) + sendWindowsContainerInventoryRecord = true + containerEnvVariableHash = getContainerEnvironmentVariables(items) + end + end + + record["ClusterId"] = KubernetesApiClient.getClusterId + record["ClusterName"] = KubernetesApiClient.getClusterName + record["ServiceName"] = getServiceNameFromLabels(items["metadata"]["namespace"], items["metadata"]["labels"], serviceList) + # Adding telemetry to send pod telemetry every 5 minutes + timeDifference = (DateTime.now.to_time.to_i - @@podTelemetryTimeTracker).abs + timeDifferenceInMinutes = timeDifference / 60 + if (timeDifferenceInMinutes >= 5) + telemetryFlush = true + end + if !items["metadata"]["ownerReferences"].nil? + record["ControllerKind"] = items["metadata"]["ownerReferences"][0]["kind"] + record["ControllerName"] = items["metadata"]["ownerReferences"][0]["name"] if telemetryFlush == true - controllerSet.add(record['ControllerKind'] + record['ControllerName']) + controllerSet.add(record["ControllerKind"] + record["ControllerName"]) end end podRestartCount = 0 - record['PodRestartCount'] = 0 - if items['status'].key?("containerStatuses") && !items['status']['containerStatuses'].empty? #container status block start - items['status']['containerStatuses'].each do |container| - containerRestartCount = 0 - #container Id is of the form - #docker://dfd9da983f1fd27432fb2c1fe3049c0a1d25b1c697b2dc1a530c986e58b16527 - if !container['containerID'].nil? - record['ContainerID'] = container['containerID'].split("//")[1] - else + record["PodRestartCount"] = 0 + if items["status"].key?("containerStatuses") && !items["status"]["containerStatuses"].empty? #container status block start + items["status"]["containerStatuses"].each do |container| + containerRestartCount = 0 + #container Id is of the form + #docker://dfd9da983f1fd27432fb2c1fe3049c0a1d25b1c697b2dc1a530c986e58b16527 + if !container["containerID"].nil? 
+ record["ContainerID"] = container["containerID"].split("//")[1] + else # for containers that have image issues (like invalid image/tag etc..) this will be empty. do not make it all 0 - record['ContainerID'] = "" + record["ContainerID"] = "" end - #keeping this as which is same as InstanceName in perf table - record['ContainerName'] = podUid + "/" +container['name'] - #Pod restart count is a sumtotal of restart counts of individual containers - #within the pod. The restart count of a container is maintained by kubernetes - #itself in the form of a container label. - containerRestartCount = container['restartCount'] - record['ContainerRestartCount'] = containerRestartCount - containerStatus = container['state'] - record['ContainerStatusReason'] = '' + #keeping this as which is same as InstanceName in perf table + record["ContainerName"] = podUid + "/" + container["name"] + #Pod restart count is a sumtotal of restart counts of individual containers + #within the pod. The restart count of a container is maintained by kubernetes + #itself in the form of a container label. 
+ containerRestartCount = container["restartCount"] + record["ContainerRestartCount"] = containerRestartCount + containerStatus = container["state"] + record["ContainerStatusReason"] = "" # state is of the following form , so just picking up the first key name # "state": { # "waiting": { @@ -183,55 +295,80 @@ def parse_and_emit_records(podInventory, serviceList) # }, # the below is for accounting 'NodeLost' scenario, where-in the containers in the lost node/pod(s) is still being reported as running if podReadyCondition == false - record['ContainerStatus'] = "Unknown" + record["ContainerStatus"] = "Unknown" else - record['ContainerStatus'] = containerStatus.keys[0] + record["ContainerStatus"] = containerStatus.keys[0] end #TODO : Remove ContainerCreationTimeStamp from here since we are sending it as a metric #Picking up both container and node start time from cAdvisor to be consistent if containerStatus.keys[0] == "running" - record['ContainerCreationTimeStamp'] = container['state']['running']['startedAt'] + record["ContainerCreationTimeStamp"] = container["state"]["running"]["startedAt"] else - if !containerStatus[containerStatus.keys[0]]['reason'].nil? && !containerStatus[containerStatus.keys[0]]['reason'].empty? - record['ContainerStatusReason'] = containerStatus[containerStatus.keys[0]]['reason'] + if !containerStatus[containerStatus.keys[0]]["reason"].nil? && !containerStatus[containerStatus.keys[0]]["reason"].empty? 
+ record["ContainerStatusReason"] = containerStatus[containerStatus.keys[0]]["reason"] end end - podRestartCount += containerRestartCount - records.push(record.dup) - end + podRestartCount += containerRestartCount + records.push(record.dup) + + #Generate ContainerInventory records for windows nodes so that we can get image and image tag in property panel + if sendWindowsContainerInventoryRecord == true + containerInventoryRecord = populateWindowsContainerInventoryRecord(container, record, containerEnvVariableHash, batchTime) + containerInventoryRecords.push(containerInventoryRecord) + end + end else # for unscheduled pods there are no status.containerStatuses, in this case we still want the pod - records.push(record) + records.push(record) end #container status block end records.each do |record| if !record.nil? - record['PodRestartCount'] = podRestartCount + record["PodRestartCount"] = podRestartCount wrapper = { - "DataType"=>"KUBE_POD_INVENTORY_BLOB", - "IPName"=>"ContainerInsights", - "DataItems"=>[record.each{|k,v| record[k]=v}] + "DataType" => "KUBE_POD_INVENTORY_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [record.each { |k, v| record[k] = v }], } eventStream.add(emitTime, wrapper) if wrapper - end - end + end + end + # Send container inventory records for containers on windows nodes + winContainerCount += containerInventoryRecords.length + containerInventoryRecords.each do |cirecord| + if !cirecord.nil? 
+ ciwrapper = { + "DataType" => "CONTAINER_INVENTORY_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [cirecord.each { |k, v| cirecord[k] = v }], + } + eventStream.add(emitTime, ciwrapper) if ciwrapper + end + end end #podInventory block end + router.emit_stream(@tag, eventStream) if eventStream router.emit_stream(@@MDMKubePodInventoryTag, eventStream) if eventStream if telemetryFlush == true - ApplicationInsightsUtility.sendHeartBeatEvent("KubePodInventory") - ApplicationInsightsUtility.sendMetricTelemetry("PodCount", podInventory['items'].length , {}) - ApplicationInsightsUtility.sendMetricTelemetry("ControllerCount", controllerSet.length , {}) + telemetryProperties = {} + telemetryProperties["Computer"] = @@hostName + ApplicationInsightsUtility.sendCustomEvent("KubePodInventoryHeartBeatEvent", telemetryProperties) + ApplicationInsightsUtility.sendMetricTelemetry("PodCount", podInventory["items"].length, {}) + ApplicationInsightsUtility.sendMetricTelemetry("ControllerCount", controllerSet.length, {}) + if winContainerCount > 0 + telemetryProperties["ClusterWideWindowsContainersCount"] = winContainerCount + ApplicationInsightsUtility.sendCustomEvent("WindowsContainerInventoryEvent", telemetryProperties) + end @@podTelemetryTimeTracker = DateTime.now.to_time.to_i end - @@istestvar = ENV['ISTEST'] - if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp('true') == 0 && eventStream.count > 0) + @@istestvar = ENV["ISTEST"] + if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0 && eventStream.count > 0) $log.info("kubePodInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") end - rescue => errorStr + rescue => errorStr $log.warn "Failed in parse_and_emit_record pod inventory: #{errorStr}" $log.debug_backtrace(errorStr.backtrace) ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) - end #begin block end - end + end #begin block end + end def run_periodic @mutex.lock @@ -257,37 +394,33 @@ def run_periodic def getServiceNameFromLabels(namespace, labels, serviceList) serviceName = "" begin - if !labels.nil? && !labels.empty? - if( !serviceList.nil? && !serviceList.empty? && serviceList.key?("items") && !serviceList['items'].empty?) - serviceList['items'].each do |item| + if !labels.nil? && !labels.empty? + if (!serviceList.nil? && !serviceList.empty? && serviceList.key?("items") && !serviceList["items"].empty?) + serviceList["items"].each do |item| found = 0 - if !item['spec'].nil? && !item['spec']['selector'].nil? && item['metadata']['namespace'] == namespace - selectorLabels = item['spec']['selector'] + if !item["spec"].nil? && !item["spec"]["selector"].nil? && item["metadata"]["namespace"] == namespace + selectorLabels = item["spec"]["selector"] if !selectorLabels.empty? 
- selectorLabels.each do |key,value| - if !(labels.select {|k,v| k==key && v==value}.length > 0) + selectorLabels.each do |key, value| + if !(labels.select { |k, v| k == key && v == value }.length > 0) break end found = found + 1 end - end + end if found == selectorLabels.length - return item['metadata']['name'] + return item["metadata"]["name"] end - end + end end - end + end end - rescue => errorStr + rescue => errorStr $log.warn "Failed to retrieve service name from labels: #{errorStr}" $log.debug_backtrace(errorStr.backtrace) ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) end return serviceName end - end # Kube_Pod_Input - end # module - - diff --git a/source/code/plugin/in_win_cadvisor_perf.rb b/source/code/plugin/in_win_cadvisor_perf.rb new file mode 100644 index 000000000..2e5f839e6 --- /dev/null +++ b/source/code/plugin/in_win_cadvisor_perf.rb @@ -0,0 +1,120 @@ +#!/usr/local/bin/ruby +# frozen_string_literal: true + +module Fluent + class Win_CAdvisor_Perf_Input < Input + Plugin.register_input("wincadvisorperf", self) + + @@winNodes = [] + + def initialize + super + require "yaml" + require "json" + + require_relative "CAdvisorMetricsAPIClient" + require_relative "KubernetesApiClient" + require_relative "oms_common" + require_relative "omslog" + end + + config_param :run_interval, :time, :default => "1m" + config_param :tag, :string, :default => "oms.api.wincadvisorperf" + config_param :mdmtag, :string, :default => "mdm.cadvisorperf" + + def configure(conf) + super + end + + def start + if @run_interval + @finished = false + @condition = ConditionVariable.new + @mutex = Mutex.new + @thread = Thread.new(&method(:run_periodic)) + @@winNodeQueryTimeTracker = DateTime.now.to_time.to_i + @@cleanupRoutineTimeTracker = DateTime.now.to_time.to_i + end + end + + def shutdown + if @run_interval + @mutex.synchronize { + @finished = true + @condition.signal + } + @thread.join + end + end + + def enumerate() + time = Time.now.to_f + begin + eventStream = 
MultiEventStream.new + timeDifference = (DateTime.now.to_time.to_i - @@winNodeQueryTimeTracker).abs + timeDifferenceInMinutes = timeDifference / 60 + + #Resetting this cache so that it is populated with the current set of containers with every call + CAdvisorMetricsAPIClient.resetWinContainerIdCache() + if (timeDifferenceInMinutes >= 5) + $log.info "in_win_cadvisor_perf: Getting windows nodes" + nodes = KubernetesApiClient.getWindowsNodes() + if !nodes.nil? + @@winNodes = KubernetesApiClient.getWindowsNodes() + end + $log.info "in_win_cadvisor_perf : Successuly got windows nodes after 5 minute interval" + @@winNodeQueryTimeTracker = DateTime.now.to_time.to_i + end + @@winNodes.each do |winNode| + metricData = CAdvisorMetricsAPIClient.getMetrics(winNode) + metricData.each do |record| + if !record.empty? + record["DataType"] = "LINUX_PERF_BLOB" + record["IPName"] = "LogManagement" + eventStream.add(time, record) if record + end + end + router.emit_stream(@tag, eventStream) if eventStream + router.emit_stream(@mdmtag, eventStream) if eventStream + + @@istestvar = ENV["ISTEST"] + if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0 && eventStream.count > 0) + $log.info("winCAdvisorPerfEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end + end + + # Cleanup routine to clear deleted containers from cache + cleanupTimeDifference = (DateTime.now.to_time.to_i - @@cleanupRoutineTimeTracker).abs + cleanupTimeDifferenceInMinutes = cleanupTimeDifference / 60 + if (cleanupTimeDifferenceInMinutes >= 5) + $log.info "in_win_cadvisor_perf : Cleanup routine kicking in to clear deleted containers from cache" + CAdvisorMetricsAPIClient.clearDeletedWinContainersFromCache() + @@cleanupRoutineTimeTracker = DateTime.now.to_time.to_i + end + rescue => errorStr + $log.warn "Failed to retrieve cadvisor metric data for windows nodes: #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + end + end + + def run_periodic + @mutex.lock + done = @finished + until done + @condition.wait(@mutex, @run_interval) + done = @finished + @mutex.unlock + if !done + begin + $log.info("in_win_cadvisor_perf::run_periodic @ #{Time.now.utc.iso8601}") + enumerate + rescue => errorStr + $log.warn "in_win_cadvisor_perf::run_periodic: enumerate Failed to retrieve cadvisor perf metrics for windows nodes: #{errorStr}" + end + end + @mutex.lock + end + @mutex.unlock + end + end # Win_CAdvisor_Perf_Input +end # module diff --git a/source/code/plugin/out_mdm.rb b/source/code/plugin/out_mdm.rb index 93b32ef50..963069858 100644 --- a/source/code/plugin/out_mdm.rb +++ b/source/code/plugin/out_mdm.rb @@ -2,29 +2,27 @@ # frozen_string_literal: true module Fluent - class OutputMDM < BufferedOutput - config_param :retry_mdm_post_wait_minutes, :integer - Plugin.register_output('out_mdm', self) + Plugin.register_output("out_mdm", self) def initialize super - require 'net/http' - require 'net/https' - require 'uri' - require 'json' - require_relative 'KubernetesApiClient' - require_relative 'ApplicationInsightsUtility' + require "net/http" + require "net/https" + require "uri" + require "json" + require_relative 
"KubernetesApiClient" + require_relative "ApplicationInsightsUtility" - @@token_resource_url = 'https://monitoring.azure.com/' - @@grant_type = 'client_credentials' - @@azure_json_path = '/etc/kubernetes/host/azure.json' + @@token_resource_url = "https://monitoring.azure.com/" + @@grant_type = "client_credentials" + @@azure_json_path = "/etc/kubernetes/host/azure.json" @@post_request_url_template = "https://%{aks_region}.monitoring.azure.com%{aks_resource_id}/metrics" @@token_url_template = "https://login.microsoftonline.com/%{tenant_id}/oauth2/token" @@plugin_name = "AKSCustomMetricsMDM" - + @data_hash = {} @token_url = nil @http_client = nil @@ -50,12 +48,13 @@ def start @can_send_data_to_mdm = false return end - # Handle the case where the file read fails. Send Telemetry and exit the plugin? + # Handle the case where the file read fails. Send Telemetry and exit the plugin? @data_hash = JSON.parse(file) - @token_url = @@token_url_template % {tenant_id: @data_hash['tenantId']} + @token_url = @@token_url_template % {tenant_id: @data_hash["tenantId"]} @cached_access_token = get_access_token - aks_resource_id = ENV['AKS_RESOURCE_ID'] - aks_region = ENV['AKS_REGION'] + aks_resource_id = ENV["AKS_RESOURCE_ID"] + aks_region = ENV["AKS_REGION"] + if aks_resource_id.to_s.empty? @log.info "Environment Variable AKS_RESOURCE_ID is not set.. " @can_send_data_to_mdm = false @@ -77,7 +76,7 @@ def start # get the access token only if the time to expiry is less than 5 minutes def get_access_token - if @cached_access_token.to_s.empty? || (Time.now + 5*60 > @token_expiry_time) # token is valid for 60 minutes. Refresh token 5 minutes from expiration + if @cached_access_token.to_s.empty? || (Time.now + 5 * 60 > @token_expiry_time) # token is valid for 60 minutes. Refresh token 5 minutes from expiration @log.info "Refreshing access token for out_mdm plugin.." 
token_uri = URI.parse(@token_url) http_access_token = Net::HTTP.new(token_uri.host, token_uri.port) @@ -85,27 +84,27 @@ def get_access_token token_request = Net::HTTP::Post.new(token_uri.request_uri) token_request.set_form_data( { - 'grant_type' => @@grant_type, - 'client_id' => @data_hash['aadClientId'], - 'client_secret' => @data_hash['aadClientSecret'], - 'resource' => @@token_resource_url - } + "grant_type" => @@grant_type, + "client_id" => @data_hash["aadClientId"], + "client_secret" => @data_hash["aadClientSecret"], + "resource" => @@token_resource_url, + } ) - + token_response = http_access_token.request(token_request) - # Handle the case where the response is not 200 + # Handle the case where the response is not 200 parsed_json = JSON.parse(token_response.body) - @token_expiry_time = Time.now + 59*60 # set the expiry time to be ~one hour from current time - @cached_access_token = parsed_json['access_token'] + @token_expiry_time = Time.now + 59 * 60 # set the expiry time to be ~one hour from current time + @cached_access_token = parsed_json["access_token"] end @cached_access_token - end + end def write_status_file(success, message) - fn = '/var/opt/microsoft/omsagent/log/MDMIngestion.status' + fn = "/var/opt/microsoft/omsagent/log/MDMIngestion.status" status = '{ "operation": "MDMIngestion", "success": "%s", "message": "%s" }' % [success, message] begin - File.open(fn,'w') { |file| file.write(status) } + File.open(fn, "w") { |file| file.write(status) } rescue => e @log.debug "Error:'#{e}'" ApplicationInsightsUtility.sendExceptionTelemetry(e.backtrace) @@ -123,13 +122,13 @@ def format(tag, time, record) end end - # This method is called every flush interval. Send the buffer chunk to MDM. + # This method is called every flush interval. Send the buffer chunk to MDM. 
# 'chunk' is a buffer chunk that includes multiple formatted records def write(chunk) begin - if (!@first_post_attempt_made || (Time.now > @last_post_attempt_time + retry_mdm_post_wait_minutes*60)) && @can_send_data_to_mdm + if (!@first_post_attempt_made || (Time.now > @last_post_attempt_time + retry_mdm_post_wait_minutes * 60)) && @can_send_data_to_mdm post_body = [] - chunk.msgpack_each {|(tag, record)| + chunk.msgpack_each { |(tag, record)| post_body.push(record.to_json) } send_to_mdm post_body @@ -137,7 +136,7 @@ def write(chunk) if !@can_send_data_to_mdm @log.info "Cannot send data to MDM since all required conditions were not met" else - @log.info "Last Failed POST attempt to MDM was made #{((Time.now - @last_post_attempt_time)/60).round(1)} min ago. This is less than the current retry threshold of #{@retry_mdm_post_wait_minutes} min. NO-OP" + @log.info "Last Failed POST attempt to MDM was made #{((Time.now - @last_post_attempt_time) / 60).round(1)} min ago. This is less than the current retry threshold of #{@retry_mdm_post_wait_minutes} min. NO-OP" end end rescue Exception => e @@ -146,12 +145,12 @@ def write(chunk) end end - def send_to_mdm(post_body) + def send_to_mdm(post_body) begin access_token = get_access_token request = Net::HTTP::Post.new(@post_request_uri.request_uri) - request['Content-Type'] = "application/x-ndjson" - request['Authorization'] = "Bearer #{access_token}" + request["Content-Type"] = "application/x-ndjson" + request["Authorization"] = "Bearer #{access_token}" request.body = post_body.join("\n") response = @http_client.request(request) response.value # this throws for non 200 HTTP response code @@ -166,10 +165,10 @@ def send_to_mdm(post_body) @first_post_attempt_made = true ApplicationInsightsUtility.sendExceptionTelemetry(e.backtrace) # Not raising exception, as that will cause retries to happen - elsif !response.code.empty? && response.code.start_with?('4') + elsif !response.code.empty? 
&& response.code.start_with?("4") # Log 400 errors and continue @log.info "Non-retryable HTTPServerException when POSTing Metrics to MDM #{e} Response: #{response}" - else + else # raise if the response code is non-400 @log.info "HTTPServerException when POSTing Metrics to MDM #{e} Response: #{response}" raise e @@ -186,7 +185,8 @@ def send_to_mdm(post_body) raise e end end - private + + private class ChunkErrorHandler include Configurable @@ -218,20 +218,20 @@ def router=(r) end def write(chunk) - chunk.msgpack_each {|(tag, record)| + chunk.msgpack_each { |(tag, record)| @error_handlers[tag].emit(record) } end - - private + + private def create_error_handlers(router) nop_handler = NopErrorHandler.new Hash.new() { |hash, tag| etag = OMS::Common.create_error_tag tag hash[tag] = router.match?(etag) ? - ErrorHandler.new(router, etag) : - nop_handler + ErrorHandler.new(router, etag) : + nop_handler } end @@ -251,10 +251,6 @@ def emit(record) # NOP end end - end - end # class OutputMDM - end # module Fluent - From ebdd8cc119a77752fd543225878f36e055812d14 Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Mon, 8 Apr 2019 11:55:52 -0700 Subject: [PATCH 084/160] adding os to container inventory for windows nodes (#210) --- source/code/plugin/CAdvisorMetricsAPIClient.rb | 1 + 1 file changed, 1 insertion(+) diff --git a/source/code/plugin/CAdvisorMetricsAPIClient.rb b/source/code/plugin/CAdvisorMetricsAPIClient.rb index 8b4fd9fcf..35cf727cf 100644 --- a/source/code/plugin/CAdvisorMetricsAPIClient.rb +++ b/source/code/plugin/CAdvisorMetricsAPIClient.rb @@ -318,6 +318,7 @@ def getContainerCpuMetricItemRate(metricJSON, hostName, cpuMetricNameToCollect, telemetryProperties = {} telemetryProperties["Computer"] = hostName telemetryProperties["ContainerCount"] = containerCount + telemetryProperties["OS"] = "Windows" # Hardcoding the event to ContainerInventory hearbeat event since the telemetry is pivoted off of this event. 
@Log.info "sending container inventory heartbeat telemetry" ApplicationInsightsUtility.sendCustomEvent("ContainerInventoryHeartBeatEvent", telemetryProperties) From d7b8cff1d9b20f3894fdd91c0e1cd3b69a465ed9 Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Mon, 8 Apr 2019 15:40:31 -0700 Subject: [PATCH 085/160] Fix omsagent crash Error when kube-api returns non-200, send events for HTTP Errors (#211) * Fix omsagent crash Error when kube-api returns non-200, send events for HTTP Errors * Fixing the bug, deferring telemetry changes for later --- source/code/plugin/filter_cadvisor2mdm.rb | 102 +++++++++++----------- source/code/plugin/out_mdm.rb | 2 +- 2 files changed, 54 insertions(+), 50 deletions(-) diff --git a/source/code/plugin/filter_cadvisor2mdm.rb b/source/code/plugin/filter_cadvisor2mdm.rb index 94f2107cc..a6e643e45 100644 --- a/source/code/plugin/filter_cadvisor2mdm.rb +++ b/source/code/plugin/filter_cadvisor2mdm.rb @@ -10,45 +10,45 @@ module Fluent class CAdvisor2MdmFilter < Filter Fluent::Plugin.register_filter('filter_cadvisor2mdm', self) - + config_param :enable_log, :integer, :default => 0 config_param :log_path, :string, :default => '/var/opt/microsoft/docker-cimprov/log/filter_cadvisor2mdm.log' config_param :custom_metrics_azure_regions, :string config_param :metrics_to_collect, :string, :default => 'cpuUsageNanoCores,memoryWorkingSetBytes,memoryRssBytes' - + @@cpu_usage_milli_cores = 'cpuUsageMillicores' @@cpu_usage_nano_cores = 'cpuusagenanocores' @@object_name_k8s_node = 'K8SNode' @@hostName = (OMS::Common.get_hostname) @@custom_metrics_template = ' - { - "time": "%{timestamp}", - "data": { - "baseData": { - "metric": "%{metricName}", - "namespace": "Insights.Container/nodes", - "dimNames": [ + { + "time": "%{timestamp}", + "data": { + "baseData": { + "metric": "%{metricName}", + "namespace": "Insights.Container/nodes", + "dimNames": [ "host" - ], - "series": [ - { - "dimValues": [ + ], + "series": [ + { + "dimValues": [ "%{hostvalue}" - ], + 
], "min": %{metricminvalue}, - "max": %{metricmaxvalue}, - "sum": %{metricsumvalue}, - "count": 1 - } - ] - } - } + "max": %{metricmaxvalue}, + "sum": %{metricsumvalue}, + "count": 1 + } + ] + } + } }' - + @@metric_name_metric_percentage_name_hash = { - @@cpu_usage_milli_cores => "cpuUsagePercentage", + @@cpu_usage_milli_cores => "cpuUsagePercentage", "memoryRssBytes" => "memoryRssPercentage", - "memoryWorkingSetBytes" => "memoryWorkingSetPercentage" + "memoryWorkingSetBytes" => "memoryWorkingSetPercentage" } @process_incoming_stream = true @@ -61,7 +61,7 @@ def initialize def configure(conf) super @log = nil - + if @enable_log @log = Logger.new(@log_path, 1, 5000000) @log.debug {'Starting filter_cadvisor2mdm plugin'} @@ -70,15 +70,19 @@ def configure(conf) def start super - @process_incoming_stream = CustomMetricsUtils.check_custom_metrics_availability(@custom_metrics_azure_regions) - @metrics_to_collect_hash = build_metrics_hash - @log.debug "After check_custom_metrics_availability process_incoming_stream #{@process_incoming_stream}" - - # initialize cpu and memory limit - if @process_incoming_stream - @cpu_capacity = 0.0 - @memory_capacity = 0.0 - ensure_cpu_memory_capacity_set + begin + @process_incoming_stream = CustomMetricsUtils.check_custom_metrics_availability(@custom_metrics_azure_regions) + @metrics_to_collect_hash = build_metrics_hash + @log.debug "After check_custom_metrics_availability process_incoming_stream #{@process_incoming_stream}" + + # initialize cpu and memory limit + if @process_incoming_stream + @cpu_capacity = 0.0 + @memory_capacity = 0.0 + ensure_cpu_memory_capacity_set + end + rescue => e + @log.info "Error initializing plugin #{e}" end end @@ -117,9 +121,9 @@ def filter(tag, time, record) if @memory_capacity != 0.0 percentage_metric_value = metric_value*100/@memory_capacity end - end + end return get_metric_records(record, metric_name, metric_value, percentage_metric_value) - else + else return [] end else @@ -140,13 +144,13 @@ def 
ensure_cpu_memory_capacity_set return end - begin + begin nodeInventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo("nodes?fieldSelector=metadata.name%3D#{@@hostName}").body) rescue Exception => e @log.info "Error when getting nodeInventory from kube API. Exception: #{e.class} Message: #{e.message} " ApplicationInsightsUtility.sendExceptionTelemetry(e.backtrace) end - if !nodeInventory.nil? + if !nodeInventory.nil? cpu_capacity_json = KubernetesApiClient.parseNodeLimits(nodeInventory, "capacity", "cpu", "cpuCapacityNanoCores") if !cpu_capacity_json.nil? && !cpu_capacity_json[0]['DataItems'][0]['Collections'][0]['Value'].to_s.nil? @cpu_capacity = cpu_capacity_json[0]['DataItems'][0]['Collections'][0]['Value'] @@ -163,7 +167,7 @@ def ensure_cpu_memory_capacity_set end end end - + def get_metric_records(record, metric_name, metric_value, percentage_metric_value) records = [] custommetricrecord = @@custom_metrics_template % { @@ -194,20 +198,20 @@ def get_metric_records(record, metric_name, metric_value, percentage_metric_valu return records end - + def filter_stream(tag, es) new_es = MultiEventStream.new - ensure_cpu_memory_capacity_set - es.each { |time, record| - begin + begin + ensure_cpu_memory_capacity_set + es.each { |time, record| filtered_records = filter(tag, time, record) - filtered_records.each {|filtered_record| + filtered_records.each {|filtered_record| new_es.add(time, filtered_record) if filtered_record - } if filtered_records - rescue => e - router.emit_error_event(tag, time, record, e) - end - } + } if filtered_records + } + rescue => e + @log.info "Error in filter_stream #{e.message}" + end new_es end end diff --git a/source/code/plugin/out_mdm.rb b/source/code/plugin/out_mdm.rb index 963069858..351198afe 100644 --- a/source/code/plugin/out_mdm.rb +++ b/source/code/plugin/out_mdm.rb @@ -140,6 +140,7 @@ def write(chunk) end end rescue Exception => e + ApplicationInsightsUtility.sendExceptionTelemetry(e.backtrace) @log.info "Exception when 
writing to MDM: #{e}" raise e end @@ -163,7 +164,6 @@ def send_to_mdm(post_body) @log.info "Response Code #{response.code} Updating @last_post_attempt_time" @last_post_attempt_time = Time.now @first_post_attempt_made = true - ApplicationInsightsUtility.sendExceptionTelemetry(e.backtrace) # Not raising exception, as that will cause retries to happen elsif !response.code.empty? && response.code.start_with?("4") # Log 400 errors and continue From c9bb623c2c0aa6642e0baab3b0ebcf313c4627eb Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Wed, 10 Apr 2019 16:28:47 -0700 Subject: [PATCH 086/160] updating to lowercase compare for units (#212) --- source/code/plugin/KubernetesApiClient.rb | 66 +++++++++++------------ 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/source/code/plugin/KubernetesApiClient.rb b/source/code/plugin/KubernetesApiClient.rb index 4ed85025f..3c6b4f203 100644 --- a/source/code/plugin/KubernetesApiClient.rb +++ b/source/code/plugin/KubernetesApiClient.rb @@ -439,58 +439,58 @@ def parseNodeLimits(metricJSON, metricCategory, metricNameToCollect, metricNamet end #parseNodeLimits def getMetricNumericValue(metricName, metricVal) - metricValue = metricVal + metricValue = metricVal.downcase begin case metricName when "memory" #convert to bytes for memory #https://kubernetes.io/docs/tasks/configure-pod-container/assign-memory-resource/ - if (metricValue.end_with?("Ki")) - metricValue.chomp!("Ki") + if (metricValue.end_with?("ki")) + metricValue.chomp!("ki") metricValue = Float(metricValue) * 1024.0 ** 1 - elsif (metricValue.end_with?("Mi")) - metricValue.chomp!("Mi") + elsif (metricValue.end_with?("mi")) + metricValue.chomp!("mi") metricValue = Float(metricValue) * 1024.0 ** 2 - elsif (metricValue.end_with?("Gi")) - metricValue.chomp!("Gi") + elsif (metricValue.end_with?("gi")) + metricValue.chomp!("gi") metricValue = Float(metricValue) * 1024.0 ** 3 - elsif (metricValue.end_with?("Ti")) - metricValue.chomp!("Ti") + elsif 
(metricValue.end_with?("ti")) + metricValue.chomp!("ti") metricValue = Float(metricValue) * 1024.0 ** 4 - elsif (metricValue.end_with?("Pi")) - metricValue.chomp!("Pi") + elsif (metricValue.end_with?("pi")) + metricValue.chomp!("pi") metricValue = Float(metricValue) * 1024.0 ** 5 - elsif (metricValue.end_with?("Ei")) - metricValue.chomp!("Ei") + elsif (metricValue.end_with?("ei")) + metricValue.chomp!("ei") metricValue = Float(metricValue) * 1024.0 ** 6 - elsif (metricValue.end_with?("Zi")) - metricValue.chomp!("Zi") + elsif (metricValue.end_with?("zi")) + metricValue.chomp!("zi") metricValue = Float(metricValue) * 1024.0 ** 7 - elsif (metricValue.end_with?("Yi")) - metricValue.chomp!("Yi") + elsif (metricValue.end_with?("yi")) + metricValue.chomp!("yi") metricValue = Float(metricValue) * 1024.0 ** 8 - elsif (metricValue.end_with?("K")) - metricValue.chomp!("K") + elsif (metricValue.end_with?("k")) + metricValue.chomp!("k") metricValue = Float(metricValue) * 1000.0 ** 1 - elsif (metricValue.end_with?("M")) - metricValue.chomp!("M") + elsif (metricValue.end_with?("m")) + metricValue.chomp!("m") metricValue = Float(metricValue) * 1000.0 ** 2 - elsif (metricValue.end_with?("G")) - metricValue.chomp!("G") + elsif (metricValue.end_with?("g")) + metricValue.chomp!("g") metricValue = Float(metricValue) * 1000.0 ** 3 - elsif (metricValue.end_with?("T")) - metricValue.chomp!("T") + elsif (metricValue.end_with?("t")) + metricValue.chomp!("t") metricValue = Float(metricValue) * 1000.0 ** 4 - elsif (metricValue.end_with?("P")) - metricValue.chomp!("P") + elsif (metricValue.end_with?("p")) + metricValue.chomp!("p") metricValue = Float(metricValue) * 1000.0 ** 5 - elsif (metricValue.end_with?("E")) - metricValue.chomp!("E") + elsif (metricValue.end_with?("e")) + metricValue.chomp!("e") metricValue = Float(metricValue) * 1000.0 ** 6 - elsif (metricValue.end_with?("Z")) - metricValue.chomp!("Z") + elsif (metricValue.end_with?("z")) + metricValue.chomp!("z") metricValue = 
Float(metricValue) * 1000.0 ** 7 - elsif (metricValue.end_with?("Y")) - metricValue.chomp!("Y") + elsif (metricValue.end_with?("y")) + metricValue.chomp!("y") metricValue = Float(metricValue) * 1000.0 ** 8 else #assuming there are no units specified, it is bytes (the below conversion will fail for other unsupported 'units') metricValue = Float(metricValue) From 3a88db8e5b1005564e54625959972e176835f9d4 Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Tue, 16 Apr 2019 13:00:00 -0700 Subject: [PATCH 087/160] Merge from vishwa/telegraftcp to ci_feature for telegraf changes (#214) * merge from Vishwa/telegraf to Vishwa/telegraftcp for telegraf changes (#207) * add configuration for telegraf * fix for perms * fix telegraf config. * fix file location & config * update to config * fix namespace * trying different namespace and also debug=true * add placeholder for nodename * change namespace * updated config * fix uri * fix azMon settings * remove aad settings * add custom metrics regions * fix config * add support for replica-set config * fix oomkilled * Add telegraf 403 metric telemetry & non 403 trace telemetry * fix type * fix package * fix package import * fix filename * delete unused file * conf file for rs; fix 403counttotal metric for telegraf, remove host and use nodeName consistently, rename metrics * fix statefulsets * fix typo. * fix another typo. * fix telemetry * fix casing issue * fix comma issue. 
* disable telemetry for rs ; fix stateful set name * worksround for namespace fix * telegraf integration - v1 * telemetry changes for telegraf * telemetry & other changes * remove custom metric regions as we dont need anymore * remove un-needed files * fixes * exclude certain volumes and fix telemetry to not have computer & nodename as dimensions (redundant) * Vishwa/resourcecentric (#208) (#209) * resourceid fix (for AKS only) * fix name * near final metric shape * change from customlog to fixed type (InsightsMetrics) * fix PR feedback * fix pr feedback --- installer/conf/td-agent-bit.conf | 27 +- installer/conf/telegraf.conf | 519 ++++++++++++++++++ installer/datafiles/base_container.data | 3 + .../scripts/TelegrafTCPErrorTelemetry.sh | 3 + source/code/go/src/plugins/oms.go | 241 +++++++- source/code/go/src/plugins/out_oms.go | 22 +- source/code/go/src/plugins/telemetry.go | 26 +- 7 files changed, 821 insertions(+), 20 deletions(-) create mode 100644 installer/conf/telegraf.conf create mode 100644 installer/scripts/TelegrafTCPErrorTelemetry.sh diff --git a/installer/conf/td-agent-bit.conf b/installer/conf/td-agent-bit.conf index 78a7b2dde..88bacaca2 100644 --- a/installer/conf/td-agent-bit.conf +++ b/installer/conf/td-agent-bit.conf @@ -23,10 +23,33 @@ Mem_Buf_Limit 2m Path_Key filepath Skip_Long_Lines On + Ignore_Older 5m + +[INPUT] + Name tail + Tag oms.container.log.telegraf.err.* + Path /var/opt/microsoft/docker-cimprov/log/telegraf.log + DB /var/opt/microsoft/docker-cimprov/state/telegraf-log-state.db + Mem_Buf_Limit 2m + Path_Key filepath + Skip_Long_Lines On + Ignore_Older 5m + +[INPUT] + Name tcp + Tag oms.container.perf.telegraf.* + Listen 0.0.0.0 + Port 25226 + Chunk_Size 32 + Buffer_Size 64 + +[FILTER] + Name grep + Match oms.container.log.telegraf.err.* + #Regex log /^(?:(?!\[azure_monitor\]: failed to write batch: \[403\] 403 Forbidden).)*$/ [OUTPUT] Name oms EnableTelemetry true TelemetryPushIntervalSeconds 300 - Match oms.container.log.* - 
AgentVersion ciprod03122019 \ No newline at end of file + Match oms.container.* diff --git a/installer/conf/telegraf.conf b/installer/conf/telegraf.conf new file mode 100644 index 000000000..355c88b3d --- /dev/null +++ b/installer/conf/telegraf.conf @@ -0,0 +1,519 @@ +# Telegraf Configuration +# +# Telegraf is entirely plugin driven. All metrics are gathered from the +# declared inputs, and sent to the declared outputs. +# +# Plugins must be declared in here to be active. +# To deactivate a plugin, comment out the name and any variables. +# +# Use 'telegraf -config telegraf.conf -test' to see what metrics a config +# file would generate. +# +# Environment variables can be used anywhere in this config file, simply prepend +# them with $. For strings the variable must be within quotes (ie, "$STR_VAR"), +# for numbers and booleans they should be plain (ie, $INT_VAR, $BOOL_VAR) + + +# Global tags can be specified here in key="value" format. +[global_tags] + #Below are entirely used for telemetry + AgentVersion = "$AGENT_VERSION" + AKS_RESOURCE_ID = "$TELEMETRY_AKS_RESOURCE_ID" + ACS_RESOURCE_NAME = "$TELEMETRY_ACS_RESOURCE_NAME" + Region = "$TELEMETRY_AKS_REGION" + ClusterName = "$TELEMETRY_CLUSTER_NAME" + ClusterType = "$TELEMETRY_CLUSTER_TYPE" + Computer = "placeholder_hostname" + ControllerType = "$CONTROLLER_TYPE" + + hostName = "placeholder_hostname" + + +# Configuration for telegraf agent +[agent] + ## Default data collection interval for all inputs + interval = "60s" + ## Rounds collection interval to 'interval' + ## ie, if interval="10s" then always collect on :00, :10, :20, etc. + round_interval = true + + ## Telegraf will send metrics to outputs in batches of at most + ## metric_batch_size metrics. + ## This controls the size of writes that Telegraf sends to output plugins. + metric_batch_size = 1000 + + ## For failed writes, telegraf will cache metric_buffer_limit metrics for each + ## output, and will flush this buffer on a successful write. 
Oldest metrics + ## are dropped first when this buffer fills. + ## This buffer only fills when writes fail to output plugin(s). + metric_buffer_limit = 10000 + + ## Collection jitter is used to jitter the collection by a random amount. + ## Each plugin will sleep for a random time within jitter before collecting. + ## This can be used to avoid many plugins querying things like sysfs at the + ## same time, which can have a measurable effect on the system. + collection_jitter = "0s" + + ## Default flushing interval for all outputs. You shouldn't set this below + ## interval. Maximum flush_interval will be flush_interval + flush_jitter + flush_interval = "60s" + ## Jitter the flush interval by a random amount. This is primarily to avoid + ## large write spikes for users running a large number of telegraf instances. + ## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s + flush_jitter = "0s" + + ## By default or when set to "0s", precision will be set to the same + ## timestamp order as the collection interval, with the maximum being 1s. + ## ie, when interval = "10s", precision will be "1s" + ## when interval = "250ms", precision will be "1ms" + ## Precision will NOT be used for service inputs. It is up to each individual + ## service input to set the timestamp at the appropriate precision. + ## Valid time units are "ns", "us" (or "µs"), "ms", "s". + precision = "" + + ## Logging configuration: + ## Run telegraf with debug log messages. + debug = false + ## Run telegraf in quiet mode (error log messages only). + quiet = true + ## Specify the log file name. The empty string means to log to stderr. + logfile = "/var/opt/microsoft/docker-cimprov/log/telegraf.log" + + ## Override default hostname, if empty use os.Hostname() + #hostname = "placeholder_hostname" + ## If set to true, do no set the "host" tag in the telegraf agent. 
+ omit_hostname = true + + +############################################################################### +# OUTPUT PLUGINS # +############################################################################### + +# Generic socket writer capable of handling multiple socket types. +[[outputs.socket_writer]] + ## URL to connect to + address = "tcp://0.0.0.0:25226" + # address = "tcp://example.com:http" + # address = "tcp4://127.0.0.1:8094" + # address = "tcp6://127.0.0.1:8094" + # address = "tcp6://[2001:db8::1]:8094" + # address = "udp://127.0.0.1:8094" + # address = "udp4://127.0.0.1:8094" + # address = "udp6://127.0.0.1:8094" + # address = "unix:///tmp/telegraf.sock" + # address = "unixgram:///tmp/telegraf.sock" + + ## Optional TLS Config + # tls_ca = "/etc/telegraf/ca.pem" + # tls_cert = "/etc/telegraf/cert.pem" + # tls_key = "/etc/telegraf/key.pem" + ## Use TLS but skip chain & host verification + # insecure_skip_verify = false + + ## Period between keep alive probes. + ## Only applies to TCP sockets. + ## 0 disables keep alive probes. + ## Defaults to the OS configuration. + # keep_alive_period = "5m" + + ## Data format to generate. + ## Each data format has its own unique set of configuration options, read + ## more about them here: + ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md + data_format = "json" + namedrop = ["telegraf_telemetry"] + tagdrop = ["AgentVersion","AKS_RESOURCE_ID", "ACS_RESOURCE_NAME", "Region","ClusterName","ClusterType", "Computer", "ControllerType"] + +[[outputs.application_insights]] + ## Instrumentation key of the Application Insights resource. + instrumentation_key = "$TELEMETRY_APPLICATIONINSIGHTS_KEY" + + ## Timeout for closing (default: 5s). + # timeout = "5s" + + ## Enable additional diagnostic logging. + # enable_diagnostic_logging = false + + ## Context Tag Sources add Application Insights context tags to a tag value. 
+ ## + ## For list of allowed context tag keys see: + ## https://github.com/Microsoft/ApplicationInsights-Go/blob/master/appinsights/contracts/contexttagkeys.go + # [outputs.application_insights.context_tag_sources] + # "ai.cloud.role" = "kubernetes_container_name" + # "ai.cloud.roleInstance" = "kubernetes_pod_name" + namepass = ["telegraf_telemetry"] + #tagdrop = ["nodeName"] + +############################################################################### +# PROCESSOR PLUGINS # +############################################################################### + +# # Perform string processing on tags, fields, and measurements +#[[processors.rename]] + #[[processors.rename.replace]] + # measurement = "disk" + # dest = "nodes" +# [[processors.rename.replace]] +# field = "free" +# dest = "freeBytes" +# [[processors.rename.replace]] +# field = "used" +# dest = "usedBytes" +# [[processors.rename.replace]] +# field = "used_percent" +# dest = "usedPercentage" + #[[processors.rename.replace]] + # measurement = "net" + # dest = "nodes" + #[[processors.rename.replace]] + # field = "bytes_recv" + # dest = "networkBytesReceivedTotal" + #[[processors.rename.replace]] + # field = "bytes_sent" + # dest = "networkBytesSentTotal" + #[[processors.rename.replace]] + # field = "err_in" + # dest = "networkErrorsInTotal" + #[[processors.rename.replace]] + # field = "err_out" + # dest = "networkErrorsOutTotal" + #[[processors.rename.replace]] + # measurement = "kubernetes_pod_volume" + # dest = "pods" + #[[processors.rename.replace]] + # field = "used_bytes" + # dest = "podVolumeUsedBytes" + #[[processors.rename.replace]] + # field = "available_bytes" + # dest = "podVolumeAvailableBytes" + #[[processors.rename.replace]] + # measurement = "kubernetes_pod_network" + # dest = "pods" + #[[processors.rename.replace]] + # field = "tx_errors" + # dest = "podNetworkTxErrorsTotal" + #[[processors.rename.replace]] + # field = "rx_errors" + # dest = "podNetworkRxErrorsTotal" + 
#[[processors.rename.replace]] + # tag = "volume_name" + # dest = "volumeName" + #[[processors.rename.replace]] + # tag = "pod_name" + # dest = "podName" + #[[processors.rename.replace]] + # measurement = "docker" + # dest = "containers" + #[[processors.rename.replace]] + # measurement = "docker_container_status" + # dest = "containers" + #[[processors.rename.replace]] + # field = "n_containers" + # dest = "numContainers" + #[[processors.rename.replace]] + # field = "n_containers_running" + # dest = "numContainersRunning" + #[[processors.rename.replace]] + # field = "n_containers_stopped" + # dest = "numContainersStopped" + #[[processors.rename.replace]] + # field = "n_containers_paused" + # dest = "numContainersPaused" + #[[processors.rename.replace]] + # field = "n_images" + # dest = "numContainerImages" + +# ## Convert a tag value to uppercase +# # [[processors.strings.uppercase]] +# # tag = "method" +# +# ## Convert a field value to lowercase and store in a new field +# # [[processors.strings.lowercase]] +# # field = "uri_stem" +# # dest = "uri_stem_normalised" +# +# ## Trim leading and trailing whitespace using the default cutset +# # [[processors.strings.trim]] +# # field = "message" +# +# ## Trim leading characters in cutset +# # [[processors.strings.trim_left]] +# # field = "message" +# # cutset = "\t" +# +# ## Trim trailing characters in cutset +# # [[processors.strings.trim_right]] +# # field = "message" +# # cutset = "\r\n" +# +# ## Trim the given prefix from the field +# # [[processors.strings.trim_prefix]] +# # field = "my_value" +# # prefix = "my_" +# +# ## Trim the given suffix from the field +# # [[processors.strings.trim_suffix]] +# # field = "read_count" +# # suffix = "_count" + + +# # Print all metrics that pass through this filter. +# [[processors.topk]] +# ## How many seconds between aggregations +# # period = 10 +# +# ## How many top metrics to return +# # k = 10 +# +# ## Over which tags should the aggregation be done. 
Globs can be specified, in +# ## which case any tag matching the glob will aggregated over. If set to an +# ## empty list is no aggregation over tags is done +# # group_by = ['*'] +# +# ## Over which fields are the top k are calculated +# # fields = ["value"] +# +# ## What aggregation to use. Options: sum, mean, min, max +# # aggregation = "mean" +# +# ## Instead of the top k largest metrics, return the bottom k lowest metrics +# # bottomk = false +# +# ## The plugin assigns each metric a GroupBy tag generated from its name and +# ## tags. If this setting is different than "" the plugin will add a +# ## tag (which name will be the value of this setting) to each metric with +# ## the value of the calculated GroupBy tag. Useful for debugging +# # add_groupby_tag = "" +# +# ## These settings provide a way to know the position of each metric in +# ## the top k. The 'add_rank_field' setting allows to specify for which +# ## fields the position is required. If the list is non empty, then a field +# ## will be added to each and every metric for each string present in this +# ## setting. This field will contain the ranking of the group that +# ## the metric belonged to when aggregated over that field. +# ## The name of the field will be set to the name of the aggregation field, +# ## suffixed with the string '_topk_rank' +# # add_rank_fields = [] +# +# ## These settings provide a way to know what values the plugin is generating +# ## when aggregating metrics. The 'add_agregate_field' setting allows to +# ## specify for which fields the final aggregation value is required. If the +# ## list is non empty, then a field will be added to each every metric for +# ## each field present in this setting. This field will contain +# ## the computed aggregation for the group that the metric belonged to when +# ## aggregated over that field. 
+# ## The name of the field will be set to the name of the aggregation field, +# ## suffixed with the string '_topk_aggregate' +# # add_aggregate_fields = [] + + + +############################################################################### +# AGGREGATOR PLUGINS # +############################################################################### + +# # Keep the aggregate basicstats of each metric passing through. +# [[aggregators.basicstats]] +# ## General Aggregator Arguments: +# ## The period on which to flush & clear the aggregator. +# period = "30s" +# ## If true, the original metric will be dropped by the +# ## aggregator and will not get sent to the output plugins. +# drop_original = false + + +# # Create aggregate histograms. +# [[aggregators.histogram]] +# ## The period in which to flush the aggregator. +# period = "30s" +# +# ## If true, the original metric will be dropped by the +# ## aggregator and will not get sent to the output plugins. +# drop_original = false +# +# ## Example config that aggregates all fields of the metric. +# # [[aggregators.histogram.config]] +# # ## The set of buckets. +# # buckets = [0.0, 15.6, 34.5, 49.1, 71.5, 80.5, 94.5, 100.0] +# # ## The name of metric. +# # measurement_name = "cpu" +# +# ## Example config that aggregates only specific fields of the metric. +# # [[aggregators.histogram.config]] +# # ## The set of buckets. +# # buckets = [0.0, 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0] +# # ## The name of metric. +# # measurement_name = "diskio" +# # ## The concrete fields of metric +# # fields = ["io_time", "read_time", "write_time"] + + +# # Keep the aggregate min/max of each metric passing through. +# [[aggregators.minmax]] +# ## General Aggregator Arguments: +# ## The period on which to flush & clear the aggregator. +# period = "30s" +# ## If true, the original metric will be dropped by the +# ## aggregator and will not get sent to the output plugins. 
+# drop_original = false + + +# # Count the occurance of values in fields. +# [[aggregators.valuecounter]] +# ## General Aggregator Arguments: +# ## The period on which to flush & clear the aggregator. +# period = "30s" +# ## If true, the original metric will be dropped by the +# ## aggregator and will not get sent to the output plugins. +# drop_original = false +# ## The fields for which the values will be counted +# fields = [] + + + +############################################################################### +# INPUT PLUGINS # +############################################################################### + +# Read metrics about cpu usage +#[[inputs.cpu]] + ## Whether to report per-cpu stats or not +# percpu = false + ## Whether to report total system cpu stats or not +# totalcpu = true + ## If true, collect raw CPU time metrics. +# collect_cpu_time = false + ## If true, compute and report the sum of all non-idle CPU states. +# report_active = true +# fieldpass = ["usage_active","cluster","node","host","device"] +# taginclude = ["cluster","cpu","node"] + + + +# Read metrics about disk usage by mount point +[[inputs.disk]] + ## By default stats will be gathered for all mount points. + ## Set mount_points will restrict the stats to only the specified mount points. + # mount_points = ["/"] + + ## Ignore mount points by filesystem type. + ignore_fs = ["tmpfs", "devtmpfs", "devfs", "overlay", "aufs", "squashfs"] + fieldpass = ["free", "used", "used_percent"] + taginclude = ["device","path","hostName"] + # Below due to Bug - https://github.com/influxdata/telegraf/issues/5615 + # ORDER matters here!! 
- i.e the below should be the LAST modifier + [inputs.disk.tagdrop] + path = ["/var/lib/kubelet*", "/dev/termination-log", "/var/log", "/etc/hosts", "/etc/resolv.conf", "/etc/hostname", "/etc/kubernetes/host", "/var/lib/docker/containers"] + + +# Read metrics about memory usage +#[[inputs.mem]] +# fieldpass = ["used_percent", "cluster", "node","host","device"] +# taginclude = ["cluster","node"] + + +# Read metrics about network interface usage +#[[inputs.net]] + ## By default, telegraf gathers stats from any up interface (excluding loopback) + ## Setting interfaces will tell it to gather these explicit interfaces, + ## regardless of status. + ## + # interfaces = ["eth0"] + ## + ## On linux systems telegraf also collects protocol stats. + ## Setting ignore_protocol_stats to true will skip reporting of protocol metrics. + ## +# ignore_protocol_stats = true + ## + #fieldpass = ["bytes_recv", "bytes_sent", "err_in", "err_out"] + #fieldpass = ["err_in", "err_out"] + #taginclude = ["interface","nodeName"] + +# Read metrics from the kubernetes kubelet api +#[[inputs.kubernetes]] + ## URL for the kubelet + #url = "http://1.1.1.1:10255" +# url = "http://placeholder_nodeip:10255" + + ## Use bearer token for authorization + # bearer_token = /path/to/bearer/token + + ## Set response_timeout (default 5 seconds) + # response_timeout = "5s" + + ## Optional TLS Config + # tls_ca = /path/to/cafile + # tls_cert = /path/to/certfile + # tls_key = /path/to/keyfile + ## Use TLS but skip chain & host verification + # insecure_skip_verify = false +# fieldpass = ["used_bytes", "available_bytes", "tx_errors", "rx_errors" ] +# taginclude = ["volume_name","nodeName","namespace","pod_name"] +# Read metrics about docker containers +#[[inputs.docker]] + ## Docker Endpoint + ## To use TCP, set endpoint = "tcp://[ip]:[port]" + ## To use environment variables (ie, docker-machine), set endpoint = "ENV" +# endpoint = "unix:///var/run/host/docker.sock" + + ## Set to true to collect Swarm 
metrics(desired_replicas, running_replicas) +# gather_services = false + + ## Only collect metrics for these containers, collect all if empty +# container_names = [] + + ## Containers to include and exclude. Globs accepted. + ## Note that an empty array for both will include all containers +# container_name_include = [] +# container_name_exclude = [] + + ## Container states to include and exclude. Globs accepted. + ## When empty only containers in the "running" state will be captured. +# container_state_include = ['*'] + # container_state_exclude = [] + + ## Timeout for docker list, info, and stats commands +# timeout = "5s" + + ## Whether to report for each container per-device blkio (8:0, 8:1...) and + ## network (eth0, eth1, ...) stats or not +# perdevice = true + ## Whether to report for each container total blkio and network stats or not +# total = true + ## Which environment variables should we use as a tag + ##tag_env = ["JAVA_HOME", "HEAP_SIZE"] + + ## docker labels to include and exclude as tags. Globs accepted. + ## Note that an empty array for both will include all labels as tags +# docker_label_include = [] +# docker_label_exclude = [] + + ## Optional TLS Config + # tls_ca = "/etc/telegraf/ca.pem" + # tls_cert = "/etc/telegraf/cert.pem" + # tls_key = "/etc/telegraf/key.pem" + ## Use TLS but skip chain & host verification + # insecure_skip_verify = false +# fieldpass = ["n_containers", "n_containers_running", "n_containers_stopped", "n_containers_paused", "n_images"] + #fieldpass = ["numContainers", "numContainersRunning", "numContainersStopped", "numContainersPaused", "numContainerImages"] +# taginclude = ["nodeName"] +[[inputs.exec]] + ## Commands array + interval = "15m" + commands = [ + "/opt/microsoft/docker-cimprov/bin/TelegrafTCPErrorTelemetry.sh" + ] + + ## Timeout for each command to complete. + timeout = "15s" + + ## measurement name suffix (for separating different commands) + name_suffix = "_telemetry" + + ## Data format to consume. 
+ ## Each data format has its own unique set of configuration options, read + ## more about them here: + ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md + data_format = "influx" + tagexclude = ["hostName"] + diff --git a/installer/datafiles/base_container.data b/installer/datafiles/base_container.data index 9c4d563f8..996c7501a 100644 --- a/installer/datafiles/base_container.data +++ b/installer/datafiles/base_container.data @@ -98,6 +98,8 @@ MAINTAINER: 'Microsoft Corporation' /opt/td-agent-bit/bin/out_oms.so; intermediate/${{BUILD_CONFIGURATION}}/out_oms.so; 755; root; root /etc/opt/microsoft/docker-cimprov/td-agent-bit.conf; installer/conf/td-agent-bit.conf; 644; root; root /etc/opt/microsoft/docker-cimprov/out_oms.conf; installer/conf/out_oms.conf; 644; root; root +/etc/opt/microsoft/docker-cimprov/telegraf.conf; installer/conf/telegraf.conf; 644; root; root +/opt/microsoft/docker-cimprov/bin/TelegrafTCPErrorTelemetry.sh; installer/scripts/TelegrafTCPErrorTelemetry.sh; 755; root; root %Links /opt/omi/lib/libcontainer.${{SHLIB_EXT}}; /opt/microsoft/docker-cimprov/lib/libcontainer.${{SHLIB_EXT}}; 644; root; root @@ -137,6 +139,7 @@ MAINTAINER: 'Microsoft Corporation' /opt/td-agent-bit; 755; root; root;sysdir /opt/td-agent-bit/bin; 755; root; root;sysdir +/etc/telegraf; 755; root; root;sysdir /opt/microsoft/omsagent/plugin/lib; 755; root; root; sysdir /opt/microsoft/omsagent/plugin/lib/application_insights; 755; root; root; sysdir diff --git a/installer/scripts/TelegrafTCPErrorTelemetry.sh b/installer/scripts/TelegrafTCPErrorTelemetry.sh new file mode 100644 index 000000000..637af3969 --- /dev/null +++ b/installer/scripts/TelegrafTCPErrorTelemetry.sh @@ -0,0 +1,3 @@ +#!/bin/sh +countErr=$(grep -iF "socket_writer" /var/opt/microsoft/docker-cimprov/log/telegraf.log | wc -l | tr -d '\n') +echo "telegraf,AKS_RESOURCE_ID=${AKS_RESOURCE_ID} telegrafTCPWriteErrorCountTotal=${countErr}i" \ No newline at end of file diff --git 
a/source/code/go/src/plugins/oms.go b/source/code/go/src/plugins/oms.go index a1ca3d6ee..269d16111 100644 --- a/source/code/go/src/plugins/oms.go +++ b/source/code/go/src/plugins/oms.go @@ -23,10 +23,31 @@ import ( ) // DataType for Container Log -const DataType = "CONTAINER_LOG_BLOB" +const ContainerLogDataType = "CONTAINER_LOG_BLOB" + +// DataType for Insights metric +const InsightsMetricsDataType = "INSIGHTS_METRICS_BLOB" + +//env varibale which has ResourceId for LA +const ResourceIdEnv = "AKS_RESOURCE_ID" + +//env variable which has ResourceName for NON-AKS +const ResourceNameEnv = "ACS_RESOURCE_NAME" + +// Origin prefix for telegraf Metrics (used as prefix for origin field & prefix for azure monitor specific tags) +const TelegrafMetricOriginPrefix = "container.azm.ms" +// Origin suffix for telegraf Metrics (used as suffix for origin field) +const TelegrafMetricOriginSuffix = "telegraf" +// Namespace prefix for telegraf Metrics (used as prefix for Namespace field) +//const TelegrafMetricNamespacePrefix = "plugin" +// clusterName tag +const TelegrafTagClusterName = "clusterName" +// clusterId tag +const TelegrafTagClusterID = "clusterId" // ContainerLogPluginConfFilePath --> config file path for container log plugin -const ContainerLogPluginConfFilePath = "/etc/opt/microsoft/docker-cimprov/out_oms.conf" +const DaemonSetContainerLogPluginConfFilePath = "/etc/opt/microsoft/docker-cimprov/out_oms.conf" +const ReplicaSetContainerLogPluginConfFilePath = "/etc/opt/microsoft/docker-cimprov/out_oms-rs.conf" // IPName for Container Log const IPName = "Containers" @@ -44,10 +65,12 @@ var ( Computer string // WorkspaceID log analytics workspace id WorkspaceID string - // ResourceID for resource-centric log analytics data + // ResourceID for resource-centric log analytics data ResourceID string // Resource-centric flag (will be true if we determine if above RseourceID is non-empty - default is false) ResourceCentric bool + //ResourceName + ResourceName string ) var ( @@ 
-92,6 +115,26 @@ type DataItem struct { Computer string `json:"Computer"` } +// telegraf metric DataItem represents the object corresponding to the json that is sent by fluentbit tail plugin +type laTelegrafMetric struct { + // 'golden' fields + Origin string `json:"Origin"` + Namespace string `json:"Namespace"` + Name string `json:"Name"` + Value float64 `json:"Value"` + Tags string `json:"Tags"` + // specific required fields for LA + CollectionTime string `json:"CollectionTime"` //mapped to TimeGenerated + Computer string `json:"Computer"` +} + +// ContainerLogBlob represents the object corresponding to the payload that is sent to the ODS end point +type InsightsMetricsBlob struct { + DataType string `json:"DataType"` + IPName string `json:"IPName"` + DataItems []laTelegrafMetric `json:"DataItems"` +} + // ContainerLogBlob represents the object corresponding to the payload that is sent to the ODS end point type ContainerLogBlob struct { DataType string `json:"DataType"` @@ -207,6 +250,174 @@ func updateKubeSystemContainerIDs() { } } +//Azure loganalytics metric values have to be numeric, so string values are dropped +func convert(in interface{}) (float64, bool) { + switch v := in.(type) { + case int64: + return float64(v), true + case uint64: + return float64(v), true + case float64: + return v, true + case bool: + if v { + return float64(1), true + } + return float64(0), true + default: + Log ("returning 0 for %v ", in) + return float64(0), false + } +} + +//Translates telegraf time series to one or more Azure loganalytics metric(s) +func translateTelegrafMetrics(m map[interface{}]interface{}) ([]*laTelegrafMetric, error) { + + var laMetrics []*laTelegrafMetric + var tags map[interface{}]interface{} + tags = m["tags"].(map[interface{}]interface{}) + tagMap := make(map[string]string) + for k, v := range tags { + key := fmt.Sprintf("%s",k) + if key == "" { + continue + } + tagMap[key] = fmt.Sprintf("%s",v) + } + + //add azure monitor tags + 
tagMap[fmt.Sprintf("%s/%s", TelegrafMetricOriginPrefix, TelegrafTagClusterID)] = ResourceID + tagMap[fmt.Sprintf("%s/%s", TelegrafMetricOriginPrefix, TelegrafTagClusterName)] = ResourceName + + var fieldMap map[interface{}]interface{} + fieldMap = m["fields"].(map[interface{}]interface{}) + + tagJson, err := json.Marshal(&tagMap) + + if err != nil { + return nil, err + } + + for k, v := range fieldMap { + fv, ok := convert(v) + if !ok { + continue + } + i := m["timestamp"].(uint64) + laMetric := laTelegrafMetric{ + Origin: fmt.Sprintf("%s/%s", TelegrafMetricOriginPrefix, TelegrafMetricOriginSuffix), + //Namespace: fmt.Sprintf("%s/%s", TelegrafMetricNamespacePrefix, m["name"]), + Namespace: fmt.Sprintf("%s", m["name"]), + Name: fmt.Sprintf("%s",k), + Value: fv, + Tags: fmt.Sprintf("%s", tagJson), + CollectionTime: time.Unix(int64(i),0).Format(time.RFC3339), + Computer: Computer, //this is the collection agent's computer name, not necessarily to which computer the metric applies to + } + + //Log ("la metric:%v", laMetric) + laMetrics = append(laMetrics, &laMetric) + } + return laMetrics, nil +} + +//send metrics from Telegraf to LA. 1) Translate telegraf timeseries to LA metric(s) 2) Send it to LA as 'InsightsMetrics' fixed type +func PostTelegrafMetricsToLA(telegrafRecords []map[interface{}]interface{}) int { + var laMetrics []*laTelegrafMetric + + if ( (telegrafRecords== nil) || ! (len(telegrafRecords) > 0) ) { + Log("PostTelegrafMetricsToLA::Error:no timeseries to derive") + return output.FLB_OK + } + + for _, record := range telegrafRecords { + translatedMetrics, err := translateTelegrafMetrics(record) + if err != nil { + message := fmt.Sprintf("PostTelegrafMetricsToLA::Error:when translating telegraf metric to log analytics metric %q", err) + Log(message) + //SendException(message) //This will be too noisy + } + laMetrics = append(laMetrics, translatedMetrics...) 
+ } + + if ( (laMetrics == nil) || !(len(laMetrics) > 0) ) { + Log("PostTelegrafMetricsToLA::Info:no metrics derived from timeseries data") + return output.FLB_OK + } else { + message := fmt.Sprintf("PostTelegrafMetricsToLA::Info:derived %v metrics from %v timeseries", len(laMetrics), len(telegrafRecords)) + Log(message) + } + + var metrics []laTelegrafMetric + var i int + + for i=0; i < len(laMetrics); i++ { + metrics = append(metrics, *laMetrics[i]) + } + + laTelegrafMetrics := InsightsMetricsBlob{ + DataType: InsightsMetricsDataType, + IPName: IPName, + DataItems: metrics} + + jsonBytes, err := json.Marshal(laTelegrafMetrics) + + if err != nil { + message := fmt.Sprintf("PostTelegrafMetricsToLA::Error:when marshalling json %q", err) + Log(message) + SendException(message) + return output.FLB_OK + } + + //Post metrics data to LA + req, _ := http.NewRequest("POST", OMSEndpoint, bytes.NewBuffer(jsonBytes)) + + //req.URL.Query().Add("api-version","2016-04-01") + + //set headers + req.Header.Set("x-ms-date", time.Now().Format(time.RFC3339)) + + //expensive to do string len for every request, so use a flag + if ResourceCentric == true { + req.Header.Set("x-ms-AzureResourceId", ResourceID) + } + + start := time.Now() + resp, err := HTTPClient.Do(req) + elapsed := time.Since(start) + + if err != nil { + message := fmt.Sprintf("PostTelegrafMetricsToLA::Error:(retriable) when sending %v metrics. 
duration:%v err:%q \n", len(laMetrics), elapsed, err.Error()) + Log(message) + SendException(message) + UpdateNumTelegrafMetricsSentTelemetry(0, 1) + return output.FLB_RETRY + } + + if resp == nil || resp.StatusCode != 200 { + if resp != nil { + Log("PostTelegrafMetricsToLA::Error:(retriable) Response Status %v Status Code %v", resp.Status, resp.StatusCode) + } + UpdateNumTelegrafMetricsSentTelemetry(0, 1) + return output.FLB_RETRY + } + + defer resp.Body.Close() + + numMetrics := len(laMetrics) + UpdateNumTelegrafMetricsSentTelemetry(numMetrics, 0) + Log("PostTelegrafMetricsToLA::Info:Successfully flushed %v records in %v", numMetrics, elapsed) + + return output.FLB_OK +} + +func UpdateNumTelegrafMetricsSentTelemetry(numMetricsSent int, numSendErrors int) { + ContainerLogTelemetryMutex.Lock() + TelegrafMetricsSentCount += float64(numMetricsSent) + TelegrafMetricsSendErrorCount += float64(numSendErrors) + ContainerLogTelemetryMutex.Unlock() +} + // PostDataHelper sends data to the OMS endpoint func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { @@ -285,7 +496,7 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { if len(dataItems) > 0 { logEntry := ContainerLogBlob{ - DataType: DataType, + DataType: ContainerLogDataType, IPName: IPName, DataItems: dataItems} @@ -384,14 +595,30 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { log.Fatalln(message) } OMSEndpoint = omsadminConf["OMS_ENDPOINT"] + Log("OMSEndpoint %s", OMSEndpoint) + WorkspaceID = omsadminConf["WORKSPACE_ID"] ResourceID = os.Getenv("customResourceId") + if len(ResourceID) > 0 { + //AKS Scenario ResourceCentric = true - Log("OMS ResourceId=%s",ResourceID) + splitted := strings.Split(ResourceID, "/") + ResourceName = splitted[len(splitted)-1] + Log("ResourceCentric: True") + Log("ResourceID=%s",ResourceID) + Log("ResourceName=%s",ResourceID) + } + + if ResourceCentric == false { + //AKS-Engine/hybrid scenario + ResourceName = 
os.Getenv(ResourceNameEnv) + ResourceID = ResourceName + Log("ResourceCentric: False") + Log("ResourceID=%s",ResourceID) + Log("ResourceName=%s",ResourceName) } - Log("OMSEndpoint %s", OMSEndpoint) - + // Initialize image,name map refresh ticker containerInventoryRefreshInterval, err := strconv.Atoi(pluginConfig["container_inventory_refresh_interval"]) if err != nil { diff --git a/source/code/go/src/plugins/out_oms.go b/source/code/go/src/plugins/out_oms.go index 133e0f039..dccc6774c 100644 --- a/source/code/go/src/plugins/out_oms.go +++ b/source/code/go/src/plugins/out_oms.go @@ -2,11 +2,13 @@ package main import ( "github.com/fluent/fluent-bit-go/output" + "github.com/Microsoft/ApplicationInsights-Go/appinsights" ) import ( "C" "strings" "unsafe" + "os" ) //export FLBPluginRegister @@ -19,8 +21,14 @@ func FLBPluginRegister(ctx unsafe.Pointer) int { // ctx (context) pointer to fluentbit context (state/ c code) func FLBPluginInit(ctx unsafe.Pointer) int { Log("Initializing out_oms go plugin for fluentbit") - agentVersion := output.FLBPluginConfigKey(ctx, "AgentVersion") - InitializePlugin(ContainerLogPluginConfFilePath, agentVersion) + agentVersion := os.Getenv("AGENT_VERSION") + if strings.Compare(strings.ToLower(os.Getenv("CONTROLLER_TYPE")), "replicaset") == 0 { + Log("Using %s for plugin config \n", ReplicaSetContainerLogPluginConfFilePath) + InitializePlugin(ReplicaSetContainerLogPluginConfFilePath, agentVersion) + } else { + Log("Using %s for plugin config \n", DaemonSetContainerLogPluginConfFilePath) + InitializePlugin(DaemonSetContainerLogPluginConfFilePath, agentVersion) + } enableTelemetry := output.FLBPluginConfigKey(ctx, "EnableTelemetry") if strings.Compare(strings.ToLower(enableTelemetry), "true") == 0 { telemetryPushInterval := output.FLBPluginConfigKey(ctx, "TelemetryPushIntervalSeconds") @@ -51,9 +59,13 @@ func FLBPluginFlush(data unsafe.Pointer, length C.int, tag *C.char) int { records = append(records, record) } - incomingTag := C.GoString(tag) - 
if strings.Contains(strings.ToLower(incomingTag), "oms.container.log.flbplugin") { - return PushToAppInsightsTraces(records) + incomingTag := strings.ToLower(C.GoString(tag)) + if strings.Contains(incomingTag, "oms.container.log.flbplugin") { + return PushToAppInsightsTraces(records, appinsights.Information, incomingTag) + } else if strings.Contains(incomingTag, "oms.container.perf.telegraf") { + return PostTelegrafMetricsToLA(records) + } else if strings.Contains(incomingTag, "oms.container.log.telegraf.err") { + return PushToAppInsightsTraces(records, appinsights.Error, incomingTag) } return PostDataHelper(records) diff --git a/source/code/go/src/plugins/telemetry.go b/source/code/go/src/plugins/telemetry.go index a64ca2218..f507e4ab9 100644 --- a/source/code/go/src/plugins/telemetry.go +++ b/source/code/go/src/plugins/telemetry.go @@ -9,11 +9,12 @@ import ( "time" "github.com/Microsoft/ApplicationInsights-Go/appinsights" + "github.com/Microsoft/ApplicationInsights-Go/appinsights/contracts" "github.com/fluent/fluent-bit-go/output" ) var ( - // FlushedRecordsCount indicates the number of flushed records in the current period + // FlushedRecordsCount indicates the number of flushed log records in the current period FlushedRecordsCount float64 // FlushedRecordsTimeTaken indicates the cumulative time taken to flush the records for the current period FlushedRecordsTimeTaken float64 @@ -27,19 +28,23 @@ var ( TelemetryClient appinsights.TelemetryClient // ContainerLogTelemetryTicker sends telemetry periodically ContainerLogTelemetryTicker *time.Ticker + //Tracks the number of telegraf metrics sent successfully between telemetry ticker periods (uses ContainerLogTelemetryTicker) + TelegrafMetricsSentCount float64 + //Tracks the number of send errors between telemetry ticker periods (uses ContainerLogTelemetryTicker) + TelegrafMetricsSendErrorCount float64 ) const ( clusterTypeACS = "ACS" clusterTypeAKS = "AKS" - controllerTypeDaemonSet = "DaemonSet" - 
controllerTypeReplicaSet = "ReplicaSet" envAKSResourceID = "AKS_RESOURCE_ID" envACSResourceName = "ACS_RESOURCE_NAME" envAppInsightsAuth = "APPLICATIONINSIGHTS_AUTH" metricNameAvgFlushRate = "ContainerLogAvgRecordsFlushedPerSec" metricNameAvgLogGenerationRate = "ContainerLogsGeneratedPerSec" metricNameAgentLogProcessingMaxLatencyMs = "ContainerLogsAgentSideLatencyMs" + metricNameNumberofTelegrafMetricsSentSuccessfully = "TelegrafMetricsSentCount" + metricNameNumberofSendErrorsTelegrafMetrics = "TelegrafMetricsSendErrorCount" defaultTelemetryPushIntervalSeconds = 300 @@ -63,9 +68,14 @@ func SendContainerLogPluginMetrics(telemetryPushIntervalProperty string) { for ; true; <-ContainerLogTelemetryTicker.C { SendEvent(eventNameDaemonSetHeartbeat, make(map[string]string)) elapsed := time.Since(start) + ContainerLogTelemetryMutex.Lock() flushRate := FlushedRecordsCount / FlushedRecordsTimeTaken * 1000 logRate := FlushedRecordsCount / float64(elapsed/time.Second) + telegrafMetricsSentCount := TelegrafMetricsSentCount + telegrafMetricsSendErrorCount := TelegrafMetricsSendErrorCount + TelegrafMetricsSentCount = 0.0 + TelegrafMetricsSendErrorCount = 0.0 FlushedRecordsCount = 0.0 FlushedRecordsTimeTaken = 0.0 logLatencyMs := AgentLogProcessingMaxLatencyMs @@ -81,6 +91,8 @@ func SendContainerLogPluginMetrics(telemetryPushIntervalProperty string) { logLatencyMetric := appinsights.NewMetricTelemetry(metricNameAgentLogProcessingMaxLatencyMs, logLatencyMs) logLatencyMetric.Properties["Container"] = logLatencyMsContainer TelemetryClient.Track(logLatencyMetric) + TelemetryClient.Track(appinsights.NewMetricTelemetry(metricNameNumberofTelegrafMetricsSentSuccessfully, telegrafMetricsSentCount)) + TelemetryClient.Track(appinsights.NewMetricTelemetry(metricNameNumberofSendErrorsTelegrafMetrics, telegrafMetricsSendErrorCount)) start = time.Now() } } @@ -129,7 +141,7 @@ func InitializeTelemetryClient(agentVersion string) (int, error) { CommonProperties = make(map[string]string) 
CommonProperties["Computer"] = Computer CommonProperties["WorkspaceID"] = WorkspaceID - CommonProperties["ControllerType"] = controllerTypeDaemonSet + CommonProperties["ControllerType"] = os.Getenv("CONTROLLER_TYPE") CommonProperties["AgentVersion"] = agentVersion aksResourceID := os.Getenv(envAKSResourceID) @@ -164,13 +176,15 @@ func InitializeTelemetryClient(agentVersion string) (int, error) { } // PushToAppInsightsTraces sends the log lines as trace messages to the configured App Insights Instance -func PushToAppInsightsTraces(records []map[interface{}]interface{}) int { +func PushToAppInsightsTraces(records []map[interface{}]interface{}, severityLevel contracts.SeverityLevel, tag string) int { var logLines []string for _, record := range records { logLines = append(logLines, ToString(record["log"])) } traceEntry := strings.Join(logLines, "\n") - TelemetryClient.TrackTrace(traceEntry, 1) + traceTelemetryItem := appinsights.NewTraceTelemetry(traceEntry, severityLevel) + traceTelemetryItem.Properties["tag"] = tag + TelemetryClient.Track(traceTelemetryItem) return output.FLB_OK } From 8cdf72437b3af7b49e6931602a2f2218deea8fbe Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Wed, 17 Apr 2019 19:20:57 -0700 Subject: [PATCH 088/160] Fix telemetry error for telegraf err count metric (#215) --- installer/scripts/TelegrafTCPErrorTelemetry.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/installer/scripts/TelegrafTCPErrorTelemetry.sh b/installer/scripts/TelegrafTCPErrorTelemetry.sh index 637af3969..2bd58b202 100644 --- a/installer/scripts/TelegrafTCPErrorTelemetry.sh +++ b/installer/scripts/TelegrafTCPErrorTelemetry.sh @@ -1,3 +1,3 @@ #!/bin/sh countErr=$(grep -iF "socket_writer" /var/opt/microsoft/docker-cimprov/log/telegraf.log | wc -l | tr -d '\n') -echo "telegraf,AKS_RESOURCE_ID=${AKS_RESOURCE_ID} telegrafTCPWriteErrorCountTotal=${countErr}i" \ No newline at end of file +echo "telegraf,Source=telegrafErrLog 
telegrafTCPWriteErrorCountTotal=${countErr}i" \ No newline at end of file From 36c8037370bd6b98e36f1e03efdefa8de495d32e Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Thu, 30 May 2019 17:01:01 -0700 Subject: [PATCH 089/160] Fix Unscheduled Pod bug, remove excess telemetry (#218) * Fix Unscheduled Pod bug, remove excess telemetry * Send Success Telemetry only once after startup for a node in a cluster for MDM Post * Sending telemetry for successful push to MDM every hour --- source/code/plugin/filter_inventory2mdm.rb | 128 +++++++++++---------- source/code/plugin/out_mdm.rb | 7 +- 2 files changed, 73 insertions(+), 62 deletions(-) diff --git a/source/code/plugin/filter_inventory2mdm.rb b/source/code/plugin/filter_inventory2mdm.rb index 553c857b7..f98a3224e 100644 --- a/source/code/plugin/filter_inventory2mdm.rb +++ b/source/code/plugin/filter_inventory2mdm.rb @@ -10,11 +10,11 @@ module Fluent class Inventory2MdmFilter < Filter Fluent::Plugin.register_filter('filter_inventory2mdm', self) - + config_param :enable_log, :integer, :default => 0 config_param :log_path, :string, :default => '/var/opt/microsoft/docker-cimprov/log/filter_inventory2mdm.log' config_param :custom_metrics_azure_regions, :string - + @@node_count_metric_name = 'nodesCount' @@pod_count_metric_name = 'podCount' @@pod_inventory_tag = 'mdm.kubepodinventory' @@ -23,63 +23,63 @@ class Inventory2MdmFilter < Filter @@node_status_not_ready = 'NotReady' @@node_inventory_custom_metrics_template = ' - { - "time": "%{timestamp}", - "data": { - "baseData": { - "metric": "%{metricName}", - "namespace": "insights.container/nodes", - "dimNames": [ + { + "time": "%{timestamp}", + "data": { + "baseData": { + "metric": "%{metricName}", + "namespace": "insights.container/nodes", + "dimNames": [ "status" - ], - "series": [ - { - "dimValues": [ + ], + "series": [ + { + "dimValues": [ "%{statusValue}" - ], + ], "min": %{node_status_count}, - "max": %{node_status_count}, - "sum": %{node_status_count}, + "max": 
%{node_status_count}, + "sum": %{node_status_count}, "count": 1 - } - ] - } - } + } + ] + } + } }' @@pod_inventory_custom_metrics_template = ' - { - "time": "%{timestamp}", - "data": { - "baseData": { - "metric": "%{metricName}", - "namespace": "insights.container/pods", - "dimNames": [ - "phase", - "Kubernetes namespace", - "node", + { + "time": "%{timestamp}", + "data": { + "baseData": { + "metric": "%{metricName}", + "namespace": "insights.container/pods", + "dimNames": [ + "phase", + "Kubernetes namespace", + "node", "controllerName" - ], - "series": [ - { - "dimValues": [ - "%{phaseDimValue}", - "%{namespaceDimValue}", - "%{nodeDimValue}", + ], + "series": [ + { + "dimValues": [ + "%{phaseDimValue}", + "%{namespaceDimValue}", + "%{nodeDimValue}", "%{controllerNameDimValue}" - ], + ], "min": %{podCountMetricValue}, - "max": %{podCountMetricValue}, - "sum": %{podCountMetricValue}, - "count": 1 - } - ] - } - } + "max": %{podCountMetricValue}, + "sum": %{podCountMetricValue}, + "count": 1 + } + ] + } + } }' - + @@pod_phase_values = ['Running', 'Pending', 'Succeeded', 'Failed', 'Unknown'] - + @process_incoming_stream = true def initialize @@ -89,7 +89,7 @@ def initialize def configure(conf) super @log = nil - + if @enable_log @log = Logger.new(@log_path, 1, 5000000) @log.debug {'Starting filter_inventory2mdm plugin'} @@ -105,15 +105,15 @@ def start def shutdown super end - + def process_node_inventory_records(es) timestamp = DateTime.now - + begin node_ready_count = 0 node_not_ready_count = 0 records = [] - + es.each{|time,record| begin timestamp = record['DataItems'][0]['CollectionTime'] @@ -129,15 +129,15 @@ def process_node_inventory_records(es) ready_record = @@node_inventory_custom_metrics_template % { timestamp: timestamp, - metricName: @@node_count_metric_name, + metricName: @@node_count_metric_name, statusValue: @@node_status_ready, node_status_count: node_ready_count } records.push(JSON.parse(ready_record)) - + not_ready_record = 
@@node_inventory_custom_metrics_template % { timestamp: timestamp, - metricName: @@node_count_metric_name, + metricName: @@node_count_metric_name, statusValue: @@node_status_not_ready, node_status_count: node_not_ready_count } @@ -164,7 +164,7 @@ def process_pod_inventory_records(es) record_count += 1 timestamp = record['DataItems'][0]['CollectionTime'] podUid = record['DataItems'][0]['PodUid'] - + if podUids.key?(podUid) #@log.info "pod with #{podUid} already counted" next @@ -176,6 +176,12 @@ def process_pod_inventory_records(es) podControllerNameDimValue = record['DataItems'][0]['ControllerName'] podNodeDimValue = record['DataItems'][0]['Computer'] + if podNodeDimValue.empty? && podPhaseDimValue.downcase == 'pending' + podNodeDimValue = 'unscheduled' + elsif podNodeDimValue.empty? + podNodeDimValue = 'unknown' + end + # group by distinct dimension values pod_key = [podNodeDimValue, podNamespaceDimValue, podControllerNameDimValue, podPhaseDimValue].join('~~') @@ -197,7 +203,7 @@ def process_pod_inventory_records(es) pod_count = 1 pod_count_hash[pod_key] = pod_count end - + # Collect all possible combinations of dimension values other than pod phase key_without_phase_dim_value = [podNodeDimValue, podNamespaceDimValue, podControllerNameDimValue].join('~~') if no_phase_dim_values_hash.key?(key_without_phase_dim_value) @@ -237,9 +243,9 @@ def process_pod_inventory_records(es) timestamp: timestamp, metricName: @@pod_count_metric_name, phaseDimValue: podPhaseDimValue, - namespaceDimValue: podNamespaceDimValue, - nodeDimValue: podNodeDimValue, - controllerNameDimValue: podControllerNameDimValue, + namespaceDimValue: podNamespaceDimValue, + nodeDimValue: podNodeDimValue, + controllerNameDimValue: podControllerNameDimValue, podCountMetricValue: value } records.push(JSON.parse(record)) @@ -265,11 +271,11 @@ def filter_stream(tag, es) elsif tag.downcase.start_with?(@@pod_inventory_tag) @log.info 'Processing POD inventory records in filter plugin to send to MDM' 
filtered_records, time = process_pod_inventory_records(es) - else + else filtered_records = [] end end - filtered_records.each {|filtered_record| + filtered_records.each {|filtered_record| new_es.add(time, filtered_record) if filtered_record } if filtered_records rescue => e diff --git a/source/code/plugin/out_mdm.rb b/source/code/plugin/out_mdm.rb index 351198afe..68c43d5da 100644 --- a/source/code/plugin/out_mdm.rb +++ b/source/code/plugin/out_mdm.rb @@ -31,6 +31,7 @@ def initialize @last_post_attempt_time = Time.now @first_post_attempt_made = false @can_send_data_to_mdm = true + @last_telemetry_sent_time = nil end def configure(conf) @@ -156,7 +157,11 @@ def send_to_mdm(post_body) response = @http_client.request(request) response.value # this throws for non 200 HTTP response code @log.info "HTTP Post Response Code : #{response.code}" - ApplicationInsightsUtility.sendCustomEvent("AKSCustomMetricsMDMSendSuccessful", {}) + if @last_telemetry_sent_time.nil? || @last_telemetry_sent_time + 60 * 60 < Time.now + ApplicationInsightsUtility.sendCustomEvent("AKSCustomMetricsMDMSendSuccessful", {}) + @last_telemetry_sent_time = Time.now + end + rescue Net::HTTPServerException => e @log.info "Failed to Post Metrics to MDM : #{e} Response: #{response}" @log.debug_backtrace(e.backtrace) From 803f934cba774bf2abf7594a1025bad88c105e5c Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Wed, 5 Jun 2019 17:32:16 -0700 Subject: [PATCH 090/160] Merge from Vishwa/promstandardmetrics into ci_feature (#220) * enable prometheus metrics collection in replica-set * fixing typos * fix config file path for replicaset * fix configuration * config changes --- installer/conf/td-agent-bit-rs.conf | 29 ++ installer/conf/td-agent-bit.conf | 5 - installer/conf/telegraf-rs.conf | 567 ++++++++++++++++++++++++ installer/conf/telegraf.conf | 107 ++++- installer/datafiles/base_container.data | 2 + source/code/go/src/plugins/oms.go | 10 +- source/code/go/src/plugins/telemetry.go | 20 +- 7 files changed, 709 
insertions(+), 31 deletions(-) create mode 100644 installer/conf/td-agent-bit-rs.conf create mode 100644 installer/conf/telegraf-rs.conf diff --git a/installer/conf/td-agent-bit-rs.conf b/installer/conf/td-agent-bit-rs.conf new file mode 100644 index 000000000..740f8a951 --- /dev/null +++ b/installer/conf/td-agent-bit-rs.conf @@ -0,0 +1,29 @@ +[SERVICE] + Flush 30 + Log_Level info + Parsers_File /etc/td-agent-bit/parsers.conf + Log_File /var/opt/microsoft/docker-cimprov/log/fluent-bit.log + +[INPUT] + Name tail + Tag oms.container.log.telegraf.err.* + Path /var/opt/microsoft/docker-cimprov/log/telegraf.log + DB /var/opt/microsoft/docker-cimprov/state/telegraf-log-state.db + Mem_Buf_Limit 2m + Path_Key filepath + Skip_Long_Lines On + Ignore_Older 5m + +[INPUT] + Name tcp + Tag oms.container.perf.telegraf.* + Listen 0.0.0.0 + Port 25226 + Chunk_Size 32 + Buffer_Size 64 + +[OUTPUT] + Name oms + EnableTelemetry true + TelemetryPushIntervalSeconds 300 + Match oms.container.* diff --git a/installer/conf/td-agent-bit.conf b/installer/conf/td-agent-bit.conf index 88bacaca2..50967e61f 100644 --- a/installer/conf/td-agent-bit.conf +++ b/installer/conf/td-agent-bit.conf @@ -43,11 +43,6 @@ Chunk_Size 32 Buffer_Size 64 -[FILTER] - Name grep - Match oms.container.log.telegraf.err.* - #Regex log /^(?:(?!\[azure_monitor\]: failed to write batch: \[403\] 403 Forbidden).)*$/ - [OUTPUT] Name oms EnableTelemetry true diff --git a/installer/conf/telegraf-rs.conf b/installer/conf/telegraf-rs.conf new file mode 100644 index 000000000..cb9a36685 --- /dev/null +++ b/installer/conf/telegraf-rs.conf @@ -0,0 +1,567 @@ +# Telegraf Configuration +# +# Telegraf is entirely plugin driven. All metrics are gathered from the +# declared inputs, and sent to the declared outputs. +# +# Plugins must be declared in here to be active. +# To deactivate a plugin, comment out the name and any variables. +# +# Use 'telegraf -config telegraf.conf -test' to see what metrics a config +# file would generate. 
+# +# Environment variables can be used anywhere in this config file, simply prepend +# them with $. For strings the variable must be within quotes (ie, "$STR_VAR"), +# for numbers and booleans they should be plain (ie, $INT_VAR, $BOOL_VAR) + + +# Global tags can be specified here in key="value" format. +[global_tags] + #Below are entirely used for telemetry + #AgentVersion = "$AGENT_VERSION" + #AKS_RESOURCE_ID = "$TELEMETRY_AKS_RESOURCE_ID" + #ACS_RESOURCE_NAME = "$TELEMETRY_ACS_RESOURCE_NAME" + #Region = "$TELEMETRY_AKS_REGION" + #ClusterName = "$TELEMETRY_CLUSTER_NAME" + #ClusterType = "$TELEMETRY_CLUSTER_TYPE" + #Computer = "placeholder_hostname" + #ControllerType = "$CONTROLLER_TYPE" + + #hostName = "placeholder_hostname" + + +# Configuration for telegraf agent +[agent] + ## Default data collection interval for all inputs + interval = "60s" + ## Rounds collection interval to 'interval' + ## ie, if interval="10s" then always collect on :00, :10, :20, etc. + round_interval = true + + ## Telegraf will send metrics to outputs in batches of at most + ## metric_batch_size metrics. + ## This controls the size of writes that Telegraf sends to output plugins. + metric_batch_size = 1000 + + ## For failed writes, telegraf will cache metric_buffer_limit metrics for each + ## output, and will flush this buffer on a successful write. Oldest metrics + ## are dropped first when this buffer fills. + ## This buffer only fills when writes fail to output plugin(s). + metric_buffer_limit = 10000 + + ## Collection jitter is used to jitter the collection by a random amount. + ## Each plugin will sleep for a random time within jitter before collecting. + ## This can be used to avoid many plugins querying things like sysfs at the + ## same time, which can have a measurable effect on the system. + collection_jitter = "0s" + + ## Default flushing interval for all outputs. You shouldn't set this below + ## interval. 
Maximum flush_interval will be flush_interval + flush_jitter + flush_interval = "60s" + ## Jitter the flush interval by a random amount. This is primarily to avoid + ## large write spikes for users running a large number of telegraf instances. + ## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s + flush_jitter = "0s" + + ## By default or when set to "0s", precision will be set to the same + ## timestamp order as the collection interval, with the maximum being 1s. + ## ie, when interval = "10s", precision will be "1s" + ## when interval = "250ms", precision will be "1ms" + ## Precision will NOT be used for service inputs. It is up to each individual + ## service input to set the timestamp at the appropriate precision. + ## Valid time units are "ns", "us" (or "µs"), "ms", "s". + precision = "" + + ## Logging configuration: + ## Run telegraf with debug log messages. + debug = false + ## Run telegraf in quiet mode (error log messages only). + quiet = true + ## Specify the log file name. The empty string means to log to stderr. + logfile = "/var/opt/microsoft/docker-cimprov/log/telegraf.log" + + ## Override default hostname, if empty use os.Hostname() + #hostname = "placeholder_hostname" + ## If set to true, do no set the "host" tag in the telegraf agent. + omit_hostname = true + + +############################################################################### +# OUTPUT PLUGINS # +############################################################################### + +# Generic socket writer capable of handling multiple socket types. 
+[[outputs.socket_writer]] + ## URL to connect to + address = "tcp://0.0.0.0:25226" + # address = "tcp://example.com:http" + # address = "tcp4://127.0.0.1:8094" + # address = "tcp6://127.0.0.1:8094" + # address = "tcp6://[2001:db8::1]:8094" + # address = "udp://127.0.0.1:8094" + # address = "udp4://127.0.0.1:8094" + # address = "udp6://127.0.0.1:8094" + # address = "unix:///tmp/telegraf.sock" + # address = "unixgram:///tmp/telegraf.sock" + + ## Optional TLS Config + # tls_ca = "/etc/telegraf/ca.pem" + # tls_cert = "/etc/telegraf/cert.pem" + # tls_key = "/etc/telegraf/key.pem" + ## Use TLS but skip chain & host verification + # insecure_skip_verify = false + + ## Period between keep alive probes. + ## Only applies to TCP sockets. + ## 0 disables keep alive probes. + ## Defaults to the OS configuration. + # keep_alive_period = "5m" + + ## Data format to generate. + ## Each data format has its own unique set of configuration options, read + ## more about them here: + ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md + data_format = "json" + namedrop = ["telegraf_telemetry"] + #tagdrop = ["AgentVersion","AKS_RESOURCE_ID", "ACS_RESOURCE_NAME", "Region","ClusterName","ClusterType", "Computer", "ControllerType"] + +[[outputs.application_insights]] + ## Instrumentation key of the Application Insights resource. + instrumentation_key = "$TELEMETRY_APPLICATIONINSIGHTS_KEY" + + ## Timeout for closing (default: 5s). + # timeout = "5s" + + ## Enable additional diagnostic logging. + # enable_diagnostic_logging = false + + ## Context Tag Sources add Application Insights context tags to a tag value. 
+ ## + ## For list of allowed context tag keys see: + ## https://github.com/Microsoft/ApplicationInsights-Go/blob/master/appinsights/contracts/contexttagkeys.go + # [outputs.application_insights.context_tag_sources] + # "ai.cloud.role" = "kubernetes_container_name" + # "ai.cloud.roleInstance" = "kubernetes_pod_name" + namepass = ["telegraf_telemetry"] + #tagdrop = ["nodeName"] + +############################################################################### +# PROCESSOR PLUGINS # +############################################################################### + +# # Perform string processing on tags, fields, and measurements +#[[processors.rename]] + #[[processors.rename.replace]] + # measurement = "disk" + # dest = "nodes" +# [[processors.rename.replace]] +# field = "free" +# dest = "freeBytes" +# [[processors.rename.replace]] +# field = "used" +# dest = "usedBytes" +# [[processors.rename.replace]] +# field = "used_percent" +# dest = "usedPercentage" + #[[processors.rename.replace]] + # measurement = "net" + # dest = "nodes" + #[[processors.rename.replace]] + # field = "bytes_recv" + # dest = "networkBytesReceivedTotal" + #[[processors.rename.replace]] + # field = "bytes_sent" + # dest = "networkBytesSentTotal" + #[[processors.rename.replace]] + # field = "err_in" + # dest = "networkErrorsInTotal" + #[[processors.rename.replace]] + # field = "err_out" + # dest = "networkErrorsOutTotal" + #[[processors.rename.replace]] + # measurement = "kubernetes_pod_volume" + # dest = "pods" + #[[processors.rename.replace]] + # field = "used_bytes" + # dest = "podVolumeUsedBytes" + #[[processors.rename.replace]] + # field = "available_bytes" + # dest = "podVolumeAvailableBytes" + #[[processors.rename.replace]] + # measurement = "kubernetes_pod_network" + # dest = "pods" + #[[processors.rename.replace]] + # field = "tx_errors" + # dest = "podNetworkTxErrorsTotal" + #[[processors.rename.replace]] + # field = "rx_errors" + # dest = "podNetworkRxErrorsTotal" + 
#[[processors.rename.replace]] + # tag = "volume_name" + # dest = "volumeName" + #[[processors.rename.replace]] + # tag = "pod_name" + # dest = "podName" + #[[processors.rename.replace]] + # measurement = "docker" + # dest = "containers" + #[[processors.rename.replace]] + # measurement = "docker_container_status" + # dest = "containers" + #[[processors.rename.replace]] + # field = "n_containers" + # dest = "numContainers" + #[[processors.rename.replace]] + # field = "n_containers_running" + # dest = "numContainersRunning" + #[[processors.rename.replace]] + # field = "n_containers_stopped" + # dest = "numContainersStopped" + #[[processors.rename.replace]] + # field = "n_containers_paused" + # dest = "numContainersPaused" + #[[processors.rename.replace]] + # field = "n_images" + # dest = "numContainerImages" + +# ## Convert a tag value to uppercase +# # [[processors.strings.uppercase]] +# # tag = "method" +# +# ## Convert a field value to lowercase and store in a new field +# # [[processors.strings.lowercase]] +# # field = "uri_stem" +# # dest = "uri_stem_normalised" +# +# ## Trim leading and trailing whitespace using the default cutset +# # [[processors.strings.trim]] +# # field = "message" +# +# ## Trim leading characters in cutset +# # [[processors.strings.trim_left]] +# # field = "message" +# # cutset = "\t" +# +# ## Trim trailing characters in cutset +# # [[processors.strings.trim_right]] +# # field = "message" +# # cutset = "\r\n" +# +# ## Trim the given prefix from the field +# # [[processors.strings.trim_prefix]] +# # field = "my_value" +# # prefix = "my_" +# +# ## Trim the given suffix from the field +# # [[processors.strings.trim_suffix]] +# # field = "read_count" +# # suffix = "_count" + + +# # Print all metrics that pass through this filter. +# [[processors.topk]] +# ## How many seconds between aggregations +# # period = 10 +# +# ## How many top metrics to return +# # k = 10 +# +# ## Over which tags should the aggregation be done. 
Globs can be specified, in +# ## which case any tag matching the glob will aggregated over. If set to an +# ## empty list is no aggregation over tags is done +# # group_by = ['*'] +# +# ## Over which fields are the top k are calculated +# # fields = ["value"] +# +# ## What aggregation to use. Options: sum, mean, min, max +# # aggregation = "mean" +# +# ## Instead of the top k largest metrics, return the bottom k lowest metrics +# # bottomk = false +# +# ## The plugin assigns each metric a GroupBy tag generated from its name and +# ## tags. If this setting is different than "" the plugin will add a +# ## tag (which name will be the value of this setting) to each metric with +# ## the value of the calculated GroupBy tag. Useful for debugging +# # add_groupby_tag = "" +# +# ## These settings provide a way to know the position of each metric in +# ## the top k. The 'add_rank_field' setting allows to specify for which +# ## fields the position is required. If the list is non empty, then a field +# ## will be added to each and every metric for each string present in this +# ## setting. This field will contain the ranking of the group that +# ## the metric belonged to when aggregated over that field. +# ## The name of the field will be set to the name of the aggregation field, +# ## suffixed with the string '_topk_rank' +# # add_rank_fields = [] +# +# ## These settings provide a way to know what values the plugin is generating +# ## when aggregating metrics. The 'add_agregate_field' setting allows to +# ## specify for which fields the final aggregation value is required. If the +# ## list is non empty, then a field will be added to each every metric for +# ## each field present in this setting. This field will contain +# ## the computed aggregation for the group that the metric belonged to when +# ## aggregated over that field. 
+# ## The name of the field will be set to the name of the aggregation field, +# ## suffixed with the string '_topk_aggregate' +# # add_aggregate_fields = [] + + + +############################################################################### +# AGGREGATOR PLUGINS # +############################################################################### + +# # Keep the aggregate basicstats of each metric passing through. +# [[aggregators.basicstats]] +# ## General Aggregator Arguments: +# ## The period on which to flush & clear the aggregator. +# period = "30s" +# ## If true, the original metric will be dropped by the +# ## aggregator and will not get sent to the output plugins. +# drop_original = false + + +# # Create aggregate histograms. +# [[aggregators.histogram]] +# ## The period in which to flush the aggregator. +# period = "30s" +# +# ## If true, the original metric will be dropped by the +# ## aggregator and will not get sent to the output plugins. +# drop_original = false +# +# ## Example config that aggregates all fields of the metric. +# # [[aggregators.histogram.config]] +# # ## The set of buckets. +# # buckets = [0.0, 15.6, 34.5, 49.1, 71.5, 80.5, 94.5, 100.0] +# # ## The name of metric. +# # measurement_name = "cpu" +# +# ## Example config that aggregates only specific fields of the metric. +# # [[aggregators.histogram.config]] +# # ## The set of buckets. +# # buckets = [0.0, 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0] +# # ## The name of metric. +# # measurement_name = "diskio" +# # ## The concrete fields of metric +# # fields = ["io_time", "read_time", "write_time"] + + +# # Keep the aggregate min/max of each metric passing through. +# [[aggregators.minmax]] +# ## General Aggregator Arguments: +# ## The period on which to flush & clear the aggregator. +# period = "30s" +# ## If true, the original metric will be dropped by the +# ## aggregator and will not get sent to the output plugins. 
+# drop_original = false + + +# # Count the occurance of values in fields. +# [[aggregators.valuecounter]] +# ## General Aggregator Arguments: +# ## The period on which to flush & clear the aggregator. +# period = "30s" +# ## If true, the original metric will be dropped by the +# ## aggregator and will not get sent to the output plugins. +# drop_original = false +# ## The fields for which the values will be counted +# fields = [] + + + +############################################################################### +# INPUT PLUGINS # +############################################################################### + +# Read metrics about cpu usage +#[[inputs.cpu]] + ## Whether to report per-cpu stats or not +# percpu = false + ## Whether to report total system cpu stats or not +# totalcpu = true + ## If true, collect raw CPU time metrics. +# collect_cpu_time = false + ## If true, compute and report the sum of all non-idle CPU states. +# report_active = true +# fieldpass = ["usage_active","cluster","node","host","device"] +# taginclude = ["cluster","cpu","node"] + + + +# Read metrics about disk usage by mount point +#[[inputs.disk]] + ## By default stats will be gathered for all mount points. + ## Set mount_points will restrict the stats to only the specified mount points. + # mount_points = ["/"] + + ## Ignore mount points by filesystem type. +# ignore_fs = ["tmpfs", "devtmpfs", "devfs", "overlay", "aufs", "squashfs"] +# fieldpass = ["free", "used", "used_percent"] +# taginclude = ["device","path","hostName"] + # Below due to Bug - https://github.com/influxdata/telegraf/issues/5615 + # ORDER matters here!! 
- i.e the below should be the LAST modifier +# [inputs.disk.tagdrop] +# path = ["/var/lib/kubelet*", "/dev/termination-log", "/var/log", "/etc/hosts", "/etc/resolv.conf", "/etc/hostname", "/etc/kubernetes/host", "/var/lib/docker/containers"] + + +# Read metrics about memory usage +#[[inputs.mem]] +# fieldpass = ["used_percent", "cluster", "node","host","device"] +# taginclude = ["cluster","node"] + + +# Read metrics about network interface usage +#[[inputs.net]] + ## By default, telegraf gathers stats from any up interface (excluding loopback) + ## Setting interfaces will tell it to gather these explicit interfaces, + ## regardless of status. + ## + # interfaces = ["eth0"] + ## + ## On linux systems telegraf also collects protocol stats. + ## Setting ignore_protocol_stats to true will skip reporting of protocol metrics. + ## +# ignore_protocol_stats = true + ## + #fieldpass = ["bytes_recv", "bytes_sent", "err_in", "err_out"] + #fieldpass = ["err_in", "err_out"] + #taginclude = ["interface","nodeName"] + +# Read metrics from the kubernetes kubelet api +#[[inputs.kubernetes]] + ## URL for the kubelet + #url = "http://1.1.1.1:10255" +# url = "http://placeholder_nodeip:10255" + + ## Use bearer token for authorization + # bearer_token = /path/to/bearer/token + + ## Set response_timeout (default 5 seconds) + # response_timeout = "5s" + + ## Optional TLS Config + # tls_ca = /path/to/cafile + # tls_cert = /path/to/certfile + # tls_key = /path/to/keyfile + ## Use TLS but skip chain & host verification + # insecure_skip_verify = false +# fieldpass = ["used_bytes", "available_bytes", "tx_errors", "rx_errors" ] +# taginclude = ["volume_name","nodeName","namespace","pod_name"] +# Read metrics about docker containers +#[[inputs.docker]] + ## Docker Endpoint + ## To use TCP, set endpoint = "tcp://[ip]:[port]" + ## To use environment variables (ie, docker-machine), set endpoint = "ENV" +# endpoint = "unix:///var/run/host/docker.sock" + + ## Set to true to collect Swarm 
metrics(desired_replicas, running_replicas) +# gather_services = false + + ## Only collect metrics for these containers, collect all if empty +# container_names = [] + + ## Containers to include and exclude. Globs accepted. + ## Note that an empty array for both will include all containers +# container_name_include = [] +# container_name_exclude = [] + + ## Container states to include and exclude. Globs accepted. + ## When empty only containers in the "running" state will be captured. +# container_state_include = ['*'] + # container_state_exclude = [] + + ## Timeout for docker list, info, and stats commands +# timeout = "5s" + + ## Whether to report for each container per-device blkio (8:0, 8:1...) and + ## network (eth0, eth1, ...) stats or not +# perdevice = true + ## Whether to report for each container total blkio and network stats or not +# total = true + ## Which environment variables should we use as a tag + ##tag_env = ["JAVA_HOME", "HEAP_SIZE"] + + ## docker labels to include and exclude as tags. Globs accepted. + ## Note that an empty array for both will include all labels as tags +# docker_label_include = [] +# docker_label_exclude = [] + + ## Optional TLS Config + # tls_ca = "/etc/telegraf/ca.pem" + # tls_cert = "/etc/telegraf/cert.pem" + # tls_key = "/etc/telegraf/key.pem" + ## Use TLS but skip chain & host verification + # insecure_skip_verify = false +# fieldpass = ["n_containers", "n_containers_running", "n_containers_stopped", "n_containers_paused", "n_images"] + #fieldpass = ["numContainers", "numContainersRunning", "numContainersStopped", "numContainersPaused", "numContainerImages"] +# taginclude = ["nodeName"] +#[[inputs.prometheus]] + ## An array of urls to scrape metrics from. +# urls = ["https://$KUBERNETES_SERVICE_HOST:$KUBERNETES_SERVICE_PORT/metrics"] +# fieldpass = ["apiserver_request_count"] + +# metric_version = 2 +# url_tag = "scrapeUrl" + + ## An array of Kubernetes services to scrape metrics from. 
+ # kubernetes_services = ["http://my-service-dns.my-namespace:9100/metrics"] + + ## Kubernetes config file to create client from. + # kube_config = "/path/to/kubernetes.config" + + ## Scrape Kubernetes pods for the following prometheus annotations: + ## - prometheus.io/scrape: Enable scraping for this pod + ## - prometheus.io/scheme: If the metrics endpoint is secured then you will need to + ## set this to `https` & most likely set the tls config. + ## - prometheus.io/path: If the metrics path is not /metrics, define it with this annotation. + ## - prometheus.io/port: If port is not 9102 use this annotation + # monitor_kubernetes_pods = true + + ## Use bearer token for authorization. ('bearer_token' takes priority) +# bearer_token = "/var/run/secrets/kubernetes.io/serviceaccount/token" + ## OR + # bearer_token_string = "abc_123" + + ## Specify timeout duration for slower prometheus clients (default is 3s) +# response_timeout = "15s" + + ## Optional TLS Config +# tls_ca = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + #tls_cert = /path/to/certfile + # tls_key = /path/to/keyfile + ## Use TLS but skip chain & host verification +# insecure_skip_verify = true + #tagexclude = ["AgentVersion","AKS_RESOURCE_ID","ACS_RESOURCE_NAME", "Region", "ClusterName", "ClusterType", "Computer", "ControllerType"] +# [inputs.prometheus.tagpass] + +[[inputs.exec]] + ## Commands array + interval = "15m" + commands = [ + "/opt/microsoft/docker-cimprov/bin/TelegrafTCPErrorTelemetry.sh" + ] + + ## Timeout for each command to complete. + timeout = "15s" + + ## measurement name suffix (for separating different commands) + name_suffix = "_telemetry" + + ## Data format to consume. 
+ ## Each data format has its own unique set of configuration options, read + ## more about them here: + ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md + data_format = "influx" + #tagexclude = ["hostName"] + [inputs.exec.tags] + AgentVersion = "$AGENT_VERSION" + AKS_RESOURCE_ID = "$TELEMETRY_AKS_RESOURCE_ID" + ACS_RESOURCE_NAME = "$TELEMETRY_ACS_RESOURCE_NAME" + Region = "$TELEMETRY_AKS_REGION" + ClusterName = "$TELEMETRY_CLUSTER_NAME" + ClusterType = "$TELEMETRY_CLUSTER_TYPE" + Computer = "placeholder_hostname" + ControllerType = "$CONTROLLER_TYPE" + diff --git a/installer/conf/telegraf.conf b/installer/conf/telegraf.conf index 355c88b3d..e7c0d6509 100644 --- a/installer/conf/telegraf.conf +++ b/installer/conf/telegraf.conf @@ -17,14 +17,14 @@ # Global tags can be specified here in key="value" format. [global_tags] #Below are entirely used for telemetry - AgentVersion = "$AGENT_VERSION" - AKS_RESOURCE_ID = "$TELEMETRY_AKS_RESOURCE_ID" - ACS_RESOURCE_NAME = "$TELEMETRY_ACS_RESOURCE_NAME" - Region = "$TELEMETRY_AKS_REGION" - ClusterName = "$TELEMETRY_CLUSTER_NAME" - ClusterType = "$TELEMETRY_CLUSTER_TYPE" - Computer = "placeholder_hostname" - ControllerType = "$CONTROLLER_TYPE" + #AgentVersion = "$AGENT_VERSION" + #AKS_RESOURCE_ID = "$TELEMETRY_AKS_RESOURCE_ID" + #ACS_RESOURCE_NAME = "$TELEMETRY_ACS_RESOURCE_NAME" + #Region = "$TELEMETRY_AKS_REGION" + #ClusterName = "$TELEMETRY_CLUSTER_NAME" + #ClusterType = "$TELEMETRY_CLUSTER_TYPE" + #Computer = "placeholder_hostname" + #ControllerType = "$CONTROLLER_TYPE" hostName = "placeholder_hostname" @@ -122,7 +122,7 @@ ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md data_format = "json" namedrop = ["telegraf_telemetry"] - tagdrop = ["AgentVersion","AKS_RESOURCE_ID", "ACS_RESOURCE_NAME", "Region","ClusterName","ClusterType", "Computer", "ControllerType"] + #tagdrop = ["AgentVersion","AKS_RESOURCE_ID", "ACS_RESOURCE_NAME", "Region","ClusterName","ClusterType", 
"Computer", "ControllerType"] [[outputs.application_insights]] ## Instrumentation key of the Application Insights resource. @@ -392,6 +392,7 @@ # Read metrics about disk usage by mount point [[inputs.disk]] + name_prefix="container.azm.ms/" ## By default stats will be gathered for all mount points. ## Set mount_points will restrict the stats to only the specified mount points. # mount_points = ["/"] @@ -411,9 +412,40 @@ # fieldpass = ["used_percent", "cluster", "node","host","device"] # taginclude = ["cluster","node"] +# Read metrics about disk IO by device +[[inputs.diskio]] + name_prefix="container.azm.ms/" + ## By default, telegraf will gather stats for all devices including + ## disk partitions. + ## Setting devices will restrict the stats to the specified devices. + devices = ["sd[a-z][0-9]"] + ## Uncomment the following line if you need disk serial numbers. + # skip_serial_number = false + # + ## On systems which support it, device metadata can be added in the form of + ## tags. + ## Currently only Linux is supported via udev properties. You can view + ## available properties for a device by running: + ## 'udevadm info -q property -n /dev/sda' + ## Note: Most, but not all, udev properties can be accessed this way. Properties + ## that are currently inaccessible include DEVTYPE, DEVNAME, and DEVPATH. + # device_tags = ["ID_FS_TYPE", "ID_FS_USAGE"] + # + ## Using the same metadata source as device_tags, you can also customize the + ## name of the device via templates. + ## The 'name_templates' parameter is a list of templates to try and apply to + ## the device. The template may contain variables in the form of '$PROPERTY' or + ## '${PROPERTY}'. The first template which does not contain any variables not + ## present for the device is used as the device name tag. + ## The typical use case is for LVM volumes, to get the VG/LV name instead of + ## the near-meaningless DM-0 name. 
+ # name_templates = ["$ID_FS_LABEL","$DM_VG_NAME/$DM_LV_NAME"] + fieldpass = ["reads", "read_bytes", "read_time", "writes", "write_bytes", "write_time", "io_time", "iops_in_progress"] + taginclude = ["name","hostName"] # Read metrics about network interface usage -#[[inputs.net]] +[[inputs.net]] + name_prefix="container.azm.ms/" ## By default, telegraf gathers stats from any up interface (excluding loopback) ## Setting interfaces will tell it to gather these explicit interfaces, ## regardless of status. @@ -423,11 +455,10 @@ ## On linux systems telegraf also collects protocol stats. ## Setting ignore_protocol_stats to true will skip reporting of protocol metrics. ## -# ignore_protocol_stats = true + ignore_protocol_stats = true ## - #fieldpass = ["bytes_recv", "bytes_sent", "err_in", "err_out"] - #fieldpass = ["err_in", "err_out"] - #taginclude = ["interface","nodeName"] + fieldpass = ["bytes_recv", "bytes_sent", "err_in", "err_out"] + taginclude = ["interface","hostName"] # Read metrics from the kubernetes kubelet api #[[inputs.kubernetes]] @@ -497,6 +528,45 @@ # fieldpass = ["n_containers", "n_containers_running", "n_containers_stopped", "n_containers_paused", "n_images"] #fieldpass = ["numContainers", "numContainersRunning", "numContainersStopped", "numContainersPaused", "numContainerImages"] # taginclude = ["nodeName"] +[[inputs.prometheus]] + name_prefix="container.azm.ms/" + ## An array of urls to scrape metrics from. + urls = ["http://$NODE_IP:10255/metrics"] + fieldpass = ["kubelet_docker_operations", "kubelet_docker_operations_errors"] + + metric_version = 2 + url_tag = "scrapeUrl" + + ## An array of Kubernetes services to scrape metrics from. + # kubernetes_services = ["http://my-service-dns.my-namespace:9100/metrics"] + + ## Kubernetes config file to create client from. 
+ # kube_config = "/path/to/kubernetes.config" + + ## Scrape Kubernetes pods for the following prometheus annotations: + ## - prometheus.io/scrape: Enable scraping for this pod + ## - prometheus.io/scheme: If the metrics endpoint is secured then you will need to + ## set this to `https` & most likely set the tls config. + ## - prometheus.io/path: If the metrics path is not /metrics, define it with this annotation. + ## - prometheus.io/port: If port is not 9102 use this annotation + # monitor_kubernetes_pods = true + + ## Use bearer token for authorization. ('bearer_token' takes priority) + bearer_token = "/var/run/secrets/kubernetes.io/serviceaccount/token" + ## OR + # bearer_token_string = "abc_123" + + ## Specify timeout duration for slower prometheus clients (default is 3s) + response_timeout = "15s" + + ## Optional TLS Config + tls_ca = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + #tls_cert = /path/to/certfile + # tls_key = /path/to/keyfile + ## Use TLS but skip chain & host verification + insecure_skip_verify = true + #tagexclude = ["AgentVersion","AKS_RESOURCE_ID","ACS_RESOURCE_NAME", "Region", "ClusterName", "ClusterType", "Computer", "ControllerType"] + [[inputs.exec]] ## Commands array interval = "15m" @@ -516,4 +586,13 @@ ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md data_format = "influx" tagexclude = ["hostName"] + [inputs.exec.tags] + AgentVersion = "$AGENT_VERSION" + AKS_RESOURCE_ID = "$TELEMETRY_AKS_RESOURCE_ID" + ACS_RESOURCE_NAME = "$TELEMETRY_ACS_RESOURCE_NAME" + Region = "$TELEMETRY_AKS_REGION" + ClusterName = "$TELEMETRY_CLUSTER_NAME" + ClusterType = "$TELEMETRY_CLUSTER_TYPE" + Computer = "placeholder_hostname" + ControllerType = "$CONTROLLER_TYPE" diff --git a/installer/datafiles/base_container.data b/installer/datafiles/base_container.data index 996c7501a..234785b64 100644 --- a/installer/datafiles/base_container.data +++ b/installer/datafiles/base_container.data @@ -97,8 +97,10 @@ MAINTAINER: 
'Microsoft Corporation' /opt/td-agent-bit/bin/out_oms.so; intermediate/${{BUILD_CONFIGURATION}}/out_oms.so; 755; root; root /etc/opt/microsoft/docker-cimprov/td-agent-bit.conf; installer/conf/td-agent-bit.conf; 644; root; root +/etc/opt/microsoft/docker-cimprov/td-agent-bit-rs.conf; installer/conf/td-agent-bit-rs.conf; 644; root; root /etc/opt/microsoft/docker-cimprov/out_oms.conf; installer/conf/out_oms.conf; 644; root; root /etc/opt/microsoft/docker-cimprov/telegraf.conf; installer/conf/telegraf.conf; 644; root; root +/etc/opt/microsoft/docker-cimprov/telegraf-rs.conf; installer/conf/telegraf-rs.conf; 644; root; root /opt/microsoft/docker-cimprov/bin/TelegrafTCPErrorTelemetry.sh; installer/scripts/TelegrafTCPErrorTelemetry.sh; 755; root; root %Links diff --git a/source/code/go/src/plugins/oms.go b/source/code/go/src/plugins/oms.go index 269d16111..166f427be 100644 --- a/source/code/go/src/plugins/oms.go +++ b/source/code/go/src/plugins/oms.go @@ -47,7 +47,7 @@ const TelegrafTagClusterID = "clusterId" // ContainerLogPluginConfFilePath --> config file path for container log plugin const DaemonSetContainerLogPluginConfFilePath = "/etc/opt/microsoft/docker-cimprov/out_oms.conf" -const ReplicaSetContainerLogPluginConfFilePath = "/etc/opt/microsoft/docker-cimprov/out_oms-rs.conf" +const ReplicaSetContainerLogPluginConfFilePath = "/etc/opt/microsoft/docker-cimprov/out_oms.conf" // IPName for Container Log const IPName = "Containers" @@ -680,6 +680,10 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { PluginConfiguration = pluginConfig CreateHTTPClient() - go updateKubeSystemContainerIDs() - go updateContainerImageNameMaps() + if strings.Compare(strings.ToLower(os.Getenv("CONTROLLER_TYPE")), "daemonset") == 0 { + go updateKubeSystemContainerIDs() + go updateContainerImageNameMaps() + } else { + Log("Running in replicaset. 
Disabling kube-system container cache collection & updates \n") + } } diff --git a/source/code/go/src/plugins/telemetry.go b/source/code/go/src/plugins/telemetry.go index f507e4ab9..1e3d73fcd 100644 --- a/source/code/go/src/plugins/telemetry.go +++ b/source/code/go/src/plugins/telemetry.go @@ -66,9 +66,7 @@ func SendContainerLogPluginMetrics(telemetryPushIntervalProperty string) { SendEvent(eventNameContainerLogInit, make(map[string]string)) for ; true; <-ContainerLogTelemetryTicker.C { - SendEvent(eventNameDaemonSetHeartbeat, make(map[string]string)) elapsed := time.Since(start) - ContainerLogTelemetryMutex.Lock() flushRate := FlushedRecordsCount / FlushedRecordsTimeTaken * 1000 logRate := FlushedRecordsCount / float64(elapsed/time.Second) @@ -84,13 +82,17 @@ func SendContainerLogPluginMetrics(telemetryPushIntervalProperty string) { AgentLogProcessingMaxLatencyMsContainer = "" ContainerLogTelemetryMutex.Unlock() - flushRateMetric := appinsights.NewMetricTelemetry(metricNameAvgFlushRate, flushRate) - TelemetryClient.Track(flushRateMetric) - logRateMetric := appinsights.NewMetricTelemetry(metricNameAvgLogGenerationRate, logRate) - TelemetryClient.Track(logRateMetric) - logLatencyMetric := appinsights.NewMetricTelemetry(metricNameAgentLogProcessingMaxLatencyMs, logLatencyMs) - logLatencyMetric.Properties["Container"] = logLatencyMsContainer - TelemetryClient.Track(logLatencyMetric) + if strings.Compare(strings.ToLower(os.Getenv("CONTROLLER_TYPE")), "daemonset") == 0 { + SendEvent(eventNameDaemonSetHeartbeat, make(map[string]string)) + flushRateMetric := appinsights.NewMetricTelemetry(metricNameAvgFlushRate, flushRate) + TelemetryClient.Track(flushRateMetric) + logRateMetric := appinsights.NewMetricTelemetry(metricNameAvgLogGenerationRate, logRate) + TelemetryClient.Track(logRateMetric) + logLatencyMetric := appinsights.NewMetricTelemetry(metricNameAgentLogProcessingMaxLatencyMs, logLatencyMs) + logLatencyMetric.Properties["Container"] = logLatencyMsContainer + 
TelemetryClient.Track(logLatencyMetric) + } + TelemetryClient.Track(appinsights.NewMetricTelemetry(metricNameNumberofTelegrafMetricsSentSuccessfully, telegrafMetricsSentCount)) TelemetryClient.Track(appinsights.NewMetricTelemetry(metricNameNumberofSendErrorsTelegrafMetrics, telegrafMetricsSendErrorCount)) start = time.Now() From afc66b7dcb2a3743bfb507f5a2cc8241d6b51e2b Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Thu, 6 Jun 2019 16:32:13 -0700 Subject: [PATCH 091/160] merge config/settings to ci_feature (#221) * updating fluentbit to use LOG_TAIL_PATH * changes * log exclusion pattern * changes * removing comments * adding enviornment varibale collection/disable * disable env var for cluster variable change * changes * toml parser changes * adding directory tomlrb * changes for container inventory * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * Telemetry for config overrides * add schema version telemetry * reduce the number of api calls for namespace filtering add more telemetry for config processing move liveness probe & parser to this repo * optimize for default kube-system namespace log collection exclusion --- installer/conf/out_oms.conf | 3 +- installer/conf/td-agent-bit.conf | 9 +- installer/datafiles/base_container.data | 13 + installer/scripts/livenessprobe.sh | 20 + installer/scripts/tomlparser.rb | 152 +++++ source/code/go/src/plugins/oms.go | 233 +++++--- source/code/go/src/plugins/out_oms.go | 2 +- .../code/plugin/CAdvisorMetricsAPIClient.rb | 17 + source/code/plugin/in_containerinventory.rb | 51 +- source/code/plugin/in_kube_podinventory.rb | 49 +- source/code/toml-parser/tomlrb.rb | 44 ++ .../toml-parser/tomlrb/generated_parser.rb | 542 ++++++++++++++++++ source/code/toml-parser/tomlrb/handler.rb | 73 +++ source/code/toml-parser/tomlrb/parser.rb | 18 + 
source/code/toml-parser/tomlrb/parser.y | 104 ++++ source/code/toml-parser/tomlrb/scanner.rb | 54 ++ .../code/toml-parser/tomlrb/string_utils.rb | 33 ++ source/code/toml-parser/tomlrb/version.rb | 3 + 18 files changed, 1288 insertions(+), 132 deletions(-) create mode 100644 installer/scripts/livenessprobe.sh create mode 100644 installer/scripts/tomlparser.rb create mode 100644 source/code/toml-parser/tomlrb.rb create mode 100644 source/code/toml-parser/tomlrb/generated_parser.rb create mode 100644 source/code/toml-parser/tomlrb/handler.rb create mode 100644 source/code/toml-parser/tomlrb/parser.rb create mode 100644 source/code/toml-parser/tomlrb/parser.y create mode 100644 source/code/toml-parser/tomlrb/scanner.rb create mode 100644 source/code/toml-parser/tomlrb/string_utils.rb create mode 100644 source/code/toml-parser/tomlrb/version.rb diff --git a/installer/conf/out_oms.conf b/installer/conf/out_oms.conf index d4b797757..d6679f982 100644 --- a/installer/conf/out_oms.conf +++ b/installer/conf/out_oms.conf @@ -3,4 +3,5 @@ cert_file_path=/etc/opt/microsoft/omsagent/certs/oms.crt key_file_path=/etc/opt/microsoft/omsagent/certs/oms.key container_host_file_path=/var/opt/microsoft/docker-cimprov/state/containerhostname container_inventory_refresh_interval=60 -kube_system_containers_refresh_interval=300 +#kube_system_containers_refresh_interval=300 +exclude_namespaces_containers_refresh_interval=60 diff --git a/installer/conf/td-agent-bit.conf b/installer/conf/td-agent-bit.conf index 50967e61f..d1a045063 100644 --- a/installer/conf/td-agent-bit.conf +++ b/installer/conf/td-agent-bit.conf @@ -7,13 +7,14 @@ [INPUT] Name tail Tag oms.container.log.* - Path /var/log/containers/*.log + Path ${AZMON_LOG_TAIL_PATH} DB /var/log/omsagent-fblogs.db Parser docker Mem_Buf_Limit 5m Path_Key filepath Skip_Long_Lines On Ignore_Older 5m + Exclude_Path ${AZMON_CLUSTER_LOG_TAIL_EXCLUDE_PATH} [INPUT] Name tail @@ -43,6 +44,12 @@ Chunk_Size 32 Buffer_Size 64 +# Enable/Disable stdout 
stderr logs using configmap +[FILTER] + Name grep + Match oms.container.log.* + Exclude stream ${AZMON_LOG_EXCLUSION_REGEX_PATTERN} + [OUTPUT] Name oms EnableTelemetry true diff --git a/installer/datafiles/base_container.data b/installer/datafiles/base_container.data index 234785b64..fd070426c 100644 --- a/installer/datafiles/base_container.data +++ b/installer/datafiles/base_container.data @@ -95,6 +95,15 @@ MAINTAINER: 'Microsoft Corporation' /opt/microsoft/omsagent/plugin/lib/application_insights/channel/event.rb; source/code/plugin/lib/application_insights/channel/event.rb; 644; root; root /opt/microsoft/omsagent/plugin/lib/application_insights.rb; source/code/plugin/lib/application_insights.rb; 644; root; root +/opt/tomlrb.rb; source/code/toml-parser/tomlrb.rb; 644; root; root +/opt/tomlrb/generated_parser.rb; source/code/toml-parser/tomlrb/generated_parser.rb; 644; root; root +/opt/tomlrb/handler.rb; source/code/toml-parser/tomlrb/handler.rb; 644; root; root +/opt/tomlrb/parser.rb; source/code/toml-parser/tomlrb/parser.rb; 644; root; root +/opt/tomlrb/parser.y; source/code/toml-parser/tomlrb/parser.y; 644; root; root +/opt/tomlrb/scanner.rb; source/code/toml-parser/tomlrb/scanner.rb; 644; root; root +/opt/tomlrb/string_utils.rb; source/code/toml-parser/tomlrb/string_utils.rb; 644; root; root +/opt/tomlrb/version.rb; source/code/toml-parser/tomlrb/version.rb; 644; root; root + /opt/td-agent-bit/bin/out_oms.so; intermediate/${{BUILD_CONFIGURATION}}/out_oms.so; 755; root; root /etc/opt/microsoft/docker-cimprov/td-agent-bit.conf; installer/conf/td-agent-bit.conf; 644; root; root /etc/opt/microsoft/docker-cimprov/td-agent-bit-rs.conf; installer/conf/td-agent-bit-rs.conf; 644; root; root @@ -102,6 +111,8 @@ MAINTAINER: 'Microsoft Corporation' /etc/opt/microsoft/docker-cimprov/telegraf.conf; installer/conf/telegraf.conf; 644; root; root /etc/opt/microsoft/docker-cimprov/telegraf-rs.conf; installer/conf/telegraf-rs.conf; 644; root; root 
/opt/microsoft/docker-cimprov/bin/TelegrafTCPErrorTelemetry.sh; installer/scripts/TelegrafTCPErrorTelemetry.sh; 755; root; root +/opt/livenessprobe.sh; installer/scripts/livenessprobe.sh; 755; root; root +/opt/tomlparser.rb; installer/scripts/tomlparser.rb 755; root; root %Links /opt/omi/lib/libcontainer.${{SHLIB_EXT}}; /opt/microsoft/docker-cimprov/lib/libcontainer.${{SHLIB_EXT}}; 644; root; root @@ -149,6 +160,8 @@ MAINTAINER: 'Microsoft Corporation' /opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts; 755; root; root; sysdir /opt/microsoft/omsagent/plugin/lib/application_insights/rack; 755; root; root; sysdir +/opt/tomlrb; 755; root; root; sysdir + %Dependencies %Postinstall_10 diff --git a/installer/scripts/livenessprobe.sh b/installer/scripts/livenessprobe.sh new file mode 100644 index 000000000..cb7e8a0ba --- /dev/null +++ b/installer/scripts/livenessprobe.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +#test to exit non zero value +(ps -ef | grep omsagent | grep -v "grep") && (ps -ef | grep td-agent-bit | grep -v "grep") +if [ $? -eq 0 ] && [ ! 
-s "inotifyoutput.txt" ] +then + # inotifyoutput file is empty and the grep commands for omsagent and td-agent-bit succeeded + exit 0 +else + if [ -s "inotifyoutput.txt" ] + then + # inotifyoutput file has data(config map was applied) + echo "config changed" > /dev/termination-log + exit 1 + else + # grep commands for omsagent or td-agent-bit failed + echo "agent or fluentbit not running" > /dev/termination-log + exit 1 + fi +fi \ No newline at end of file diff --git a/installer/scripts/tomlparser.rb b/installer/scripts/tomlparser.rb new file mode 100644 index 000000000..52516641a --- /dev/null +++ b/installer/scripts/tomlparser.rb @@ -0,0 +1,152 @@ +#!/usr/local/bin/ruby + +require_relative "tomlrb" + +@configMapMountPath = "/etc/config/settings/log-data-collection-settings" +@configVersion = "" +@configSchemaVersion = "" +# Setting default values which will be used in case they are not set in the configmap or if configmap doesnt exist +@collectStdoutLogs = true +@stdoutExcludeNamespaces = "kube-system" +@collectStderrLogs = true +@stderrExcludeNamespaces = "kube-system" +@collectClusterEnvVariables = true +@logTailPath = "/var/log/containers/*.log" +@logExclusionRegexPattern = "(^((?!stdout|stderr).)*$)" +@excludePath = "*.csv2" + +# Use parser to parse the configmap toml file to a ruby structure +def parseConfigMap + begin + # Check to see if config map is created + if (File.file?(@configMapMountPath)) + puts "config::configmap container-azm-ms-agentconfig for settings mounted, parsing values" + parsedConfig = Tomlrb.load_file(@configMapMountPath, symbolize_keys: true) + puts "config::Successfully parsed mounted config map" + return parsedConfig + else + puts "config::configmap container-azm-ms-agentconfig for settings not mounted, using defaults" + @excludePath = "*_kube-system_*.log" + return nil + end + rescue => errorStr + puts "config::error::Exception while parsing toml config file: #{errorStr}, using defaults" + @excludePath = "*_kube-system_*.log" + 
return nil + end +end + +# Use the ruby structure created after config parsing to set the right values to be used as environment variables +def populateSettingValuesFromConfigMap(parsedConfig) + if !parsedConfig.nil? && !parsedConfig[:log_collection_settings].nil? + #Get stdout log config settings + begin + if !parsedConfig[:log_collection_settings][:stdout].nil? && !parsedConfig[:log_collection_settings][:stdout][:enabled].nil? + @collectStdoutLogs = parsedConfig[:log_collection_settings][:stdout][:enabled] + puts "config::Using config map setting for stdout log collection" + stdoutNamespaces = parsedConfig[:log_collection_settings][:stdout][:exclude_namespaces] + + #Clearing it, so that it can be overridden with the config map settings + @stdoutExcludeNamespaces.clear + if @collectStdoutLogs && !stdoutNamespaces.nil? + if stdoutNamespaces.kind_of?(Array) + # Checking only for the first element to be string because toml enforces the arrays to contain elements of same type + if stdoutNamespaces.length > 0 && stdoutNamespaces[0].kind_of?(String) + #Empty the array to use the values from configmap + stdoutNamespaces.each do |namespace| + if @stdoutExcludeNamespaces.empty? + # To not append , for the first element + @stdoutExcludeNamespaces.concat(namespace) + else + @stdoutExcludeNamespaces.concat("," + namespace) + end + end + puts "config::Using config map setting for stdout log collection to exclude namespace" + end + end + end + end + rescue => errorStr + puts "config::error::Exception while reading config settings for stdout log collection - #{errorStr}, using defaults" + end + + #Get stderr log config settings + begin + if !parsedConfig[:log_collection_settings][:stderr].nil? && !parsedConfig[:log_collection_settings][:stderr][:enabled].nil? 
+ @collectStderrLogs = parsedConfig[:log_collection_settings][:stderr][:enabled] + puts "config::Using config map setting for stderr log collection" + stderrNamespaces = parsedConfig[:log_collection_settings][:stderr][:exclude_namespaces] + + #Clearing it, so that it can be overridden with the config map settings + @stderrExcludeNamespaces.clear + if @collectStderrLogs && !stderrNamespaces.nil? + if stderrNamespaces.kind_of?(Array) + # Checking only for the first element to be string because toml enforces the arrays to contain elements of same type + if stderrNamespaces.length > 0 && stderrNamespaces[0].kind_of?(String) + stderrNamespaces.each do |namespace| + if @stderrExcludeNamespaces.empty? + # To not append , for the first element + @stderrExcludeNamespaces.concat(namespace) + else + @stderrExcludeNamespaces.concat("," + namespace) + end + end + puts "config::Using config map setting for stderr log collection to exclude namespace" + end + end + end + end + rescue => errorStr + puts "config::error:Exception while reading config settings for stderr log collection - #{errorStr}, using defaults" + end + + #Get environment variables log config settings + begin + if !parsedConfig[:log_collection_settings][:env_var].nil? && !parsedConfig[:log_collection_settings][:env_var][:enabled].nil? + @collectClusterEnvVariables = parsedConfig[:log_collection_settings][:env_var][:enabled] + puts "config::Using config map setting for cluster level environment variable collection" + end + rescue => errorStr + puts "config::error::Exception while reading config settings for cluster level environment variable collection - #{errorStr}, using defaults" + end + end +end + + @configSchemaVersion = ENV['AZMON_AGENT_CFG_SCHEMA_VERSION'] + if !@configSchemaVersion.nil? && !@configSchemaVersion.empty? && @@configSchemaVersion.strip.casecmp('v1') == 0 #note v1 is the only supported schema version , so hardcoding it + configMapSettings = parseConfigMap + if !configMapSettings.nil? 
+ populateSettingValuesFromConfigMap(configMapSettings) + end + else + puts "config::unsupported config schema version - #{@configSchemaVersion}, using defaults" + @excludePath = "*_kube-system_*.log" + end + + # Write the settings to file, so that they can be set as environment variables + file = File.open("config_env_var", "w") + + if !file.nil? + # This will be used in td-agent-bit.conf file to filter out logs + if (!@collectStdoutLogs && !@collectStderrLogs) + #Stop log tailing completely + @logTailPath = "/opt/nolog*.log" + @logExclusionRegexPattern = "stdout|stderr" + elsif !@collectStdoutLogs + @logExclusionRegexPattern = "stdout" + elsif !@collectStderrLogs + @logExclusionRegexPattern = "stderr" + end + file.write("export AZMON_COLLECT_STDOUT_LOGS=#{@collectStdoutLogs}\n") + file.write("export AZMON_LOG_TAIL_PATH=#{@logTailPath}\n") + file.write("export AZMON_LOG_EXCLUSION_REGEX_PATTERN=\"#{@logExclusionRegexPattern}\"\n") + file.write("export AZMON_STDOUT_EXCLUDED_NAMESPACES=#{@stdoutExcludeNamespaces}\n") + file.write("export AZMON_COLLECT_STDERR_LOGS=#{@collectStderrLogs}\n") + file.write("export AZMON_STDERR_EXCLUDED_NAMESPACES=#{@stderrExcludeNamespaces}\n") + file.write("export AZMON_CLUSTER_COLLECT_ENV_VAR=#{@collectClusterEnvVariables}\n") + file.write("export AZMON_CLUSTER_LOG_TAIL_EXCLUDE_PATH=#{@excludePath}\n") + # Close file after writing all environment variables + file.close + else + puts "config::error::Exception while opening file for writing config environment variables" + end diff --git a/source/code/go/src/plugins/oms.go b/source/code/go/src/plugins/oms.go index 166f427be..0ffaaff63 100644 --- a/source/code/go/src/plugins/oms.go +++ b/source/code/go/src/plugins/oms.go @@ -17,6 +17,7 @@ import ( lumberjack "gopkg.in/natefinch/lumberjack.v2" + corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" @@ -36,12 +37,15 @@ const ResourceNameEnv = "ACS_RESOURCE_NAME" // 
Origin prefix for telegraf Metrics (used as prefix for origin field & prefix for azure monitor specific tags) const TelegrafMetricOriginPrefix = "container.azm.ms" + // Origin suffix for telegraf Metrics (used as suffix for origin field) const TelegrafMetricOriginSuffix = "telegraf" + // Namespace prefix for telegraf Metrics (used as prefix for Namespace field) //const TelegrafMetricNamespacePrefix = "plugin" // clusterName tag const TelegrafTagClusterName = "clusterName" + // clusterId tag const TelegrafTagClusterID = "clusterId" @@ -52,7 +56,9 @@ const ReplicaSetContainerLogPluginConfFilePath = "/etc/opt/microsoft/docker-cimp // IPName for Container Log const IPName = "Containers" const defaultContainerInventoryRefreshInterval = 60 -const defaultKubeSystemContainersRefreshInterval = 300 + +// const defaultKubeSystemContainersRefreshInterval = 300 +const defaultExcludeNamespacesContainersRefreshInterval = 300 var ( // PluginConfiguration the plugins configuration @@ -65,11 +71,11 @@ var ( Computer string // WorkspaceID log analytics workspace id WorkspaceID string - // ResourceID for resource-centric log analytics data + // ResourceID for resource-centric log analytics data ResourceID string // Resource-centric flag (will be true if we determine if above RseourceID is non-empty - default is false) ResourceCentric bool - //ResourceName + //ResourceName ResourceName string ) @@ -78,8 +84,10 @@ var ( ImageIDMap map[string]string // NameIDMap caches the container it to Name mapping NameIDMap map[string]string - // IgnoreIDSet set of container Ids of kube-system pods - IgnoreIDSet map[string]bool + // StdoutIgnoreIDSet set of container Ids of excluded namespaces for stdout logs + StdoutIgnoreIDSet map[string]bool + // StderrIgnoreIDSet set of container Ids of excluded namespaces for stderr logs + StderrIgnoreIDSet map[string]bool // DataUpdateMutex read and write mutex access to the container id set DataUpdateMutex = &sync.Mutex{} // ContainerLogTelemetryMutex read and 
write mutex access to the Container Log Telemetry @@ -89,8 +97,8 @@ var ( ) var ( - // KubeSystemContainersRefreshTicker updates the kube-system containers - KubeSystemContainersRefreshTicker *time.Ticker + // ExcludeNamespacesContainersRefreshTicker updates the excludenamespace containers + ExcludeNamespacesContainersRefreshTicker *time.Ticker // ContainerImageNameRefreshTicker updates the container image and names periodically ContainerImageNameRefreshTicker *time.Ticker ) @@ -118,21 +126,21 @@ type DataItem struct { // telegraf metric DataItem represents the object corresponding to the json that is sent by fluentbit tail plugin type laTelegrafMetric struct { // 'golden' fields - Origin string `json:"Origin"` - Namespace string `json:"Namespace"` - Name string `json:"Name"` - Value float64 `json:"Value"` - Tags string `json:"Tags"` + Origin string `json:"Origin"` + Namespace string `json:"Namespace"` + Name string `json:"Name"` + Value float64 `json:"Value"` + Tags string `json:"Tags"` // specific required fields for LA - CollectionTime string `json:"CollectionTime"` //mapped to TimeGenerated - Computer string `json:"Computer"` + CollectionTime string `json:"CollectionTime"` //mapped to TimeGenerated + Computer string `json:"Computer"` } // ContainerLogBlob represents the object corresponding to the payload that is sent to the ODS end point type InsightsMetricsBlob struct { - DataType string `json:"DataType"` - IPName string `json:"IPName"` - DataItems []laTelegrafMetric `json:"DataItems"` + DataType string `json:"DataType"` + IPName string `json:"IPName"` + DataItems []laTelegrafMetric `json:"DataItems"` } // ContainerLogBlob represents the object corresponding to the payload that is sent to the ODS end point @@ -187,7 +195,7 @@ func updateContainerImageNameMaps() { listOptions := metav1.ListOptions{} listOptions.FieldSelector = fmt.Sprintf("spec.nodeName=%s", Computer) pods, err := ClientSet.CoreV1().Pods("").List(listOptions) - + if err != nil { message := 
fmt.Sprintf("Error getting pods %s\nIt is ok to log here and continue, because the logs will be missing image and Name, but the logs will still have the containerID", err.Error()) Log(message) @@ -217,36 +225,71 @@ func updateContainerImageNameMaps() { } } -func updateKubeSystemContainerIDs() { - for ; true; <-KubeSystemContainersRefreshTicker.C { - if strings.Compare(os.Getenv("DISABLE_KUBE_SYSTEM_LOG_COLLECTION"), "true") != 0 { - Log("Kube System Log Collection is ENABLED.") - return +func excludeContainerIDPopulator(excludeNamespaceList []string, logStream string) { + var podsToExclude []*corev1.PodList + listOptions := metav1.ListOptions{} + listOptions.FieldSelector = fmt.Sprintf("spec.nodeName=%s", Computer) + + pods, err := ClientSet.CoreV1().Pods("").List(listOptions) + if err != nil { + message := fmt.Sprintf("Error getting pods %s - for node %s . All %s logs might be collected", err.Error(), Computer, logStream) + SendException(message) + Log(message) + return + } + + podsToExclude = append(podsToExclude, pods) + ignoreNamespaceSet := make(map[string]bool) + for _, ns := range excludeNamespaceList { + ignoreNamespaceSet[strings.TrimSpace(ns)] = true + } + + _ignoreIDSet := make(map[string]bool) + for _, pod := range podsToExclude { + for _, pod := range pod.Items { + _, ok := ignoreNamespaceSet[pod.Namespace] + if ok { + Log ("Adding pod %s in namespace %s to %s exclusion list", pod.Name, pod.Namespace, logStream) + for _, status := range pod.Status.ContainerStatuses { + lastSlashIndex := strings.LastIndex(status.ContainerID, "/") + _ignoreIDSet[status.ContainerID[lastSlashIndex+1:len(status.ContainerID)]] = true + } + } } + } - Log("Kube System Log Collection is DISABLED. 
Collecting containerIds to drop their records") + Log("Locking to update excluded container IDs for %s", logStream) + DataUpdateMutex.Lock() + if strings.Compare(logStream, "stdout") == 0 { + StdoutIgnoreIDSet = _ignoreIDSet + } else { + StderrIgnoreIDSet = _ignoreIDSet + } + DataUpdateMutex.Unlock() + Log("Unlocking after updating excluded container IDs for %s", logStream) +} - pods, err := ClientSet.CoreV1().Pods("kube-system").List(metav1.ListOptions{}) - if err != nil { - message := fmt.Sprintf("Error getting pods %s\nIt is ok to log here and continue. Kube-system logs will be collected", err.Error()) - SendException(message) - Log(message) - continue +func updateExcludeStdoutContainerIDs() { + for ; true; <-ExcludeNamespacesContainersRefreshTicker.C { + collectStdoutLogs := os.Getenv("AZMON_COLLECT_STDOUT_LOGS") + var stdoutNSExcludeList []string + excludeList := os.Getenv("AZMON_STDOUT_EXCLUDED_NAMESPACES") + if (strings.Compare(collectStdoutLogs, "true") == 0) && (len(excludeList) > 0) { + stdoutNSExcludeList = strings.Split(excludeList, ",") + excludeContainerIDPopulator(stdoutNSExcludeList, "stdout") } + } +} - _ignoreIDSet := make(map[string]bool) - for _, pod := range pods.Items { - for _, status := range pod.Status.ContainerStatuses { - lastSlashIndex := strings.LastIndex(status.ContainerID, "/") - _ignoreIDSet[status.ContainerID[lastSlashIndex+1:len(status.ContainerID)]] = true - } +func updateExcludeStderrContainerIDs() { + for ; true; <-ExcludeNamespacesContainersRefreshTicker.C { + collectStderrLogs := os.Getenv("AZMON_COLLECT_STDERR_LOGS") + var stderrNSExcludeList []string + excludeList := os.Getenv("AZMON_STDERR_EXCLUDED_NAMESPACES") + if (strings.Compare(collectStderrLogs, "true") == 0) && (len(excludeList) > 0) { + stderrNSExcludeList = strings.Split(excludeList, ",") + excludeContainerIDPopulator(stderrNSExcludeList, "stderr") } - - Log("Locking to update kube-system container IDs") - DataUpdateMutex.Lock() - IgnoreIDSet = _ignoreIDSet - 
DataUpdateMutex.Unlock() - Log("Unlocking after updating kube-system container IDs") } } @@ -265,24 +308,24 @@ func convert(in interface{}) (float64, bool) { } return float64(0), true default: - Log ("returning 0 for %v ", in) + Log("returning 0 for %v ", in) return float64(0), false } } //Translates telegraf time series to one or more Azure loganalytics metric(s) func translateTelegrafMetrics(m map[interface{}]interface{}) ([]*laTelegrafMetric, error) { - + var laMetrics []*laTelegrafMetric var tags map[interface{}]interface{} tags = m["tags"].(map[interface{}]interface{}) tagMap := make(map[string]string) for k, v := range tags { - key := fmt.Sprintf("%s",k) + key := fmt.Sprintf("%s", k) if key == "" { continue } - tagMap[key] = fmt.Sprintf("%s",v) + tagMap[key] = fmt.Sprintf("%s", v) } //add azure monitor tags @@ -305,14 +348,14 @@ func translateTelegrafMetrics(m map[interface{}]interface{}) ([]*laTelegrafMetri } i := m["timestamp"].(uint64) laMetric := laTelegrafMetric{ - Origin: fmt.Sprintf("%s/%s", TelegrafMetricOriginPrefix, TelegrafMetricOriginSuffix), + Origin: fmt.Sprintf("%s/%s", TelegrafMetricOriginPrefix, TelegrafMetricOriginSuffix), //Namespace: fmt.Sprintf("%s/%s", TelegrafMetricNamespacePrefix, m["name"]), - Namespace: fmt.Sprintf("%s", m["name"]), - Name: fmt.Sprintf("%s",k), - Value: fv, - Tags: fmt.Sprintf("%s", tagJson), - CollectionTime: time.Unix(int64(i),0).Format(time.RFC3339), - Computer: Computer, //this is the collection agent's computer name, not necessarily to which computer the metric applies to + Namespace: fmt.Sprintf("%s", m["name"]), + Name: fmt.Sprintf("%s", k), + Value: fv, + Tags: fmt.Sprintf("%s", tagJson), + CollectionTime: time.Unix(int64(i), 0).Format(time.RFC3339), + Computer: Computer, //this is the collection agent's computer name, not necessarily to which computer the metric applies to } //Log ("la metric:%v", laMetric) @@ -325,7 +368,7 @@ func translateTelegrafMetrics(m map[interface{}]interface{}) ([]*laTelegrafMetri 
func PostTelegrafMetricsToLA(telegrafRecords []map[interface{}]interface{}) int { var laMetrics []*laTelegrafMetric - if ( (telegrafRecords== nil) || ! (len(telegrafRecords) > 0) ) { + if (telegrafRecords == nil) || !(len(telegrafRecords) > 0) { Log("PostTelegrafMetricsToLA::Error:no timeseries to derive") return output.FLB_OK } @@ -340,7 +383,7 @@ func PostTelegrafMetricsToLA(telegrafRecords []map[interface{}]interface{}) int laMetrics = append(laMetrics, translatedMetrics...) } - if ( (laMetrics == nil) || !(len(laMetrics) > 0) ) { + if (laMetrics == nil) || !(len(laMetrics) > 0) { Log("PostTelegrafMetricsToLA::Info:no metrics derived from timeseries data") return output.FLB_OK } else { @@ -351,7 +394,7 @@ func PostTelegrafMetricsToLA(telegrafRecords []map[interface{}]interface{}) int var metrics []laTelegrafMetric var i int - for i=0; i < len(laMetrics); i++ { + for i = 0; i < len(laMetrics); i++ { metrics = append(metrics, *laMetrics[i]) } @@ -368,7 +411,7 @@ func PostTelegrafMetricsToLA(telegrafRecords []map[interface{}]interface{}) int SendException(message) return output.FLB_OK } - + //Post metrics data to LA req, _ := http.NewRequest("POST", OMSEndpoint, bytes.NewBuffer(jsonBytes)) @@ -376,7 +419,7 @@ func PostTelegrafMetricsToLA(telegrafRecords []map[interface{}]interface{}) int //set headers req.Header.Set("x-ms-date", time.Now().Format(time.RFC3339)) - + //expensive to do string len for every request, so use a flag if ResourceCentric == true { req.Header.Set("x-ms-AzureResourceId", ResourceID) @@ -420,20 +463,23 @@ func UpdateNumTelegrafMetricsSentTelemetry(numMetricsSent int, numSendErrors int // PostDataHelper sends data to the OMS endpoint func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { - start := time.Now() var dataItems []DataItem var maxLatency float64 var maxLatencyContainer string - ignoreIDSet := make(map[string]bool) + stdoutIgnoreIDSet := make(map[string]bool) + stderrIgnoreIDSet := make(map[string]bool) imageIDMap 
:= make(map[string]string) nameIDMap := make(map[string]string) DataUpdateMutex.Lock() - for k, v := range IgnoreIDSet { - ignoreIDSet[k] = v + for k, v := range StdoutIgnoreIDSet { + stdoutIgnoreIDSet[k] = v + } + for k, v := range StderrIgnoreIDSet { + stderrIgnoreIDSet[k] = v } for k, v := range ImageIDMap { imageIDMap[k] = v @@ -444,28 +490,34 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { DataUpdateMutex.Unlock() for _, record := range tailPluginRecords { - containerID := GetContainerIDFromFilePath(ToString(record["filepath"])) + logEntrySource := ToString(record["stream"]) - if containerID == "" || containsKey(ignoreIDSet, containerID) { - continue + if strings.EqualFold(logEntrySource, "stdout") { + if containerID == "" || containsKey(stdoutIgnoreIDSet, containerID) { + continue + } + } else if strings.EqualFold(logEntrySource, "stderr") { + if containerID == "" || containsKey(stderrIgnoreIDSet, containerID) { + continue + } } stringMap := make(map[string]string) stringMap["LogEntry"] = ToString(record["log"]) - stringMap["LogEntrySource"] = ToString(record["stream"]) + stringMap["LogEntrySource"] = logEntrySource stringMap["LogEntryTimeStamp"] = ToString(record["time"]) stringMap["SourceSystem"] = "Containers" stringMap["Id"] = containerID if val, ok := imageIDMap[containerID]; ok { stringMap["Image"] = val - } + } if val, ok := nameIDMap[containerID]; ok { stringMap["Name"] = val - } + } dataItem := DataItem{ ID: stringMap["Id"], @@ -534,7 +586,6 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { } defer resp.Body.Close() - numRecords := len(dataItems) Log("Successfully flushed %d records in %s", numRecords, elapsed) ContainerLogTelemetryMutex.Lock() @@ -573,7 +624,8 @@ func GetContainerIDFromFilePath(filepath string) string { // InitializePlugin reads and populates plugin configuration func InitializePlugin(pluginConfPath string, agentVersion string) { - IgnoreIDSet = make(map[string]bool) + 
StdoutIgnoreIDSet = make(map[string]bool) + StderrIgnoreIDSet = make(map[string]bool) ImageIDMap = make(map[string]string) NameIDMap = make(map[string]string) @@ -606,19 +658,19 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { splitted := strings.Split(ResourceID, "/") ResourceName = splitted[len(splitted)-1] Log("ResourceCentric: True") - Log("ResourceID=%s",ResourceID) - Log("ResourceName=%s",ResourceID) - } - + Log("ResourceID=%s", ResourceID) + Log("ResourceName=%s", ResourceID) + } + if ResourceCentric == false { //AKS-Engine/hybrid scenario ResourceName = os.Getenv(ResourceNameEnv) ResourceID = ResourceName Log("ResourceCentric: False") - Log("ResourceID=%s",ResourceID) - Log("ResourceName=%s",ResourceName) + Log("ResourceID=%s", ResourceID) + Log("ResourceName=%s", ResourceName) } - + // Initialize image,name map refresh ticker containerInventoryRefreshInterval, err := strconv.Atoi(pluginConfig["container_inventory_refresh_interval"]) if err != nil { @@ -631,17 +683,16 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { Log("containerInventoryRefreshInterval = %d \n", containerInventoryRefreshInterval) ContainerImageNameRefreshTicker = time.NewTicker(time.Second * time.Duration(containerInventoryRefreshInterval)) - // Initialize Kube System Refresh Ticker - kubeSystemContainersRefreshInterval, err := strconv.Atoi(pluginConfig["kube_system_containers_refresh_interval"]) + excludeNamespacesContainersRefreshInterval, err := strconv.Atoi(pluginConfig["exclude_namespaces_containers_refresh_interval"]) if err != nil { - message := fmt.Sprintf("Error Reading Kube System Container Ids Refresh Interval %s", err.Error()) + message := fmt.Sprintf("Error Reading exclude namespaces Container Ids Refresh Interval %s", err.Error()) Log(message) SendException(message) - Log("Using Default Refresh Interval of %d s\n", defaultKubeSystemContainersRefreshInterval) - kubeSystemContainersRefreshInterval = 
defaultKubeSystemContainersRefreshInterval + Log("Using Default Refresh Interval of %d s\n", defaultExcludeNamespacesContainersRefreshInterval) + excludeNamespacesContainersRefreshInterval = defaultExcludeNamespacesContainersRefreshInterval } - Log("kubeSystemContainersRefreshInterval = %d \n", kubeSystemContainersRefreshInterval) - KubeSystemContainersRefreshTicker = time.NewTicker(time.Second * time.Duration(kubeSystemContainersRefreshInterval)) + Log("excludeNamespacesContainersRefreshInterval = %d \n", excludeNamespacesContainersRefreshInterval) + ExcludeNamespacesContainersRefreshTicker = time.NewTicker(time.Second * time.Duration(excludeNamespacesContainersRefreshInterval)) // Populate Computer field containerHostName, err := ioutil.ReadFile(pluginConfig["container_host_file_path"]) @@ -680,10 +731,16 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { PluginConfiguration = pluginConfig CreateHTTPClient() - if strings.Compare(strings.ToLower(os.Getenv("CONTROLLER_TYPE")), "daemonset") == 0 { - go updateKubeSystemContainerIDs() - go updateContainerImageNameMaps() - } else { + + if strings.Compare(strings.ToLower(os.Getenv("CONTROLLER_TYPE")), "daemonset") == 0 { + defaultExcludePath := os.Getenv("AZMON_CLUSTER_LOG_TAIL_EXCLUDE_PATH") + //further optimization for clusters with default settings. need this cache only when log collection config is overridden with custom config + if ( (strings.Compare(defaultExcludePath, "*_kube-system_*.log") != 0) ) { + go updateExcludeStdoutContainerIDs() + go updateExcludeStderrContainerIDs() + } + go updateContainerImageNameMaps() + } else { Log("Running in replicaset. 
Disabling kube-system container cache collection & updates \n") } } diff --git a/source/code/go/src/plugins/out_oms.go b/source/code/go/src/plugins/out_oms.go index dccc6774c..2ee6f994d 100644 --- a/source/code/go/src/plugins/out_oms.go +++ b/source/code/go/src/plugins/out_oms.go @@ -74,7 +74,7 @@ func FLBPluginFlush(data unsafe.Pointer, length C.int, tag *C.char) int { // FLBPluginExit exits the plugin func FLBPluginExit() int { ContainerLogTelemetryTicker.Stop() - KubeSystemContainersRefreshTicker.Stop() + ExcludeNamespacesContainersRefreshTicker.Stop() ContainerImageNameRefreshTicker.Stop() return output.FLB_OK } diff --git a/source/code/plugin/CAdvisorMetricsAPIClient.rb b/source/code/plugin/CAdvisorMetricsAPIClient.rb index 35cf727cf..b842edb29 100644 --- a/source/code/plugin/CAdvisorMetricsAPIClient.rb +++ b/source/code/plugin/CAdvisorMetricsAPIClient.rb @@ -13,6 +13,13 @@ class CAdvisorMetricsAPIClient require_relative "KubernetesApiClient" require_relative "ApplicationInsightsUtility" + @configMapMountPath = "/etc/config/settings/log-data-collection-settings" + @clusterEnvVarCollectionEnabled = ENV["AZMON_CLUSTER_COLLECT_ENV_VAR"] + @clusterStdErrLogCollectionEnabled = ENV["AZMON_COLLECT_STDERR_LOGS"] + @clusterStdOutLogCollectionEnabled = ENV["AZMON_COLLECT_STDOUT_LOGS"] + @clusterLogTailExcludPath = ENV["AZMON_CLUSTER_LOG_TAIL_EXCLUDE_PATH"] + @clusterLogTailPath = ENV["AZMON_LOG_TAIL_PATH"] + @clusterAgentSchemaVersion = ENV["AZMON_AGENT_CFG_SCHEMA_VERSION"] @LogPath = "/var/opt/microsoft/docker-cimprov/log/kubernetes_perf_log.txt" @Log = Logger.new(@LogPath, 2, 10 * 1048576) #keep last 2 files, max log file size = 10M # @@rxBytesLast = nil @@ -192,6 +199,16 @@ def getContainerCpuMetricItems(metricJSON, hostName, cpuMetricNameToCollect, met telemetryProps["PodName"] = podName telemetryProps["ContainerName"] = containerName telemetryProps["Computer"] = hostName + #telemetry about custom log collections setting + if (File.file?(@configMapMountPath)) + 
telemetryProps["clustercustomsettings"] = true + telemetryProps["clusterenvvars"] = @clusterEnvVarCollectionEnabled + telemetryProps["clusterstderrlogs"] = @clusterStdErrLogCollectionEnabled + telemetryProps["clusterstdoutlogs"] = @clusterStdOutLogCollectionEnabled + telemetryProps["clusterlogtailexcludepath"] = @clusterLogTailExcludPath + telemetryProps["clusterLogTailPath"] = @clusterLogTailPath + telemetryProps["clusterAgentSchemaVersion"] = @clusterAgentSchemaVersion + end ApplicationInsightsUtility.sendMetricTelemetry(metricNametoReturn, metricValue, telemetryProps) end end diff --git a/source/code/plugin/in_containerinventory.rb b/source/code/plugin/in_containerinventory.rb index 4d83278a9..05e5bc9ea 100644 --- a/source/code/plugin/in_containerinventory.rb +++ b/source/code/plugin/in_containerinventory.rb @@ -47,29 +47,34 @@ def shutdown end end - def obtainContainerConfig(instance, container) + def obtainContainerConfig(instance, container, clusterCollectEnvironmentVar) begin configValue = container["Config"] if !configValue.nil? instance["ContainerHostname"] = configValue["Hostname"] - - envValue = configValue["Env"] - envValueString = (envValue.nil?) ? "" : envValue.to_s - # Skip environment variable processing if it contains the flag AZMON_COLLECT_ENV=FALSE - if /AZMON_COLLECT_ENV=FALSE/i.match(envValueString) - envValueString = ["AZMON_COLLECT_ENV=FALSE"] - $log.warn("Environment Variable collection for container: #{container["Id"]} skipped because AZMON_COLLECT_ENV is set to false") - end - # Restricting the ENV string value to 200kb since the size of this string can go very high - if envValueString.length > 200000 - envValueStringTruncated = envValueString.slice(0..200000) - lastIndex = envValueStringTruncated.rindex("\", ") - if !lastIndex.nil? 
- envValueStringTruncated = envValueStringTruncated.slice(0..lastIndex) + "]" - end - instance["EnvironmentVar"] = envValueStringTruncated + # Check to see if the environment variable collection is disabled at the cluster level - This disables env variable collection for all containers. + if !clusterCollectEnvironmentVar.nil? && !clusterCollectEnvironmentVar.empty? && clusterCollectEnvironmentVar.casecmp("false") == 0 + instance["EnvironmentVar"] = ["AZMON_CLUSTER_COLLECT_ENV_VAR=FALSE"] else - instance["EnvironmentVar"] = envValueString + envValue = configValue["Env"] + envValueString = (envValue.nil?) ? "" : envValue.to_s + # Skip environment variable processing if it contains the flag AZMON_COLLECT_ENV=FALSE + # Check to see if the environment variable collection is disabled for this container. + if /AZMON_COLLECT_ENV=FALSE/i.match(envValueString) + envValueString = ["AZMON_COLLECT_ENV=FALSE"] + $log.warn("Environment Variable collection for container: #{container["Id"]} skipped because AZMON_COLLECT_ENV is set to false") + end + # Restricting the ENV string value to 200kb since the size of this string can go very high + if envValueString.length > 200000 + envValueStringTruncated = envValueString.slice(0..200000) + lastIndex = envValueStringTruncated.rindex("\", ") + if !lastIndex.nil? 
+ envValueStringTruncated = envValueStringTruncated.slice(0..lastIndex) + "]" + end + instance["EnvironmentVar"] = envValueStringTruncated + else + instance["EnvironmentVar"] = envValueString + end end cmdValue = configValue["Cmd"] @@ -151,7 +156,7 @@ def obtainContainerHostConfig(instance, container) end end - def inspectContainer(id, nameMap) + def inspectContainer(id, nameMap, clusterCollectEnvironmentVar) containerInstance = {} begin container = DockerApiClient.dockerInspectContainer(id) @@ -173,7 +178,7 @@ def inspectContainer(id, nameMap) containerInstance["ImageTag"] = repoImageTagArray[2] end end - obtainContainerConfig(containerInstance, container) + obtainContainerConfig(containerInstance, container, clusterCollectEnvironmentVar) obtainContainerState(containerInstance, container) obtainContainerHostConfig(containerInstance, container) end @@ -195,9 +200,13 @@ def enumerate if !containerIds.empty? eventStream = MultiEventStream.new nameMap = DockerApiClient.getImageIdMap + clusterCollectEnvironmentVar = ENV['AZMON_CLUSTER_COLLECT_ENV_VAR'] + if !clusterCollectEnvironmentVar.nil? && !clusterCollectEnvironmentVar.empty? 
&& clusterCollectEnvironmentVar.casecmp("false") == 0 + $log.warn("Environment Variable collection disabled for cluster") + end containerIds.each do |containerId| inspectedContainer = {} - inspectedContainer = inspectContainer(containerId, nameMap) + inspectedContainer = inspectContainer(containerId, nameMap, clusterCollectEnvironmentVar) inspectedContainer["Computer"] = hostname inspectedContainer["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated containerInventory.push inspectedContainer diff --git a/source/code/plugin/in_kube_podinventory.rb b/source/code/plugin/in_kube_podinventory.rb index 65573673c..79490ba7d 100644 --- a/source/code/plugin/in_kube_podinventory.rb +++ b/source/code/plugin/in_kube_podinventory.rb @@ -133,32 +133,37 @@ def populateWindowsContainerInventoryRecord(container, record, containerEnvVaria end end - def getContainerEnvironmentVariables(pod) + def getContainerEnvironmentVariables(pod, clusterCollectEnvironmentVar) begin podSpec = pod["spec"] containerEnvHash = {} if !podSpec.nil? && !podSpec["containers"].nil? podSpec["containers"].each do |container| - envVarsArray = [] - containerEnvArray = container["env"] - # Parsing the environment variable array of hashes to a string value - # since that is format being sent by container inventory workflow in daemonset - # Keeping it in the same format because the workflow expects it in this format - # and the UX expects an array of string for environment variables - if !containerEnvArray.nil? && !containerEnvArray.empty? - containerEnvArray.each do |envVarHash| - envName = envVarHash["name"] - envValue = envVarHash["value"] - envArrayElement = envName + "=" + envValue - envVarsArray.push(envArrayElement) + if !clusterCollectEnvironmentVar.nil? && !clusterCollectEnvironmentVar.empty? 
&& clusterCollectEnvironmentVar.casecmp("false") == 0 + containerEnvHash[container["name"]] = ["AZMON_CLUSTER_COLLECT_ENV_VAR=FALSE"] + else + envVarsArray = [] + containerEnvArray = container["env"] + # Parsing the environment variable array of hashes to a string value + # since that is format being sent by container inventory workflow in daemonset + # Keeping it in the same format because the workflow expects it in this format + # and the UX expects an array of string for environment variables + if !containerEnvArray.nil? && !containerEnvArray.empty? + containerEnvArray.each do |envVarHash| + envName = envVarHash["name"] + envValue = envVarHash["value"] + envArrayElement = envName + "=" + envValue + envVarsArray.push(envArrayElement) + end end + # Skip environment variable processing if it contains the flag AZMON_COLLECT_ENV=FALSE + envValueString = envVarsArray.to_s + if /AZMON_COLLECT_ENV=FALSE/i.match(envValueString) + envValueString = ["AZMON_COLLECT_ENV=FALSE"] + $log.warn("Environment Variable collection for container: #{container["name"]} skipped because AZMON_COLLECT_ENV is set to false") + end + containerEnvHash[container["name"]] = envValueString end - # Skip environment variable processing if it contains the flag AZMON_COLLECT_ENV=FALSE - envValueString = envVarsArray.to_s - if /AZMON_COLLECT_ENV=FALSE/i.match(envValueString) - envValueString = ["AZMON_COLLECT_ENV=FALSE"] - end - containerEnvHash[container["name"]] = envValueString end end return containerEnvHash @@ -243,8 +248,12 @@ def parse_and_emit_records(podInventory, serviceList) # on windows nodes and parse environment variables for these containers if winNodes.length > 0 if (!record["Computer"].empty? && (winNodes.include? record["Computer"])) + clusterCollectEnvironmentVar = ENV["AZMON_CLUSTER_COLLECT_ENV_VAR"] + if !clusterCollectEnvironmentVar.nil? && !clusterCollectEnvironmentVar.empty? 
&& clusterCollectEnvironmentVar.casecmp("false") == 0 + $log.warn("WindowsContainerInventory: Environment Variable collection disabled for cluster") + end sendWindowsContainerInventoryRecord = true - containerEnvVariableHash = getContainerEnvironmentVariables(items) + containerEnvVariableHash = getContainerEnvironmentVariables(items, clusterCollectEnvironmentVar) end end diff --git a/source/code/toml-parser/tomlrb.rb b/source/code/toml-parser/tomlrb.rb new file mode 100644 index 000000000..c0eff9093 --- /dev/null +++ b/source/code/toml-parser/tomlrb.rb @@ -0,0 +1,44 @@ +require "time" +require "stringio" +require_relative "tomlrb/version" +require_relative "tomlrb/string_utils" +require_relative "tomlrb/scanner" +require_relative "tomlrb/parser" +require_relative "tomlrb/handler" + +module Tomlrb + class ParseError < StandardError; end + + # Parses a valid TOML string into its Ruby data structure + # + # @param string_or_io [String, StringIO] the content + # @param options [Hash] the options hash + # @option options [Boolean] :symbolize_keys (false) whether to return the keys as symbols or strings + # @return [Hash] the Ruby data structure represented by the input + def self.parse(string_or_io, **options) + io = string_or_io.is_a?(String) ? StringIO.new(string_or_io) : string_or_io + scanner = Scanner.new(io) + parser = Parser.new(scanner, options) + begin + handler = parser.parse + rescue Racc::ParseError => e + raise ParseError, e.message + end + + handler.output + end + + # Reads a file content and parses it into its Ruby data structure + # + # @param path [String] the path to the file + # @param options [Hash] the options hash + # @option options [Boolean] :symbolize_keys (false) whether to return the keys as symbols or strings + # @return [Hash] the Ruby data structure represented by the input + def self.load_file(path, **options) + # By default Ruby sets the external encoding of an IO object to the + # default external encoding. 
The default external encoding is set by + # locale encoding or the interpreter -E option. + tmp = File.read(path, :encoding => "utf-8") + Tomlrb.parse(tmp, options) + end +end diff --git a/source/code/toml-parser/tomlrb/generated_parser.rb b/source/code/toml-parser/tomlrb/generated_parser.rb new file mode 100644 index 000000000..ebf815e7d --- /dev/null +++ b/source/code/toml-parser/tomlrb/generated_parser.rb @@ -0,0 +1,542 @@ +# +# DO NOT MODIFY!!!! +# This file is automatically generated by Racc 1.4.14 +# from Racc grammer file "". +# + +require 'racc/parser.rb' +module Tomlrb + class GeneratedParser < Racc::Parser +##### State transition tables begin ### + +racc_action_table = [ + 2, 17, 11, 31, 12, 31, 13, 27, 14, 77, + 15, 16, 8, 78, 32, 10, 33, 29, 34, 29, + 57, 58, 59, 60, 56, 53, 52, 54, 55, 46, + 40, 41, 10, 57, 58, 59, 60, 56, 53, 52, + 54, 55, 46, 69, 70, 10, 57, 58, 59, 60, + 56, 53, 52, 54, 55, 46, 35, 36, 10, 57, + 58, 59, 60, 56, 53, 52, 54, 55, 46, 37, + 38, 10, 57, 58, 59, 60, 56, 53, 52, 54, + 55, 46, 43, 66, 10, 57, 58, 59, 60, 56, + 53, 52, 54, 55, 46, nil, nil, 10, 57, 58, + 59, 60, 56, 53, 52, 54, 55, 46, nil, nil, + 10, 57, 58, 59, 60, 56, 53, 52, 54, 55, + 46, 73, nil, 10, 57, 58, 59, 60, 56, 53, + 52, 54, 55, 46, 73, 21, 10, 22, nil, 23, + nil, 24, nil, 25, 26, 21, 19, 22, nil, 23, + nil, 24, nil, 25, 26, nil, 19 ] + +racc_action_check = [ + 1, 2, 1, 9, 1, 70, 1, 8, 1, 74, + 1, 1, 1, 74, 11, 1, 12, 9, 13, 70, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 20, 20, 32, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 42, 42, 33, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 14, 15, 34, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 16, + 19, 35, 36, 36, 36, 36, 36, 36, 36, 36, + 36, 36, 30, 40, 36, 37, 37, 37, 37, 37, + 37, 37, 37, 37, 37, nil, nil, 37, 43, 43, + 43, 43, 43, 43, 43, 43, 43, 43, nil, nil, + 43, 45, 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, nil, 45, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 7, 78, 7, nil, 7, + nil, 7, nil, 7, 7, 41, 7, 41, 
nil, 41, + nil, 41, nil, 41, 41, nil, 41 ] + +racc_action_pointer = [ + nil, 0, 1, nil, nil, nil, nil, 133, -5, 1, + nil, -4, -2, 0, 38, 39, 51, nil, nil, 57, + 17, nil, nil, nil, nil, nil, nil, nil, nil, nil, + 64, nil, 17, 30, 43, 56, 69, 82, nil, nil, + 70, 143, 27, 95, nil, 108, nil, nil, nil, nil, + nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, + nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, + 3, nil, nil, nil, -4, nil, nil, nil, 121, nil ] + +racc_action_default = [ + -1, -56, -56, -2, -3, -4, -5, -56, -8, -56, + -22, -56, -56, -56, -56, -56, -56, 80, -6, -10, + -56, -15, -16, -17, -18, -19, -20, -7, -21, -23, + -56, -27, -46, -46, -46, -46, -46, -46, -9, -11, + -13, -56, -56, -46, -29, -46, -40, -41, -42, -43, + -44, -45, -47, -48, -49, -50, -51, -52, -53, -54, + -55, -30, -31, -32, -33, -34, -12, -14, -24, -25, + -56, -28, -35, -36, -56, -26, -37, -38, -46, -39 ] + +racc_goto_table = [ + 28, 18, 1, 72, 44, 61, 62, 63, 64, 65, + 3, 4, 5, 6, 7, 71, 39, 42, 68, 76, + nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, + nil, nil, nil, nil, nil, 67, 79, nil, nil, nil, + nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, + nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, + nil, 75 ] + +racc_goto_check = [ + 11, 7, 1, 18, 15, 15, 15, 15, 15, 15, + 2, 3, 4, 5, 6, 15, 9, 13, 14, 19, + nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, + nil, nil, nil, nil, nil, 7, 18, nil, nil, nil, + nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, + nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, + nil, 11 ] + +racc_goto_pointer = [ + nil, 2, 9, 10, 11, 12, 13, -6, nil, -4, + nil, -9, nil, -13, -24, -28, nil, nil, -42, -55, + nil, nil, nil ] + +racc_goto_default = [ + nil, nil, nil, nil, nil, 49, nil, nil, 20, nil, + 9, nil, 30, nil, nil, 74, 48, 45, nil, nil, + 47, 50, 51 ] + +racc_reduce_table = [ + 0, 0, :racc_error, + 0, 20, :_reduce_none, + 2, 20, :_reduce_none, + 1, 21, :_reduce_none, + 1, 21, :_reduce_none, + 1, 21, :_reduce_none, + 2, 22, :_reduce_none, + 2, 25, 
:_reduce_7, + 1, 25, :_reduce_8, + 2, 26, :_reduce_9, + 1, 26, :_reduce_10, + 2, 26, :_reduce_none, + 2, 28, :_reduce_12, + 1, 28, :_reduce_13, + 2, 28, :_reduce_none, + 1, 27, :_reduce_15, + 1, 27, :_reduce_16, + 1, 27, :_reduce_17, + 1, 27, :_reduce_18, + 1, 27, :_reduce_19, + 1, 27, :_reduce_20, + 2, 24, :_reduce_none, + 1, 29, :_reduce_22, + 1, 30, :_reduce_23, + 3, 30, :_reduce_none, + 1, 33, :_reduce_25, + 2, 33, :_reduce_none, + 1, 31, :_reduce_27, + 2, 32, :_reduce_none, + 3, 23, :_reduce_29, + 3, 23, :_reduce_30, + 3, 23, :_reduce_31, + 3, 23, :_reduce_32, + 3, 23, :_reduce_33, + 3, 23, :_reduce_34, + 2, 35, :_reduce_none, + 1, 37, :_reduce_36, + 2, 37, :_reduce_none, + 1, 38, :_reduce_38, + 2, 38, :_reduce_none, + 1, 36, :_reduce_40, + 1, 34, :_reduce_41, + 1, 34, :_reduce_none, + 1, 34, :_reduce_none, + 1, 39, :_reduce_none, + 1, 39, :_reduce_none, + 0, 41, :_reduce_none, + 1, 41, :_reduce_47, + 1, 41, :_reduce_48, + 1, 41, :_reduce_49, + 1, 41, :_reduce_50, + 1, 41, :_reduce_51, + 1, 40, :_reduce_52, + 1, 40, :_reduce_53, + 1, 40, :_reduce_54, + 1, 40, :_reduce_55 ] + +racc_reduce_n = 56 + +racc_shift_n = 80 + +racc_token_table = { + false => 0, + :error => 1, + :IDENTIFIER => 2, + :STRING_MULTI => 3, + :STRING_BASIC => 4, + :STRING_LITERAL_MULTI => 5, + :STRING_LITERAL => 6, + :DATETIME => 7, + :INTEGER => 8, + :FLOAT => 9, + :TRUE => 10, + :FALSE => 11, + "[" => 12, + "]" => 13, + "." 
=> 14, + "{" => 15, + "}" => 16, + "," => 17, + "=" => 18 } + +racc_nt_base = 19 + +racc_use_result_var = true + +Racc_arg = [ + racc_action_table, + racc_action_check, + racc_action_default, + racc_action_pointer, + racc_goto_table, + racc_goto_check, + racc_goto_default, + racc_goto_pointer, + racc_nt_base, + racc_reduce_table, + racc_token_table, + racc_shift_n, + racc_reduce_n, + racc_use_result_var ] + +Racc_token_to_s_table = [ + "$end", + "error", + "IDENTIFIER", + "STRING_MULTI", + "STRING_BASIC", + "STRING_LITERAL_MULTI", + "STRING_LITERAL", + "DATETIME", + "INTEGER", + "FLOAT", + "TRUE", + "FALSE", + "\"[\"", + "\"]\"", + "\".\"", + "\"{\"", + "\"}\"", + "\",\"", + "\"=\"", + "$start", + "expressions", + "expression", + "table", + "assignment", + "inline_table", + "table_start", + "table_continued", + "table_identifier", + "table_next", + "inline_table_start", + "inline_continued", + "inline_assignment_key", + "inline_assignment_value", + "inline_next", + "value", + "array", + "start_array", + "array_continued", + "array_next", + "scalar", + "string", + "literal" ] + +Racc_debug_parser = false + +##### State transition tables end ##### + +# reduce 0 omitted + +# reduce 1 omitted + +# reduce 2 omitted + +# reduce 3 omitted + +# reduce 4 omitted + +# reduce 5 omitted + +# reduce 6 omitted + +module_eval(<<'.,.,', 'parser.y', 15) + def _reduce_7(val, _values, result) + @handler.start_(:array_of_tables) + result + end +.,., + +module_eval(<<'.,.,', 'parser.y', 16) + def _reduce_8(val, _values, result) + @handler.start_(:table) + result + end +.,., + +module_eval(<<'.,.,', 'parser.y', 19) + def _reduce_9(val, _values, result) + array = @handler.end_(:array_of_tables); @handler.set_context(array, is_array_of_tables: true) + result + end +.,., + +module_eval(<<'.,.,', 'parser.y', 20) + def _reduce_10(val, _values, result) + array = @handler.end_(:table); @handler.set_context(array) + result + end +.,., + +# reduce 11 omitted + +module_eval(<<'.,.,', 'parser.y', 
24) + def _reduce_12(val, _values, result) + array = @handler.end_(:array_of_tables); @handler.set_context(array, is_array_of_tables: true) + result + end +.,., + +module_eval(<<'.,.,', 'parser.y', 25) + def _reduce_13(val, _values, result) + array = @handler.end_(:table); @handler.set_context(array) + result + end +.,., + +# reduce 14 omitted + +module_eval(<<'.,.,', 'parser.y', 29) + def _reduce_15(val, _values, result) + @handler.push(val[0]) + result + end +.,., + +module_eval(<<'.,.,', 'parser.y', 30) + def _reduce_16(val, _values, result) + @handler.push(val[0]) + result + end +.,., + +module_eval(<<'.,.,', 'parser.y', 31) + def _reduce_17(val, _values, result) + @handler.push(val[0]) + result + end +.,., + +module_eval(<<'.,.,', 'parser.y', 32) + def _reduce_18(val, _values, result) + @handler.push(val[0]) + result + end +.,., + +module_eval(<<'.,.,', 'parser.y', 33) + def _reduce_19(val, _values, result) + @handler.push(val[0]) + result + end +.,., + +module_eval(<<'.,.,', 'parser.y', 34) + def _reduce_20(val, _values, result) + @handler.push(val[0]) + result + end +.,., + +# reduce 21 omitted + +module_eval(<<'.,.,', 'parser.y', 40) + def _reduce_22(val, _values, result) + @handler.start_(:inline) + result + end +.,., + +module_eval(<<'.,.,', 'parser.y', 43) + def _reduce_23(val, _values, result) + array = @handler.end_(:inline); @handler.push(Hash[*array]) + result + end +.,., + +# reduce 24 omitted + +module_eval(<<'.,.,', 'parser.y', 48) + def _reduce_25(val, _values, result) + array = @handler.end_(:inline) + array.map!.with_index{ |n,i| i.even? ? 
n.to_sym : n } if @handler.symbolize_keys + @handler.push(Hash[*array]) + + result + end +.,., + +# reduce 26 omitted + +module_eval(<<'.,.,', 'parser.y', 55) + def _reduce_27(val, _values, result) + @handler.push(val[0]) + result + end +.,., + +# reduce 28 omitted + +module_eval(<<'.,.,', 'parser.y', 61) + def _reduce_29(val, _values, result) + @handler.assign(val[0]) + result + end +.,., + +module_eval(<<'.,.,', 'parser.y', 62) + def _reduce_30(val, _values, result) + @handler.assign(val[0]) + result + end +.,., + +module_eval(<<'.,.,', 'parser.y', 63) + def _reduce_31(val, _values, result) + @handler.assign(val[0]) + result + end +.,., + +module_eval(<<'.,.,', 'parser.y', 64) + def _reduce_32(val, _values, result) + @handler.assign(val[0]) + result + end +.,., + +module_eval(<<'.,.,', 'parser.y', 65) + def _reduce_33(val, _values, result) + @handler.assign(val[0]) + result + end +.,., + +module_eval(<<'.,.,', 'parser.y', 66) + def _reduce_34(val, _values, result) + @handler.assign(val[0]) + result + end +.,., + +# reduce 35 omitted + +module_eval(<<'.,.,', 'parser.y', 72) + def _reduce_36(val, _values, result) + array = @handler.end_(:array); @handler.push(array) + result + end +.,., + +# reduce 37 omitted + +module_eval(<<'.,.,', 'parser.y', 76) + def _reduce_38(val, _values, result) + array = @handler.end_(:array); @handler.push(array) + result + end +.,., + +# reduce 39 omitted + +module_eval(<<'.,.,', 'parser.y', 80) + def _reduce_40(val, _values, result) + @handler.start_(:array) + result + end +.,., + +module_eval(<<'.,.,', 'parser.y', 83) + def _reduce_41(val, _values, result) + @handler.push(val[0]) + result + end +.,., + +# reduce 42 omitted + +# reduce 43 omitted + +# reduce 44 omitted + +# reduce 45 omitted + +# reduce 46 omitted + +module_eval(<<'.,.,', 'parser.y', 92) + def _reduce_47(val, _values, result) + result = val[0].to_f + result + end +.,., + +module_eval(<<'.,.,', 'parser.y', 93) + def _reduce_48(val, _values, result) + result = 
val[0].to_i + result + end +.,., + +module_eval(<<'.,.,', 'parser.y', 94) + def _reduce_49(val, _values, result) + result = true + result + end +.,., + +module_eval(<<'.,.,', 'parser.y', 95) + def _reduce_50(val, _values, result) + result = false + result + end +.,., + +module_eval(<<'.,.,', 'parser.y', 96) + def _reduce_51(val, _values, result) + result = Time.new(*val[0]) + result + end +.,., + +module_eval(<<'.,.,', 'parser.y', 99) + def _reduce_52(val, _values, result) + result = StringUtils.replace_escaped_chars(StringUtils.multiline_replacements(val[0])) + result + end +.,., + +module_eval(<<'.,.,', 'parser.y', 100) + def _reduce_53(val, _values, result) + result = StringUtils.replace_escaped_chars(val[0]) + result + end +.,., + +module_eval(<<'.,.,', 'parser.y', 101) + def _reduce_54(val, _values, result) + result = StringUtils.strip_spaces(val[0]) + result + end +.,., + +module_eval(<<'.,.,', 'parser.y', 102) + def _reduce_55(val, _values, result) + result = val[0] + result + end +.,., + +def _reduce_none(val, _values, result) + val[0] +end + + end # class GeneratedParser + end # module Tomlrb diff --git a/source/code/toml-parser/tomlrb/handler.rb b/source/code/toml-parser/tomlrb/handler.rb new file mode 100644 index 000000000..d60b54bc3 --- /dev/null +++ b/source/code/toml-parser/tomlrb/handler.rb @@ -0,0 +1,73 @@ +module Tomlrb + class Handler + attr_reader :output, :symbolize_keys + + def initialize(**options) + @output = {} + @current = @output + @stack = [] + @array_names = [] + @symbolize_keys = options[:symbolize_keys] + end + + def set_context(identifiers, is_array_of_tables: false) + @current = @output + + deal_with_array_of_tables(identifiers, is_array_of_tables) do |identifierz| + identifierz.each do |k| + k = k.to_sym if @symbolize_keys + if @current[k].is_a?(Array) + @current[k] << {} if @current[k].empty? 
+ @current = @current[k].last + else + @current[k] ||= {} + @current = @current[k] + end + end + end + end + + def deal_with_array_of_tables(identifiers, is_array_of_tables) + identifiers.map!{|n| n.gsub("\"", '')} + stringified_identifier = identifiers.join('.') + + if is_array_of_tables + @array_names << stringified_identifier + last_identifier = identifiers.pop + elsif @array_names.include?(stringified_identifier) + raise ParseError, 'Cannot define a normal table with the same name as an already established array' + end + + yield(identifiers) + + if is_array_of_tables + last_identifier = last_identifier.to_sym if @symbolize_keys + @current[last_identifier] ||= [] + @current[last_identifier] << {} + @current = @current[last_identifier].last + end + end + + def assign(k) + k = k.to_sym if @symbolize_keys + @current[k] = @stack.pop + end + + def push(o) + @stack << o + end + + def start_(type) + push([type]) + end + + def end_(type) + array = [] + while (value = @stack.pop) != [type] + raise ParseError, 'Unclosed table' unless value + array.unshift(value) + end + array + end + end +end diff --git a/source/code/toml-parser/tomlrb/parser.rb b/source/code/toml-parser/tomlrb/parser.rb new file mode 100644 index 000000000..31771a1ca --- /dev/null +++ b/source/code/toml-parser/tomlrb/parser.rb @@ -0,0 +1,18 @@ +require_relative "generated_parser" + +class Tomlrb::Parser < Tomlrb::GeneratedParser + def initialize(tokenizer, **options) + @tokenizer = tokenizer + @handler = Tomlrb::Handler.new(options) + super() + end + + def next_token + @tokenizer.next_token + end + + def parse + do_parse + @handler + end +end diff --git a/source/code/toml-parser/tomlrb/parser.y b/source/code/toml-parser/tomlrb/parser.y new file mode 100644 index 000000000..fcebcac06 --- /dev/null +++ b/source/code/toml-parser/tomlrb/parser.y @@ -0,0 +1,104 @@ +class Tomlrb::GeneratedParser +token IDENTIFIER STRING_MULTI STRING_BASIC STRING_LITERAL_MULTI STRING_LITERAL DATETIME INTEGER FLOAT TRUE FALSE 
+rule + expressions + | expressions expression + ; + expression + : table + | assignment + | inline_table + ; + table + : table_start table_continued + ; + table_start + : '[' '[' { @handler.start_(:array_of_tables) } + | '[' { @handler.start_(:table) } + ; + table_continued + : ']' ']' { array = @handler.end_(:array_of_tables); @handler.set_context(array, is_array_of_tables: true) } + | ']' { array = @handler.end_(:table); @handler.set_context(array) } + | table_identifier table_next + ; + table_next + : ']' ']' { array = @handler.end_(:array_of_tables); @handler.set_context(array, is_array_of_tables: true) } + | ']' { array = @handler.end_(:table); @handler.set_context(array) } + | '.' table_continued + ; + table_identifier + : IDENTIFIER { @handler.push(val[0]) } + | STRING_BASIC { @handler.push(val[0]) } + | STRING_LITERAL { @handler.push(val[0]) } + | INTEGER { @handler.push(val[0]) } + | TRUE { @handler.push(val[0]) } + | FALSE { @handler.push(val[0]) } + ; + inline_table + : inline_table_start inline_continued + ; + inline_table_start + : '{' { @handler.start_(:inline) } + ; + inline_continued + : '}' { array = @handler.end_(:inline); @handler.push(Hash[*array]) } + | inline_assignment_key inline_assignment_value inline_next + ; + inline_next + : '}' { + array = @handler.end_(:inline) + array.map!.with_index{ |n,i| i.even? ? 
n.to_sym : n } if @handler.symbolize_keys + @handler.push(Hash[*array]) + } + | ',' inline_continued + ; + inline_assignment_key + : IDENTIFIER { @handler.push(val[0]) } + ; + inline_assignment_value + : '=' value + ; + assignment + : IDENTIFIER '=' value { @handler.assign(val[0]) } + | STRING_BASIC '=' value { @handler.assign(val[0]) } + | STRING_LITERAL '=' value { @handler.assign(val[0]) } + | INTEGER '=' value { @handler.assign(val[0]) } + | TRUE '=' value { @handler.assign(val[0]) } + | FALSE '=' value { @handler.assign(val[0]) } + ; + array + : start_array array_continued + ; + array_continued + : ']' { array = @handler.end_(:array); @handler.push(array) } + | value array_next + ; + array_next + : ']' { array = @handler.end_(:array); @handler.push(array) } + | ',' array_continued + ; + start_array + : '[' { @handler.start_(:array) } + ; + value + : scalar { @handler.push(val[0]) } + | array + | inline_table + ; + scalar + : string + | literal + ; + literal + | FLOAT { result = val[0].to_f } + | INTEGER { result = val[0].to_i } + | TRUE { result = true } + | FALSE { result = false } + | DATETIME { result = Time.new(*val[0])} + ; + string + : STRING_MULTI { result = StringUtils.replace_escaped_chars(StringUtils.multiline_replacements(val[0])) } + | STRING_BASIC { result = StringUtils.replace_escaped_chars(val[0]) } + | STRING_LITERAL_MULTI { result = StringUtils.strip_spaces(val[0]) } + | STRING_LITERAL { result = val[0] } + ; diff --git a/source/code/toml-parser/tomlrb/scanner.rb b/source/code/toml-parser/tomlrb/scanner.rb new file mode 100644 index 000000000..d0f479eef --- /dev/null +++ b/source/code/toml-parser/tomlrb/scanner.rb @@ -0,0 +1,54 @@ +require 'strscan' + +module Tomlrb + class Scanner + COMMENT = /#.*/ + IDENTIFIER = /[A-Za-z0-9_-]+/ + SPACE = /[ \t\r\n]/ + STRING_BASIC = /(["])(?:\\?.)*?\1/ + STRING_MULTI = /"{3}([\s\S]*?"{3,4})/m + STRING_LITERAL = /(['])(?:\\?.)*?\1/ + STRING_LITERAL_MULTI = /'{3}([\s\S]*?'{3})/m + DATETIME = 
/(-?\d{4})-(\d{2})-(\d{2})(?:(?:t|\s)(\d{2}):(\d{2}):(\d{2}(?:\.\d+)?))?(z|[-+]\d{2}:\d{2})?/i + FLOAT = /[+-]?(?:[0-9_]+\.[0-9_]*|\d+(?=[eE]))(?:[eE][+-]?[0-9_]+)?/ + INTEGER = /[+-]?([1-9](_?\d)*|0)(?![A-Za-z0-9_-]+)/ + TRUE = /true/ + FALSE = /false/ + + def initialize(io) + @ss = StringScanner.new(io.read) + end + + def next_token + return if @ss.eos? + + case + when @ss.scan(SPACE) then next_token + when @ss.scan(COMMENT) then next_token + when @ss.scan(DATETIME) then process_datetime + when text = @ss.scan(STRING_MULTI) then [:STRING_MULTI, text[3..-4]] + when text = @ss.scan(STRING_BASIC) then [:STRING_BASIC, text[1..-2]] + when text = @ss.scan(STRING_LITERAL_MULTI) then [:STRING_LITERAL_MULTI, text[3..-4]] + when text = @ss.scan(STRING_LITERAL) then [:STRING_LITERAL, text[1..-2]] + when text = @ss.scan(FLOAT) then [:FLOAT, text] + when text = @ss.scan(INTEGER) then [:INTEGER, text] + when text = @ss.scan(TRUE) then [:TRUE, text] + when text = @ss.scan(FALSE) then [:FALSE, text] + when text = @ss.scan(IDENTIFIER) then [:IDENTIFIER, text] + else + x = @ss.getch + [x, x] + end + end + + def process_datetime + if @ss[7].nil? 
+ offset = '+00:00' + else + offset = @ss[7].gsub('Z', '+00:00') + end + args = [@ss[1], @ss[2], @ss[3], @ss[4] || 0, @ss[5] || 0, @ss[6].to_f, offset] + [:DATETIME, args] + end + end +end diff --git a/source/code/toml-parser/tomlrb/string_utils.rb b/source/code/toml-parser/tomlrb/string_utils.rb new file mode 100644 index 000000000..53d27e414 --- /dev/null +++ b/source/code/toml-parser/tomlrb/string_utils.rb @@ -0,0 +1,33 @@ +module Tomlrb + class StringUtils + + SPECIAL_CHARS = { + '\\t' => "\t", + '\\b' => "\b", + '\\f' => "\f", + '\\n' => "\n", + '\\r' => "\r", + '\\"' => '"', + '\\\\' => '\\' + }.freeze + + def self.multiline_replacements(str) + strip_spaces(str).gsub(/\\\n\s+/, '') + end + + def self.replace_escaped_chars(str) + str.gsub(/\\(u[\da-fA-F]{4}|U[\da-fA-F]{8}|.)/) do |m| + if m.size == 2 + SPECIAL_CHARS[m] || (raise Tomlrb::ParseError.new "Escape sequence #{m} is reserved") + else + m[2..-1].to_i(16).chr(Encoding::UTF_8) + end + end + end + + def self.strip_spaces(str) + str[0] = '' if str[0] == "\n" + str + end + end +end diff --git a/source/code/toml-parser/tomlrb/version.rb b/source/code/toml-parser/tomlrb/version.rb new file mode 100644 index 000000000..b72a81b60 --- /dev/null +++ b/source/code/toml-parser/tomlrb/version.rb @@ -0,0 +1,3 @@ +module Tomlrb + VERSION = "1.2.8" +end From 727d5bd691b50192d7b6879fe68e438ee7a7fdc1 Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Thu, 6 Jun 2019 16:47:55 -0700 Subject: [PATCH 092/160] Fix Scenario when Controller name is empty (#222) --- source/code/plugin/filter_inventory2mdm.rb | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/source/code/plugin/filter_inventory2mdm.rb b/source/code/plugin/filter_inventory2mdm.rb index f98a3224e..30f6f911a 100644 --- a/source/code/plugin/filter_inventory2mdm.rb +++ b/source/code/plugin/filter_inventory2mdm.rb @@ -176,6 +176,10 @@ def process_pod_inventory_records(es) podControllerNameDimValue = record['DataItems'][0]['ControllerName'] podNodeDimValue = 
record['DataItems'][0]['Computer'] + if podControllerNameDimValue.nil? || podControllerNameDimValue.empty? + podControllerNameDimValue = 'No Controller' + end + if podNodeDimValue.empty? && podPhaseDimValue.downcase == 'pending' podNodeDimValue = 'unscheduled' elsif podNodeDimValue.empty? From 5e4b0f3f817b1d51b9b0830acd71ebae90b7fe2d Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Thu, 6 Jun 2019 17:01:51 -0700 Subject: [PATCH 093/160] fix ; --- installer/datafiles/base_container.data | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/installer/datafiles/base_container.data b/installer/datafiles/base_container.data index fd070426c..58a74aa0a 100644 --- a/installer/datafiles/base_container.data +++ b/installer/datafiles/base_container.data @@ -112,7 +112,7 @@ MAINTAINER: 'Microsoft Corporation' /etc/opt/microsoft/docker-cimprov/telegraf-rs.conf; installer/conf/telegraf-rs.conf; 644; root; root /opt/microsoft/docker-cimprov/bin/TelegrafTCPErrorTelemetry.sh; installer/scripts/TelegrafTCPErrorTelemetry.sh; 755; root; root /opt/livenessprobe.sh; installer/scripts/livenessprobe.sh; 755; root; root -/opt/tomlparser.rb; installer/scripts/tomlparser.rb 755; root; root +/opt/tomlparser.rb; installer/scripts/tomlparser.rb; 755; root; root %Links /opt/omi/lib/libcontainer.${{SHLIB_EXT}}; /opt/microsoft/docker-cimprov/lib/libcontainer.${{SHLIB_EXT}}; 644; root; root From 6fefcac8db6db2fb97c7480ddb93036b1b65f092 Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Sat, 8 Jun 2019 08:26:37 -0700 Subject: [PATCH 094/160] ContainerLog collection optimizations (#223) * * derive k8s namespace from file (rather than making a api call) * optimize perf by not tailing excluded namespaces in stdout & stderr * Tuning fluentbit settings based on Cortana teams findings * making db sync off * buffer chunk and max as 1m so that we dont flush > 1m payloads * increasing rotatte wait from 5 secs to 30 secs * decreasing refresh interval from 60 secs to 30 secs * adding retry limit as 10 
so that items get dropped in 50 secs rather than infinetely trying * changing flush to 5 secs from 30 secs --- installer/conf/out_oms.conf | 2 - installer/conf/td-agent-bit-rs.conf | 1 + installer/conf/td-agent-bit.conf | 8 +- installer/scripts/tomlparser.rb | 21 +++- source/code/go/src/plugins/oms.go | 169 +++++++++----------------- source/code/go/src/plugins/out_oms.go | 1 - 6 files changed, 82 insertions(+), 120 deletions(-) diff --git a/installer/conf/out_oms.conf b/installer/conf/out_oms.conf index d6679f982..7af7b6fdd 100644 --- a/installer/conf/out_oms.conf +++ b/installer/conf/out_oms.conf @@ -3,5 +3,3 @@ cert_file_path=/etc/opt/microsoft/omsagent/certs/oms.crt key_file_path=/etc/opt/microsoft/omsagent/certs/oms.key container_host_file_path=/var/opt/microsoft/docker-cimprov/state/containerhostname container_inventory_refresh_interval=60 -#kube_system_containers_refresh_interval=300 -exclude_namespaces_containers_refresh_interval=60 diff --git a/installer/conf/td-agent-bit-rs.conf b/installer/conf/td-agent-bit-rs.conf index 740f8a951..7945261aa 100644 --- a/installer/conf/td-agent-bit-rs.conf +++ b/installer/conf/td-agent-bit-rs.conf @@ -25,5 +25,6 @@ [OUTPUT] Name oms EnableTelemetry true + Retry_Limit 10 TelemetryPushIntervalSeconds 300 Match oms.container.* diff --git a/installer/conf/td-agent-bit.conf b/installer/conf/td-agent-bit.conf index d1a045063..14728af5d 100644 --- a/installer/conf/td-agent-bit.conf +++ b/installer/conf/td-agent-bit.conf @@ -1,5 +1,5 @@ [SERVICE] - Flush 30 + Flush 5 Log_Level info Parsers_File /etc/td-agent-bit/parsers.conf Log_File /var/opt/microsoft/docker-cimprov/log/fluent-bit.log @@ -9,8 +9,13 @@ Tag oms.container.log.* Path ${AZMON_LOG_TAIL_PATH} DB /var/log/omsagent-fblogs.db + DB.Sync Off Parser docker Mem_Buf_Limit 5m + Buffer_Chunk_Size 1m + Buffer_Max_Size 1m + Rotate_Wait 20 + Refresh_Interval 30 Path_Key filepath Skip_Long_Lines On Ignore_Older 5m @@ -53,5 +58,6 @@ [OUTPUT] Name oms EnableTelemetry true + 
Retry_Limit 10 TelemetryPushIntervalSeconds 300 Match oms.container.* diff --git a/installer/scripts/tomlparser.rb b/installer/scripts/tomlparser.rb index 52516641a..abc8b8e19 100644 --- a/installer/scripts/tomlparser.rb +++ b/installer/scripts/tomlparser.rb @@ -13,7 +13,7 @@ @collectClusterEnvVariables = true @logTailPath = "/var/log/containers/*.log" @logExclusionRegexPattern = "(^((?!stdout|stderr).)*$)" -@excludePath = "*.csv2" +@excludePath = "*.csv2" #some invalid path # Use parser to parse the configmap toml file to a ruby structure def parseConfigMap @@ -76,11 +76,14 @@ def populateSettingValuesFromConfigMap(parsedConfig) @collectStderrLogs = parsedConfig[:log_collection_settings][:stderr][:enabled] puts "config::Using config map setting for stderr log collection" stderrNamespaces = parsedConfig[:log_collection_settings][:stderr][:exclude_namespaces] - + stdoutNamespaces = Array.new #Clearing it, so that it can be overridden with the config map settings @stderrExcludeNamespaces.clear if @collectStderrLogs && !stderrNamespaces.nil? if stderrNamespaces.kind_of?(Array) + if !@stdoutExcludeNamespaces.nil? && !@stdoutExcludeNamespaces.empty? + stdoutNamespaces = @stdoutExcludeNamespaces.split(',') + end # Checking only for the first element to be string because toml enforces the arrays to contain elements of same type if stderrNamespaces.length > 0 && stderrNamespaces[0].kind_of?(String) stderrNamespaces.each do |namespace| @@ -90,6 +93,10 @@ def populateSettingValuesFromConfigMap(parsedConfig) else @stderrExcludeNamespaces.concat("," + namespace) end + # Add this namespace to excludepath if both stdout & stderr are excluded for this namespace, to ensure are optimized and dont tail these files at all + if stdoutNamespaces.include? 
namespace + @excludePath.concat("," + "*_" + namespace + "_*.log") + end end puts "config::Using config map setting for stderr log collection to exclude namespace" end @@ -113,13 +120,16 @@ def populateSettingValuesFromConfigMap(parsedConfig) end @configSchemaVersion = ENV['AZMON_AGENT_CFG_SCHEMA_VERSION'] - if !@configSchemaVersion.nil? && !@configSchemaVersion.empty? && @@configSchemaVersion.strip.casecmp('v1') == 0 #note v1 is the only supported schema version , so hardcoding it + if !@configSchemaVersion.nil? && !@configSchemaVersion.empty? && @configSchemaVersion.strip.casecmp('v1') == 0 #note v1 is the only supported schema version , so hardcoding it + puts "****************Start Config Processing********************" configMapSettings = parseConfigMap if !configMapSettings.nil? populateSettingValuesFromConfigMap(configMapSettings) end else - puts "config::unsupported config schema version - #{@configSchemaVersion}, using defaults" + if (File.file?(@configMapMountPath)) + puts "config::unsupported/missing config schema version - '#{@configSchemaVersion}' , using defaults" + end @excludePath = "*_kube-system_*.log" end @@ -147,6 +157,9 @@ def populateSettingValuesFromConfigMap(parsedConfig) file.write("export AZMON_CLUSTER_LOG_TAIL_EXCLUDE_PATH=#{@excludePath}\n") # Close file after writing all environment variables file.close + puts "Both stdout & stderr log collection are turned off for namespaces: '#{@excludePath}' " + puts "****************End Config Processing********************" else puts "config::error::Exception while opening file for writing config environment variables" + puts "****************End Config Processing********************" end diff --git a/source/code/go/src/plugins/oms.go b/source/code/go/src/plugins/oms.go index 0ffaaff63..ae4a109a6 100644 --- a/source/code/go/src/plugins/oms.go +++ b/source/code/go/src/plugins/oms.go @@ -17,7 +17,6 @@ import ( lumberjack "gopkg.in/natefinch/lumberjack.v2" - corev1 "k8s.io/api/core/v1" metav1 
"k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" @@ -57,9 +56,6 @@ const ReplicaSetContainerLogPluginConfFilePath = "/etc/opt/microsoft/docker-cimp const IPName = "Containers" const defaultContainerInventoryRefreshInterval = 60 -// const defaultKubeSystemContainersRefreshInterval = 300 -const defaultExcludeNamespacesContainersRefreshInterval = 300 - var ( // PluginConfiguration the plugins configuration PluginConfiguration map[string]string @@ -84,10 +80,10 @@ var ( ImageIDMap map[string]string // NameIDMap caches the container it to Name mapping NameIDMap map[string]string - // StdoutIgnoreIDSet set of container Ids of excluded namespaces for stdout logs - StdoutIgnoreIDSet map[string]bool - // StderrIgnoreIDSet set of container Ids of excluded namespaces for stderr logs - StderrIgnoreIDSet map[string]bool + // StdoutIgnoreNamespaceSet set of excluded K8S namespaces for stdout logs + StdoutIgnoreNsSet map[string]bool + // StderrIgnoreNamespaceSet set of excluded K8S namespaces for stderr logs + StderrIgnoreNsSet map[string]bool // DataUpdateMutex read and write mutex access to the container id set DataUpdateMutex = &sync.Mutex{} // ContainerLogTelemetryMutex read and write mutex access to the Container Log Telemetry @@ -97,8 +93,6 @@ var ( ) var ( - // ExcludeNamespacesContainersRefreshTicker updates the excludenamespace containers - ExcludeNamespacesContainersRefreshTicker *time.Ticker // ContainerImageNameRefreshTicker updates the container image and names periodically ContainerImageNameRefreshTicker *time.Ticker ) @@ -225,70 +219,28 @@ func updateContainerImageNameMaps() { } } -func excludeContainerIDPopulator(excludeNamespaceList []string, logStream string) { - var podsToExclude []*corev1.PodList - listOptions := metav1.ListOptions{} - listOptions.FieldSelector = fmt.Sprintf("spec.nodeName=%s", Computer) - - pods, err := ClientSet.CoreV1().Pods("").List(listOptions) - if err != nil { - message := fmt.Sprintf("Error 
getting pods %s - for node %s . All %s logs might be collected", err.Error(), Computer, logStream) - SendException(message) - Log(message) - return - } - - podsToExclude = append(podsToExclude, pods) - ignoreNamespaceSet := make(map[string]bool) - for _, ns := range excludeNamespaceList { - ignoreNamespaceSet[strings.TrimSpace(ns)] = true - } - - _ignoreIDSet := make(map[string]bool) - for _, pod := range podsToExclude { - for _, pod := range pod.Items { - _, ok := ignoreNamespaceSet[pod.Namespace] - if ok { - Log ("Adding pod %s in namespace %s to %s exclusion list", pod.Name, pod.Namespace, logStream) - for _, status := range pod.Status.ContainerStatuses { - lastSlashIndex := strings.LastIndex(status.ContainerID, "/") - _ignoreIDSet[status.ContainerID[lastSlashIndex+1:len(status.ContainerID)]] = true - } - } - } - } - - Log("Locking to update excluded container IDs for %s", logStream) - DataUpdateMutex.Lock() - if strings.Compare(logStream, "stdout") == 0 { - StdoutIgnoreIDSet = _ignoreIDSet - } else { - StderrIgnoreIDSet = _ignoreIDSet - } - DataUpdateMutex.Unlock() - Log("Unlocking after updating excluded container IDs for %s", logStream) -} - -func updateExcludeStdoutContainerIDs() { - for ; true; <-ExcludeNamespacesContainersRefreshTicker.C { - collectStdoutLogs := os.Getenv("AZMON_COLLECT_STDOUT_LOGS") - var stdoutNSExcludeList []string - excludeList := os.Getenv("AZMON_STDOUT_EXCLUDED_NAMESPACES") - if (strings.Compare(collectStdoutLogs, "true") == 0) && (len(excludeList) > 0) { - stdoutNSExcludeList = strings.Split(excludeList, ",") - excludeContainerIDPopulator(stdoutNSExcludeList, "stdout") +func populateExcludedStdoutNamespaces() { + collectStdoutLogs := os.Getenv("AZMON_COLLECT_STDOUT_LOGS") + var stdoutNSExcludeList []string + excludeList := os.Getenv("AZMON_STDOUT_EXCLUDED_NAMESPACES") + if (strings.Compare(collectStdoutLogs, "true") == 0) && (len(excludeList) > 0) { + stdoutNSExcludeList = strings.Split(excludeList, ",") + for _, ns := range 
stdoutNSExcludeList { + Log ("Excluding namespace %s for stdout log collection", ns) + StdoutIgnoreNsSet[strings.TrimSpace(ns)] = true } } } -func updateExcludeStderrContainerIDs() { - for ; true; <-ExcludeNamespacesContainersRefreshTicker.C { - collectStderrLogs := os.Getenv("AZMON_COLLECT_STDERR_LOGS") - var stderrNSExcludeList []string - excludeList := os.Getenv("AZMON_STDERR_EXCLUDED_NAMESPACES") - if (strings.Compare(collectStderrLogs, "true") == 0) && (len(excludeList) > 0) { - stderrNSExcludeList = strings.Split(excludeList, ",") - excludeContainerIDPopulator(stderrNSExcludeList, "stderr") +func populateExcludedStderrNamespaces() { + collectStderrLogs := os.Getenv("AZMON_COLLECT_STDERR_LOGS") + var stderrNSExcludeList []string + excludeList := os.Getenv("AZMON_STDERR_EXCLUDED_NAMESPACES") + if (strings.Compare(collectStderrLogs, "true") == 0) && (len(excludeList) > 0) { + stderrNSExcludeList = strings.Split(excludeList, ",") + for _, ns := range stderrNSExcludeList { + Log ("Excluding namespace %s for stderr log collection", ns) + StderrIgnoreNsSet[strings.TrimSpace(ns)] = true } } } @@ -469,18 +421,11 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { var maxLatency float64 var maxLatencyContainer string - stdoutIgnoreIDSet := make(map[string]bool) - stderrIgnoreIDSet := make(map[string]bool) imageIDMap := make(map[string]string) nameIDMap := make(map[string]string) DataUpdateMutex.Lock() - for k, v := range StdoutIgnoreIDSet { - stdoutIgnoreIDSet[k] = v - } - for k, v := range StderrIgnoreIDSet { - stderrIgnoreIDSet[k] = v - } + for k, v := range ImageIDMap { imageIDMap[k] = v } @@ -490,15 +435,15 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { DataUpdateMutex.Unlock() for _, record := range tailPluginRecords { - containerID := GetContainerIDFromFilePath(ToString(record["filepath"])) + containerID, k8sNamespace := GetContainerIDK8sNamespaceFromFileName(ToString(record["filepath"])) logEntrySource := 
ToString(record["stream"]) if strings.EqualFold(logEntrySource, "stdout") { - if containerID == "" || containsKey(stdoutIgnoreIDSet, containerID) { + if containerID == "" || containsKey(StdoutIgnoreNsSet, k8sNamespace) { continue } } else if strings.EqualFold(logEntrySource, "stderr") { - if containerID == "" || containsKey(stderrIgnoreIDSet, containerID) { + if containerID == "" || containsKey(StderrIgnoreNsSet, k8sNamespace) { continue } } @@ -608,24 +553,38 @@ func containsKey(currentMap map[string]bool, key string) bool { return c } -// GetContainerIDFromFilePath Gets the container ID From the file Path -func GetContainerIDFromFilePath(filepath string) string { - start := strings.LastIndex(filepath, "-") - end := strings.LastIndex(filepath, ".") +// GetContainerIDK8sNamespaceFromFileName Gets the container ID From the file Name +// sample filename kube-proxy-dgcx7_kube-system_kube-proxy-8df7e49e9028b60b5b0d0547f409c455a9567946cf763267b7e6fa053ab8c182.log +func GetContainerIDK8sNamespaceFromFileName(filename string) (string, string) { + id := "" + ns := "" + + start := strings.LastIndex(filename, "-") + end := strings.LastIndex(filename, ".") + + if start >= end || start == -1 || end == -1 { + id = "" + } else { + id = filename[start+1 : end] + } + + start = strings.Index(filename, "_") + end = strings.LastIndex(filename, "_") + if start >= end || start == -1 || end == -1 { - // This means the file is not a managed Kubernetes docker log file. - // Drop all records from the file - Log("File %s is not a Kubernetes managed docker log file. 
Dropping all records from the file", filepath) - return "" + ns = "" + } else { + ns = filename[start+1 : end] } - return filepath[start+1 : end] + + return id, ns } // InitializePlugin reads and populates plugin configuration func InitializePlugin(pluginConfPath string, agentVersion string) { - StdoutIgnoreIDSet = make(map[string]bool) - StderrIgnoreIDSet = make(map[string]bool) + StdoutIgnoreNsSet = make(map[string]bool) + StderrIgnoreNsSet = make(map[string]bool) ImageIDMap = make(map[string]string) NameIDMap = make(map[string]string) @@ -683,16 +642,6 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { Log("containerInventoryRefreshInterval = %d \n", containerInventoryRefreshInterval) ContainerImageNameRefreshTicker = time.NewTicker(time.Second * time.Duration(containerInventoryRefreshInterval)) - excludeNamespacesContainersRefreshInterval, err := strconv.Atoi(pluginConfig["exclude_namespaces_containers_refresh_interval"]) - if err != nil { - message := fmt.Sprintf("Error Reading exclude namespaces Container Ids Refresh Interval %s", err.Error()) - Log(message) - SendException(message) - Log("Using Default Refresh Interval of %d s\n", defaultExcludeNamespacesContainersRefreshInterval) - excludeNamespacesContainersRefreshInterval = defaultExcludeNamespacesContainersRefreshInterval - } - Log("excludeNamespacesContainersRefreshInterval = %d \n", excludeNamespacesContainersRefreshInterval) - ExcludeNamespacesContainersRefreshTicker = time.NewTicker(time.Second * time.Duration(excludeNamespacesContainersRefreshInterval)) // Populate Computer field containerHostName, err := ioutil.ReadFile(pluginConfig["container_host_file_path"]) @@ -732,15 +681,11 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { CreateHTTPClient() - if strings.Compare(strings.ToLower(os.Getenv("CONTROLLER_TYPE")), "daemonset") == 0 { - defaultExcludePath := os.Getenv("AZMON_CLUSTER_LOG_TAIL_EXCLUDE_PATH") - //further optimization for clusters with default 
settings. need this cache only when log collection config is overridden with custom config - if ( (strings.Compare(defaultExcludePath, "*_kube-system_*.log") != 0) ) { - go updateExcludeStdoutContainerIDs() - go updateExcludeStderrContainerIDs() - } - go updateContainerImageNameMaps() - } else { - Log("Running in replicaset. Disabling kube-system container cache collection & updates \n") + if strings.Compare(strings.ToLower(os.Getenv("CONTROLLER_TYPE")), "daemonset") == 0 { + populateExcludedStdoutNamespaces() + populateExcludedStderrNamespaces() + go updateContainerImageNameMaps() + } else { + Log("Running in replicaset. Disabling container enrichment caching & updates \n") } } diff --git a/source/code/go/src/plugins/out_oms.go b/source/code/go/src/plugins/out_oms.go index 2ee6f994d..0fa2ddd4b 100644 --- a/source/code/go/src/plugins/out_oms.go +++ b/source/code/go/src/plugins/out_oms.go @@ -74,7 +74,6 @@ func FLBPluginFlush(data unsafe.Pointer, length C.int, tag *C.char) int { // FLBPluginExit exits the plugin func FLBPluginExit() int { ContainerLogTelemetryTicker.Stop() - ExcludeNamespacesContainersRefreshTicker.Stop() ContainerImageNameRefreshTicker.Stop() return output.FLB_OK } From f87349eafa96160a3d3c0bf81f80a8c98064b3e3 Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Sun, 9 Jun 2019 20:00:21 -0700 Subject: [PATCH 095/160] merge final changes for release from Vishwa/june2019agentrel to ci_feature (#224) * * derive k8s namespace from file (rather than making a api call) * optimize perf by not tailing excluded namespaces in stdout & stderr * Tuning fluentbit settings based on Cortana teams findings * making db sync off * buffer chunk and max as 1m so that we dont flush > 1m payloads * increasing rotatte wait from 5 secs to 30 secs * decreasing refresh interval from 60 secs to 30 secs * adding retry limit as 10 so that items get dropped in 50 secs rather than infinetely trying * changing flush to 5 secs from 30 secs * fix a minor comment * * change flush from 5 
to 10 secs based on perf findings --- installer/conf/td-agent-bit.conf | 2 +- installer/scripts/tomlparser.rb | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/installer/conf/td-agent-bit.conf b/installer/conf/td-agent-bit.conf index 14728af5d..3d51154e7 100644 --- a/installer/conf/td-agent-bit.conf +++ b/installer/conf/td-agent-bit.conf @@ -1,5 +1,5 @@ [SERVICE] - Flush 5 + Flush 10 Log_Level info Parsers_File /etc/td-agent-bit/parsers.conf Log_File /var/opt/microsoft/docker-cimprov/log/fluent-bit.log diff --git a/installer/scripts/tomlparser.rb b/installer/scripts/tomlparser.rb index abc8b8e19..3e7f48045 100644 --- a/installer/scripts/tomlparser.rb +++ b/installer/scripts/tomlparser.rb @@ -120,8 +120,8 @@ def populateSettingValuesFromConfigMap(parsedConfig) end @configSchemaVersion = ENV['AZMON_AGENT_CFG_SCHEMA_VERSION'] + puts "****************Start Config Processing********************" if !@configSchemaVersion.nil? && !@configSchemaVersion.empty? && @configSchemaVersion.strip.casecmp('v1') == 0 #note v1 is the only supported schema version , so hardcoding it - puts "****************Start Config Processing********************" configMapSettings = parseConfigMap if !configMapSettings.nil? 
populateSettingValuesFromConfigMap(configMapSettings) From 8a412c19c935035a13664eec8672e2af141be37b Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Fri, 14 Jun 2019 09:05:36 -0700 Subject: [PATCH 096/160] fix fluent bit tuning for perf run (#226) * fix fluent bit tuning for perf run * stop collecting our own partition --- installer/conf/td-agent-bit.conf | 14 ++++++++------ installer/conf/telegraf.conf | 2 +- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/installer/conf/td-agent-bit.conf b/installer/conf/td-agent-bit.conf index d4a49a385..2dee26234 100644 --- a/installer/conf/td-agent-bit.conf +++ b/installer/conf/td-agent-bit.conf @@ -1,5 +1,5 @@ [SERVICE] - Flush 10 + Flush 15 Log_Level info Parsers_File /etc/td-agent-bit/parsers.conf Log_File /var/opt/microsoft/docker-cimprov/log/fluent-bit.log @@ -11,7 +11,7 @@ DB /var/log/omsagent-fblogs.db DB.Sync Off Parser docker - Mem_Buf_Limit 5m + Mem_Buf_Limit 10m Buffer_Chunk_Size 1m Buffer_Max_Size 1m Rotate_Wait 20 @@ -26,20 +26,22 @@ Tag oms.container.log.flbplugin.* Path /var/log/containers/omsagent*.log DB /var/opt/microsoft/docker-cimprov/state/omsagent-ai.db - Mem_Buf_Limit 2m + DB.Sync Off + Mem_Buf_Limit 1m Path_Key filepath Skip_Long_Lines On - Ignore_Older 5m + Ignore_Older 2m [INPUT] Name tail Tag oms.container.log.telegraf.err.* Path /var/opt/microsoft/docker-cimprov/log/telegraf.log DB /var/opt/microsoft/docker-cimprov/state/telegraf-log-state.db - Mem_Buf_Limit 2m + DB.Sync Off + Mem_Buf_Limit 1m Path_Key filepath Skip_Long_Lines On - Ignore_Older 5m + Ignore_Older 2m [INPUT] Name tcp diff --git a/installer/conf/telegraf.conf b/installer/conf/telegraf.conf index 6b3f44929..06b1c55eb 100644 --- a/installer/conf/telegraf.conf +++ b/installer/conf/telegraf.conf @@ -404,7 +404,7 @@ # Below due to Bug - https://github.com/influxdata/telegraf/issues/5615 # ORDER matters here!! 
- i.e the below should be the LAST modifier [inputs.disk.tagdrop] - path = ["/var/lib/kubelet*", "/dev/termination-log", "/var/log", "/etc/hosts", "/etc/resolv.conf", "/etc/hostname", "/etc/kubernetes/host", "/var/lib/docker/containers"] + path = ["/var/lib/kubelet*", "/dev/termination-log", "/var/log", "/etc/hosts", "/etc/resolv.conf", "/etc/hostname", "/etc/kubernetes/host", "/var/lib/docker/containers", "/etc/config/settings"] # Read metrics about memory usage From e36b5ab1600fccfb9cad1fe1b07aa95f2f1171d7 Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Fri, 14 Jun 2019 09:30:04 -0700 Subject: [PATCH 097/160] fix merge issue --- source/code/go/src/plugins/oms.go | 69 ------------------------------- 1 file changed, 69 deletions(-) diff --git a/source/code/go/src/plugins/oms.go b/source/code/go/src/plugins/oms.go index 4e6cd4d88..b925e7145 100644 --- a/source/code/go/src/plugins/oms.go +++ b/source/code/go/src/plugins/oms.go @@ -243,75 +243,6 @@ func populateExcludedStderrNamespaces() { StderrIgnoreNsSet[strings.TrimSpace(ns)] = true } } - - var metrics []laTelegrafMetric - var i int - - for i = 0; i < len(laMetrics); i++ { - metrics = append(metrics, *laMetrics[i]) - } - - laTelegrafMetrics := InsightsMetricsBlob{ - DataType: InsightsMetricsDataType, - IPName: IPName, - DataItems: metrics} - - jsonBytes, err := json.Marshal(laTelegrafMetrics) - - if err != nil { - message := fmt.Sprintf("PostTelegrafMetricsToLA::Error:when marshalling json %q", err) - Log(message) - SendException(message) - return output.FLB_OK - } - - //Post metrics data to LA - req, _ := http.NewRequest("POST", OMSEndpoint, bytes.NewBuffer(jsonBytes)) - - //req.URL.Query().Add("api-version","2016-04-01") - - //set headers - req.Header.Set("x-ms-date", time.Now().Format(time.RFC3339)) - - //expensive to do string len for every request, so use a flag - if ResourceCentric == true { - req.Header.Set("x-ms-AzureResourceId", ResourceID) - } - - start := time.Now() - resp, err := HTTPClient.Do(req) - 
elapsed := time.Since(start) - - if err != nil { - message := fmt.Sprintf("PostTelegrafMetricsToLA::Error:(retriable) when sending %v metrics. duration:%v err:%q \n", len(laMetrics), elapsed, err.Error()) - Log(message) - SendException(message) - UpdateNumTelegrafMetricsSentTelemetry(0, 1) - return output.FLB_RETRY - } - - if resp == nil || resp.StatusCode != 200 { - if resp != nil { - Log("PostTelegrafMetricsToLA::Error:(retriable) Response Status %v Status Code %v", resp.Status, resp.StatusCode) - } - UpdateNumTelegrafMetricsSentTelemetry(0, 1) - return output.FLB_RETRY - } - - defer resp.Body.Close() - - numMetrics := len(laMetrics) - UpdateNumTelegrafMetricsSentTelemetry(numMetrics, 0) - Log("PostTelegrafMetricsToLA::Info:Successfully flushed %v records in %v", numMetrics, elapsed) - - return output.FLB_OK -} - -func UpdateNumTelegrafMetricsSentTelemetry(numMetricsSent int, numSendErrors int) { - ContainerLogTelemetryMutex.Lock() - TelegrafMetricsSentCount += float64(numMetricsSent) - TelegrafMetricsSendErrorCount += float64(numSendErrors) - ContainerLogTelemetryMutex.Unlock() } //Azure loganalytics metric values have to be numeric, so string values are dropped From 8ba1f86953ef3c666023564e1ad7ad57fffb584e Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Fri, 21 Jun 2019 12:10:34 -0700 Subject: [PATCH 098/160] add release notes for june release in ci_feature branch --- README.md | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/README.md b/README.md index d6ac07e33..32ed42929 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,26 @@ additional questions or comments. 
Note : The agent version(s) below has dates (ciprod), which indicate the agent build dates (not release dates) +##### Version microsoft/oms:ciprod06142019 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod06142019 +- MDM pod metrics bug fixes - MDM rejecting pod metrics due to nodename or controllername dimensions being empty +- Prometheus metrics collection by default in every node for kubelet docker operations and kubelet docker operation errors +- Telegraf metric collection for diskio and networkio metrics +- Agent Configuration/ Settings for data collection + * Cluster level log collection enable/disable option + * Ability to enable/disable stdout and/or stderr logs collection per namespace + * Cluster level environment variable collection enable/disable option + * Config file version & config schema version + * Pod annotation for supported config schema version(s) +- Log collection optimization/tuning for better performance + * Derive k8s namespaces from log file name (instead of making call to k8s api service) + * Do not tail log files for containers in the excluded namespace list (if excluded both in stdout & stderr) + * Limit buffer size to 1M and flush logs more frequently [every 10 secs (instead of 30 secs)] + * Tuning of several other fluent bit settings +- Increase requests + * Replica set memory request by 75M (100M to 175M) + * Daemonset CPU request by 25m (50m to 75m) +- Will be pushing image only to MCR ( no more Docker) starting this release. 
AKS-engine will also start to pull our agent image from MCR + ### 04/23/2019 - ##### Version microsoft/oms:ciprod043232019 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod04232019 - Windows node monitoring (metrics & inventory) From e7e9e6d73808e15566e5d56f00f543f777112678 Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Fri, 21 Jun 2019 12:12:27 -0700 Subject: [PATCH 099/160] fix title --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 32ed42929..3a12a521b 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ additional questions or comments. Note : The agent version(s) below has dates (ciprod), which indicate the agent build dates (not release dates) -##### Version microsoft/oms:ciprod06142019 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod06142019 +### 06/14/2019 - Version microsoft/oms:ciprod06142019 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod06142019 - MDM pod metrics bug fixes - MDM rejecting pod metrics due to nodename or controllername dimensions being empty - Prometheus metrics collection by default in every node for kubelet docker operations and kubelet docker operation errors - Telegraf metric collection for diskio and networkio metrics From 3903a9dc972fd6b6323a968cb8a5ede9707af722 Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Fri, 21 Jun 2019 12:13:51 -0700 Subject: [PATCH 100/160] update --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 3a12a521b..5dfc12f28 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,8 @@ additional questions or comments. 
Note : The agent version(s) below has dates (ciprod), which indicate the agent build dates (not release dates) -### 06/14/2019 - Version microsoft/oms:ciprod06142019 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod06142019 +### 06/14/2019 +##### Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod06142019 - MDM pod metrics bug fixes - MDM rejecting pod metrics due to nodename or controllername dimensions being empty - Prometheus metrics collection by default in every node for kubelet docker operations and kubelet docker operation errors - Telegraf metric collection for diskio and networkio metrics From f5b54fed0b05e4546e310d31c00bb872a3c1cac2 Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Fri, 21 Jun 2019 12:14:47 -0700 Subject: [PATCH 101/160] fix title --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5dfc12f28..759ec476d 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ additional questions or comments. Note : The agent version(s) below has dates (ciprod), which indicate the agent build dates (not release dates) ### 06/14/2019 -##### Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod06142019 +##### Version microsoft/oms:ciprod06142019 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod06142019 - MDM pod metrics bug fixes - MDM rejecting pod metrics due to nodename or controllername dimensions being empty - Prometheus metrics collection by default in every node for kubelet docker operations and kubelet docker operation errors - Telegraf metric collection for diskio and networkio metrics From 1d32cec35bf1b4441484080b5f18b0d54d64c49d Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Fri, 5 Jul 2019 10:05:29 -0700 Subject: [PATCH 102/160] Trim spaces in AKS_REGION (#233) This is not an issue for normal AKS Monitoring Addon Onboarding. 
ONLY an issue for backdoor onboarding --- source/code/plugin/out_mdm.rb | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/source/code/plugin/out_mdm.rb b/source/code/plugin/out_mdm.rb index 68c43d5da..a81da0fbc 100644 --- a/source/code/plugin/out_mdm.rb +++ b/source/code/plugin/out_mdm.rb @@ -67,7 +67,9 @@ def start return end - @@post_request_url = @@post_request_url_template % {aks_region: aks_region, aks_resource_id: aks_resource_id} + aks_region = aks_region.gsub(" ","") + + @@post_request_url = @@post_request_url_template % {aks_region: aks_region), aks_resource_id: aks_resource_id} @post_request_uri = URI.parse(@@post_request_url) @http_client = Net::HTTP.new(@post_request_uri.host, @post_request_uri.port) @http_client.use_ssl = true From 5b8c52eff693da7eee9f6adcaae567108b38cad0 Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Tue, 9 Jul 2019 11:17:10 -0700 Subject: [PATCH 103/160] Add Logs Size To Telemetry (#234) * Add Logs to telemetry * Using len instead of unsafe.Sizeof --- source/code/go/src/plugins/oms.go | 17 ++++++++-------- source/code/go/src/plugins/telemetry.go | 26 ++++++++++++++++--------- 2 files changed, 26 insertions(+), 17 deletions(-) diff --git a/source/code/go/src/plugins/oms.go b/source/code/go/src/plugins/oms.go index b925e7145..a79297189 100644 --- a/source/code/go/src/plugins/oms.go +++ b/source/code/go/src/plugins/oms.go @@ -226,7 +226,7 @@ func populateExcludedStdoutNamespaces() { if (strings.Compare(collectStdoutLogs, "true") == 0) && (len(excludeList) > 0) { stdoutNSExcludeList = strings.Split(excludeList, ",") for _, ns := range stdoutNSExcludeList { - Log ("Excluding namespace %s for stdout log collection", ns) + Log("Excluding namespace %s for stdout log collection", ns) StdoutIgnoreNsSet[strings.TrimSpace(ns)] = true } } @@ -239,7 +239,7 @@ func populateExcludedStderrNamespaces() { if (strings.Compare(collectStderrLogs, "true") == 0) && (len(excludeList) > 0) { stderrNSExcludeList = 
strings.Split(excludeList, ",") for _, ns := range stderrNSExcludeList { - Log ("Excluding namespace %s for stderr log collection", ns) + Log("Excluding namespace %s for stderr log collection", ns) StderrIgnoreNsSet[strings.TrimSpace(ns)] = true } } @@ -425,7 +425,7 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { nameIDMap := make(map[string]string) DataUpdateMutex.Lock() - + for k, v := range ImageIDMap { imageIDMap[k] = v } @@ -476,6 +476,8 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { Name: stringMap["Name"], } + FlushedRecordsSize += float64(len(stringMap["LogEntry"])) + dataItems = append(dataItems, dataItem) loggedTime, e := time.Parse(time.RFC3339, dataItem.LogEntryTimeStamp) if e != nil { @@ -561,7 +563,7 @@ func GetContainerIDK8sNamespaceFromFileName(filename string) (string, string) { start := strings.LastIndex(filename, "-") end := strings.LastIndex(filename, ".") - + if start >= end || start == -1 || end == -1 { id = "" } else { @@ -641,7 +643,6 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { Log("containerInventoryRefreshInterval = %d \n", containerInventoryRefreshInterval) ContainerImageNameRefreshTicker = time.NewTicker(time.Second * time.Duration(containerInventoryRefreshInterval)) - // Populate Computer field containerHostName, err := ioutil.ReadFile(pluginConfig["container_host_file_path"]) if err != nil { @@ -680,11 +681,11 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { CreateHTTPClient() - if strings.Compare(strings.ToLower(os.Getenv("CONTROLLER_TYPE")), "daemonset") == 0 { + if strings.Compare(strings.ToLower(os.Getenv("CONTROLLER_TYPE")), "daemonset") == 0 { populateExcludedStdoutNamespaces() populateExcludedStderrNamespaces() - go updateContainerImageNameMaps() - } else { + go updateContainerImageNameMaps() + } else { Log("Running in replicaset. 
Disabling container enrichment caching & updates \n") } } diff --git a/source/code/go/src/plugins/telemetry.go b/source/code/go/src/plugins/telemetry.go index 956ebf07e..5fc0fa843 100644 --- a/source/code/go/src/plugins/telemetry.go +++ b/source/code/go/src/plugins/telemetry.go @@ -16,6 +16,8 @@ import ( var ( // FlushedRecordsCount indicates the number of flushed log records in the current period FlushedRecordsCount float64 + // FlushedRecordsSize indicates the size of the flushed records in the current period + FlushedRecordsSize float64 // FlushedRecordsTimeTaken indicates the cumulative time taken to flush the records for the current period FlushedRecordsTimeTaken float64 // This is telemetry for how old/latent logs we are processing in milliseconds (max over a period of time) @@ -35,16 +37,17 @@ var ( ) const ( - clusterTypeACS = "ACS" - clusterTypeAKS = "AKS" - envAKSResourceID = "AKS_RESOURCE_ID" - envACSResourceName = "ACS_RESOURCE_NAME" - envAppInsightsAuth = "APPLICATIONINSIGHTS_AUTH" - metricNameAvgFlushRate = "ContainerLogAvgRecordsFlushedPerSec" - metricNameAvgLogGenerationRate = "ContainerLogsGeneratedPerSec" - metricNameAgentLogProcessingMaxLatencyMs = "ContainerLogsAgentSideLatencyMs" + clusterTypeACS = "ACS" + clusterTypeAKS = "AKS" + envAKSResourceID = "AKS_RESOURCE_ID" + envACSResourceName = "ACS_RESOURCE_NAME" + envAppInsightsAuth = "APPLICATIONINSIGHTS_AUTH" + metricNameAvgFlushRate = "ContainerLogAvgRecordsFlushedPerSec" + metricNameAvgLogGenerationRate = "ContainerLogsGeneratedPerSec" + metricNameLogSize = "ContainerLogsSize" + metricNameAgentLogProcessingMaxLatencyMs = "ContainerLogsAgentSideLatencyMs" metricNameNumberofTelegrafMetricsSentSuccessfully = "TelegrafMetricsSentCount" - metricNameNumberofSendErrorsTelegrafMetrics = "TelegrafMetricsSendErrorCount" + metricNameNumberofSendErrorsTelegrafMetrics = "TelegrafMetricsSendErrorCount" defaultTelemetryPushIntervalSeconds = 300 @@ -71,11 +74,13 @@ func 
SendContainerLogPluginMetrics(telemetryPushIntervalProperty string) { ContainerLogTelemetryMutex.Lock() flushRate := FlushedRecordsCount / FlushedRecordsTimeTaken * 1000 logRate := FlushedRecordsCount / float64(elapsed/time.Second) + logSizeRate := FlushedRecordsSize / float64(elapsed/time.Second) telegrafMetricsSentCount := TelegrafMetricsSentCount telegrafMetricsSendErrorCount := TelegrafMetricsSendErrorCount TelegrafMetricsSentCount = 0.0 TelegrafMetricsSendErrorCount = 0.0 FlushedRecordsCount = 0.0 + FlushedRecordsSize = 0.0 FlushedRecordsTimeTaken = 0.0 logLatencyMs := AgentLogProcessingMaxLatencyMs logLatencyMsContainer := AgentLogProcessingMaxLatencyMsContainer @@ -88,7 +93,10 @@ func SendContainerLogPluginMetrics(telemetryPushIntervalProperty string) { flushRateMetric := appinsights.NewMetricTelemetry(metricNameAvgFlushRate, flushRate) TelemetryClient.Track(flushRateMetric) logRateMetric := appinsights.NewMetricTelemetry(metricNameAvgLogGenerationRate, logRate) + logSizeMetric := appinsights.NewMetricTelemetry(metricNameLogSize, logSizeRate) TelemetryClient.Track(logRateMetric) + Log("Log Size Rate: %f\n", logSizeRate) + TelemetryClient.Track(logSizeMetric) logLatencyMetric := appinsights.NewMetricTelemetry(metricNameAgentLogProcessingMaxLatencyMs, logLatencyMs) logLatencyMetric.Properties["Container"] = logLatencyMsContainer TelemetryClient.Track(logLatencyMetric) From 5fc0f1b49b6cdd04b9f8adddadbfa6a6bc1f73f5 Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Tue, 9 Jul 2019 15:10:12 -0700 Subject: [PATCH 104/160] Merge Vishwa/promcustommetrics to ci_feature (#237) * hard code config for UST CCP team * fix config * fix config after discussion * fix error log to get errros * fix config * update config * Add telemetry * Rashmi/promcustomconfig (#231) * changes * formatting changes * changes * changes * changes * changes * changes * changes * changes * changes * adding telemetry * changes * changes * changes * changes * changes * changes * changes * 
cahnges * changes * Rashmi/promcustomconfig (#236) * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * fix exceptions * changes to remove some exceptions * exception fixes * changes * changes for poduid nil check --- installer/conf/td-agent-bit-rs.conf | 10 - installer/conf/td-agent-bit.conf | 22 +- installer/conf/telegraf-rs.conf | 95 +++-- installer/conf/telegraf.conf | 88 +++-- installer/datafiles/base_container.data | 3 +- .../scripts/tomlparser-prom-customconfig.rb | 200 +++++++++++ installer/scripts/tomlparser.rb | 82 ++--- source/code/go/src/plugins/oms.go | 9 +- source/code/go/src/plugins/out_oms.go | 2 - .../code/plugin/CAdvisorMetricsAPIClient.rb | 34 +- source/code/plugin/DockerApiClient.rb | 325 +++++++++--------- source/code/plugin/KubernetesApiClient.rb | 2 +- source/code/plugin/in_containerinventory.rb | 5 +- source/code/plugin/in_kube_events.rb | 138 ++++---- source/code/plugin/in_kube_nodes.rb | 134 ++++---- source/code/plugin/in_kube_podinventory.rb | 18 +- source/code/plugin/in_kube_services.rb | 191 +++++----- 17 files changed, 833 insertions(+), 525 deletions(-) create mode 100644 installer/scripts/tomlparser-prom-customconfig.rb diff --git a/installer/conf/td-agent-bit-rs.conf b/installer/conf/td-agent-bit-rs.conf index 7945261aa..7839b0eee 100644 --- a/installer/conf/td-agent-bit-rs.conf +++ b/installer/conf/td-agent-bit-rs.conf @@ -4,16 +4,6 @@ Parsers_File /etc/td-agent-bit/parsers.conf Log_File /var/opt/microsoft/docker-cimprov/log/fluent-bit.log -[INPUT] - Name tail - Tag oms.container.log.telegraf.err.* - Path /var/opt/microsoft/docker-cimprov/log/telegraf.log - DB /var/opt/microsoft/docker-cimprov/state/telegraf-log-state.db - Mem_Buf_Limit 2m - Path_Key filepath - Skip_Long_Lines On - Ignore_Older 5m - [INPUT] Name tcp Tag oms.container.perf.telegraf.* diff --git a/installer/conf/td-agent-bit.conf 
b/installer/conf/td-agent-bit.conf index 2dee26234..e7aabd242 100644 --- a/installer/conf/td-agent-bit.conf +++ b/installer/conf/td-agent-bit.conf @@ -6,7 +6,7 @@ [INPUT] Name tail - Tag oms.container.log.* + Tag oms.container.log.la.* Path ${AZMON_LOG_TAIL_PATH} DB /var/log/omsagent-fblogs.db DB.Sync Off @@ -32,17 +32,6 @@ Skip_Long_Lines On Ignore_Older 2m -[INPUT] - Name tail - Tag oms.container.log.telegraf.err.* - Path /var/opt/microsoft/docker-cimprov/log/telegraf.log - DB /var/opt/microsoft/docker-cimprov/state/telegraf-log-state.db - DB.Sync Off - Mem_Buf_Limit 1m - Path_Key filepath - Skip_Long_Lines On - Ignore_Older 2m - [INPUT] Name tcp Tag oms.container.perf.telegraf.* @@ -53,9 +42,16 @@ [FILTER] Name grep - Match oms.container.log.* + Match oms.container.log.la.* Exclude stream ${AZMON_LOG_EXCLUSION_REGEX_PATTERN} +# Exclude prometheus plugin exceptions that might be caused due to invalid config.(Logs which contain - E! [inputs.prometheus]) +# Excluding these logs from being sent to AI since it can result in high volume of data in telemetry due to invalid config. +[FILTER] + Name grep + Match oms.container.log.flbplugin.* + Exclude log E! [\[]inputs.prometheus[\]] + [OUTPUT] Name oms EnableTelemetry true diff --git a/installer/conf/telegraf-rs.conf b/installer/conf/telegraf-rs.conf index cb9a36685..ce60bfa04 100644 --- a/installer/conf/telegraf-rs.conf +++ b/installer/conf/telegraf-rs.conf @@ -77,7 +77,7 @@ ## Run telegraf in quiet mode (error log messages only). quiet = true ## Specify the log file name. The empty string means to log to stderr. 
- logfile = "/var/opt/microsoft/docker-cimprov/log/telegraf.log" + logfile = "" ## Override default hostname, if empty use os.Hostname() #hostname = "placeholder_hostname" @@ -536,32 +536,75 @@ #tagexclude = ["AgentVersion","AKS_RESOURCE_ID","ACS_RESOURCE_NAME", "Region", "ClusterName", "ClusterType", "Computer", "ControllerType"] # [inputs.prometheus.tagpass] -[[inputs.exec]] - ## Commands array - interval = "15m" - commands = [ - "/opt/microsoft/docker-cimprov/bin/TelegrafTCPErrorTelemetry.sh" - ] +#Prometheus Custom Metrics +[[inputs.prometheus]] + interval = "$AZMON_RS_PROM_INTERVAL" - ## Timeout for each command to complete. - timeout = "15s" + ## An array of urls to scrape metrics from. + urls = $AZMON_RS_PROM_URLS + + ## An array of Kubernetes services to scrape metrics from. + kubernetes_services = $AZMON_RS_PROM_K8S_SERVICES + + ## Scrape Kubernetes pods for the following prometheus annotations: + ## - prometheus.io/scrape: Enable scraping for this pod + ## - prometheus.io/scheme: If the metrics endpoint is secured then you will need to + ## set this to `https` & most likely set the tls config. + ## - prometheus.io/path: If the metrics path is not /metrics, define it with this annotation. + ## - prometheus.io/port: If port is not 9102 use this annotation + monitor_kubernetes_pods = $AZMON_RS_PROM_MONITOR_PODS - ## measurement name suffix (for separating different commands) - name_suffix = "_telemetry" + fieldpass = $AZMON_RS_PROM_FIELDPASS + fielddrop = $AZMON_RS_PROM_FIELDDROP - ## Data format to consume. 
- ## Each data format has its own unique set of configuration options, read - ## more about them here: - ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md - data_format = "influx" - #tagexclude = ["hostName"] - [inputs.exec.tags] - AgentVersion = "$AGENT_VERSION" - AKS_RESOURCE_ID = "$TELEMETRY_AKS_RESOURCE_ID" - ACS_RESOURCE_NAME = "$TELEMETRY_ACS_RESOURCE_NAME" - Region = "$TELEMETRY_AKS_REGION" - ClusterName = "$TELEMETRY_CLUSTER_NAME" - ClusterType = "$TELEMETRY_CLUSTER_TYPE" - Computer = "placeholder_hostname" - ControllerType = "$CONTROLLER_TYPE" + metric_version = 2 + url_tag = "scrapeUrl" + + ## Kubernetes config file to create client from. + # kube_config = "/path/to/kubernetes.config" + + ## Use bearer token for authorization. ('bearer_token' takes priority) + bearer_token = "/var/run/secrets/kubernetes.io/serviceaccount/token" + ## OR + # bearer_token_string = "abc_123" + + ## Specify timeout duration for slower prometheus clients (default is 3s) + response_timeout = "15s" + + ## Optional TLS Config + tls_ca = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + #tls_cert = /path/to/certfile + # tls_key = /path/to/keyfile + ## Use TLS but skip chain & host verification + insecure_skip_verify = true + #tagexclude = ["AgentVersion","AKS_RESOURCE_ID","ACS_RESOURCE_NAME", "Region", "ClusterName", "ClusterType", "Computer", "ControllerType"] + +# [[inputs.exec]] +# ## Commands array +# interval = "15m" +# commands = [ +# "/opt/microsoft/docker-cimprov/bin/TelegrafTCPErrorTelemetry.sh" +# ] + +# ## Timeout for each command to complete. +# timeout = "15s" + +# ## measurement name suffix (for separating different commands) +# name_suffix = "_telemetry" + +# ## Data format to consume. 
+# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md +# data_format = "influx" +# #tagexclude = ["hostName"] +# [inputs.exec.tags] +# AgentVersion = "$AGENT_VERSION" +# AKS_RESOURCE_ID = "$TELEMETRY_AKS_RESOURCE_ID" +# ACS_RESOURCE_NAME = "$TELEMETRY_ACS_RESOURCE_NAME" +# Region = "$TELEMETRY_AKS_REGION" +# ClusterName = "$TELEMETRY_CLUSTER_NAME" +# ClusterType = "$TELEMETRY_CLUSTER_TYPE" +# Computer = "placeholder_hostname" +# ControllerType = "$CONTROLLER_TYPE" diff --git a/installer/conf/telegraf.conf b/installer/conf/telegraf.conf index 06b1c55eb..4883de81b 100644 --- a/installer/conf/telegraf.conf +++ b/installer/conf/telegraf.conf @@ -77,8 +77,7 @@ ## Run telegraf in quiet mode (error log messages only). quiet = true ## Specify the log file name. The empty string means to log to stderr. - logfile = "/var/opt/microsoft/docker-cimprov/log/telegraf.log" - + logfile = "" ## Override default hostname, if empty use os.Hostname() #hostname = "placeholder_hostname" ## If set to true, do no set the "host" tag in the telegraf agent. @@ -568,31 +567,66 @@ insecure_skip_verify = true #tagexclude = ["AgentVersion","AKS_RESOURCE_ID","ACS_RESOURCE_NAME", "Region", "ClusterName", "ClusterType", "Computer", "ControllerType"] -[[inputs.exec]] - ## Commands array - interval = "15m" - commands = [ - "/opt/microsoft/docker-cimprov/bin/TelegrafTCPErrorTelemetry.sh" - ] - ## Timeout for each command to complete. - timeout = "15s" +## prometheus custom metrics +[[inputs.prometheus]] - ## measurement name suffix (for separating different commands) - name_suffix = "_telemetry" + interval = "$AZMON_DS_PROM_INTERVAL" - ## Data format to consume. 
- ## Each data format has its own unique set of configuration options, read - ## more about them here: - ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md - data_format = "influx" - tagexclude = ["hostName"] - [inputs.exec.tags] - AgentVersion = "$AGENT_VERSION" - AKS_RESOURCE_ID = "$TELEMETRY_AKS_RESOURCE_ID" - ACS_RESOURCE_NAME = "$TELEMETRY_ACS_RESOURCE_NAME" - Region = "$TELEMETRY_AKS_REGION" - ClusterName = "$TELEMETRY_CLUSTER_NAME" - ClusterType = "$TELEMETRY_CLUSTER_TYPE" - Computer = "placeholder_hostname" - ControllerType = "$CONTROLLER_TYPE" \ No newline at end of file + ## An array of urls to scrape metrics from. + urls = $AZMON_DS_PROM_URLS + + fieldpass = $AZMON_DS_PROM_FIELDPASS + + fielddrop = $AZMON_DS_PROM_FIELDDROP + + metric_version = 2 + url_tag = "scrapeUrl" + + ## Kubernetes config file to create client from. + # kube_config = "/path/to/kubernetes.config" + + ## Use bearer token for authorization. ('bearer_token' takes priority) + bearer_token = "/var/run/secrets/kubernetes.io/serviceaccount/token" + ## OR + # bearer_token_string = "abc_123" + + ## Specify timeout duration for slower prometheus clients (default is 3s) + response_timeout = "15s" + + ## Optional TLS Config + tls_ca = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + #tls_cert = /path/to/certfile + # tls_key = /path/to/keyfile + ## Use TLS but skip chain & host verification + insecure_skip_verify = true + #tagexclude = ["AgentVersion","AKS_RESOURCE_ID","ACS_RESOURCE_NAME", "Region", "ClusterName", "ClusterType", "Computer", "ControllerType"] + +# [[inputs.exec]] +# ## Commands array +# interval = "15m" +# commands = [ +# "/opt/microsoft/docker-cimprov/bin/TelegrafTCPErrorTelemetry.sh" +# ] + +# ## Timeout for each command to complete. +# timeout = "15s" + +# ## measurement name suffix (for separating different commands) +# name_suffix = "_telemetry" + +# ## Data format to consume. 
+# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md +# data_format = "influx" +# tagexclude = ["hostName"] +# [inputs.exec.tags] +# AgentVersion = "$AGENT_VERSION" +# AKS_RESOURCE_ID = "$TELEMETRY_AKS_RESOURCE_ID" +# ACS_RESOURCE_NAME = "$TELEMETRY_ACS_RESOURCE_NAME" +# Region = "$TELEMETRY_AKS_REGION" +# ClusterName = "$TELEMETRY_CLUSTER_NAME" +# ClusterType = "$TELEMETRY_CLUSTER_TYPE" +# Computer = "placeholder_hostname" +# ControllerType = "$CONTROLLER_TYPE" \ No newline at end of file diff --git a/installer/datafiles/base_container.data b/installer/datafiles/base_container.data index 58a74aa0a..fe1635335 100644 --- a/installer/datafiles/base_container.data +++ b/installer/datafiles/base_container.data @@ -110,9 +110,10 @@ MAINTAINER: 'Microsoft Corporation' /etc/opt/microsoft/docker-cimprov/out_oms.conf; installer/conf/out_oms.conf; 644; root; root /etc/opt/microsoft/docker-cimprov/telegraf.conf; installer/conf/telegraf.conf; 644; root; root /etc/opt/microsoft/docker-cimprov/telegraf-rs.conf; installer/conf/telegraf-rs.conf; 644; root; root -/opt/microsoft/docker-cimprov/bin/TelegrafTCPErrorTelemetry.sh; installer/scripts/TelegrafTCPErrorTelemetry.sh; 755; root; root +/opt/microsoft/docker-cimprov/bin/TelegrafTCPErrorTelemetry.sh; installer/scripts/TelegrafTCPErrorTelemetry.sh; 755; root; root /opt/livenessprobe.sh; installer/scripts/livenessprobe.sh; 755; root; root /opt/tomlparser.rb; installer/scripts/tomlparser.rb; 755; root; root +/opt/tomlparser-prom-customconfig.rb; installer/scripts/tomlparser-prom-customconfig.rb; 755; root; root %Links /opt/omi/lib/libcontainer.${{SHLIB_EXT}}; /opt/microsoft/docker-cimprov/lib/libcontainer.${{SHLIB_EXT}}; 644; root; root diff --git a/installer/scripts/tomlparser-prom-customconfig.rb b/installer/scripts/tomlparser-prom-customconfig.rb new file mode 100644 index 000000000..d9fdf1cc2 
--- /dev/null +++ b/installer/scripts/tomlparser-prom-customconfig.rb @@ -0,0 +1,200 @@ +#!/usr/local/bin/ruby + +require_relative "tomlrb" +require "fileutils" + +@promConfigMapMountPath = "/etc/config/settings/prometheus-data-collection-settings" +@replicaset = "replicaset" +@daemonset = "daemonset" +@configSchemaVersion = "" +@defaultDsInterval = "1m" +@defaultDsPromUrls = [] +@defaultDsFieldPass = [] +@defaultDsFieldDrop = [] +@defaultRsInterval = "1m" +@defaultRsPromUrls = [] +@defaultRsFieldPass = [] +@defaultRsFieldDrop = [] +@defaultRsK8sServices = [] +@defaultRsMonitorPods = false + +# Use parser to parse the configmap toml file to a ruby structure +def parseConfigMap + begin + # Check to see if config map is created + if (File.file?(@promConfigMapMountPath)) + puts "config::configmap container-azm-ms-agentconfig for settings mounted, parsing values for prometheus config map" + parsedConfig = Tomlrb.load_file(@promConfigMapMountPath, symbolize_keys: true) + puts "config::Successfully parsed mounted prometheus config map" + return parsedConfig + else + puts "config::configmap container-azm-ms-agentconfig for settings not mounted, using defaults for prometheus scraping" + return nil + end + rescue => errorStr + puts "config::error::Exception while parsing toml config file for prometheus config: #{errorStr}, using defaults" + return nil + end +end + +def checkForTypeArray(arrayValue, arrayType) + if (arrayValue.nil? || (arrayValue.kind_of?(Array) && arrayValue.length > 0 && arrayValue[0].kind_of?(arrayType))) + return true + else + return false + end +end + +def checkForType(variable, varType) + if variable.nil? 
|| variable.kind_of?(varType) + return true + else + return false + end +end + +# Use the ruby structure created after config parsing to set the right values to be used as environment variables +def populateSettingValuesFromConfigMap(parsedConfig) + # Checking to see if this is the daemonset or replicaset to parse config accordingly + controller = ENV["CONTROLLER_TYPE"] + if !controller.nil? + if !parsedConfig.nil? && !parsedConfig[:prometheus_data_collection_settings].nil? + if controller.casecmp(@replicaset) == 0 && !parsedConfig[:prometheus_data_collection_settings][:cluster].nil? + #Get prometheus replicaset custom config settings + begin + interval = parsedConfig[:prometheus_data_collection_settings][:cluster][:interval] + fieldPass = parsedConfig[:prometheus_data_collection_settings][:cluster][:fieldpass] + fieldDrop = parsedConfig[:prometheus_data_collection_settings][:cluster][:fielddrop] + urls = parsedConfig[:prometheus_data_collection_settings][:cluster][:urls] + kubernetesServices = parsedConfig[:prometheus_data_collection_settings][:cluster][:kubernetes_services] + monitorKubernetesPods = parsedConfig[:prometheus_data_collection_settings][:cluster][:monitor_kubernetes_pods] + + # Check for the right datattypes to enforce right setting values + if checkForType(interval, String) && + checkForTypeArray(fieldPass, String) && + checkForTypeArray(fieldDrop, String) && + checkForTypeArray(kubernetesServices, String) && + checkForTypeArray(urls, String) && + !monitorKubernetesPods.nil? && (!!monitorKubernetesPods == monitorKubernetesPods) #Checking for Boolean type, since 'Boolean' is not defined as a type in ruby + puts "config::Successfully passed typecheck for config settings for replicaset" + #if setting is nil assign default values + interval = (interval.nil?) ? @defaultRsInterval : interval + fieldPass = (fieldPass.nil?) ? @defaultRsFieldPass : fieldPass + fieldDrop = (fieldDrop.nil?) ? 
@defaultRsFieldDrop : fieldDrop + kubernetesServices = (kubernetesServices.nil?) ? @defaultRsK8sServices : kubernetesServices + urls = (urls.nil?) ? @defaultRsPromUrls : urls + monitorKubernetesPods = (kubernetesServices.nil?) ? @defaultRsMonitorPods : monitorKubernetesPods + + file_name = "/opt/telegraf-test-rs.conf" + # Copy the telegraf config file to a temp file to run telegraf in test mode with this config + FileUtils.cp("/etc/opt/microsoft/docker-cimprov/telegraf-rs.conf", file_name) + + puts "config::Starting to substitute the placeholders in telegraf conf copy file for replicaset" + #Replace the placeholder config values with values from custom config + text = File.read(file_name) + new_contents = text.gsub("$AZMON_RS_PROM_INTERVAL", interval) + new_contents = new_contents.gsub("$AZMON_RS_PROM_FIELDPASS", ((fieldPass.length > 0) ? ("[\"" + fieldPass.join("\",\"") + "\"]") : "[]")) + new_contents = new_contents.gsub("$AZMON_RS_PROM_FIELDDROP", ((fieldDrop.length > 0) ? ("[\"" + fieldDrop.join("\",\"") + "\"]") : "[]")) + new_contents = new_contents.gsub("$AZMON_RS_PROM_URLS", ((urls.length > 0) ? ("[\"" + urls.join("\",\"") + "\"]") : "[]")) + new_contents = new_contents.gsub("$AZMON_RS_PROM_K8S_SERVICES", ((kubernetesServices.length > 0) ? ("[\"" + kubernetesServices.join("\",\"") + "\"]") : "[]")) + new_contents = new_contents.gsub("$AZMON_RS_PROM_MONITOR_PODS", (monitorKubernetesPods ? "true" : "false")) + File.open(file_name, "w") { |file| file.puts new_contents } + puts "config::Successfully substituted the placeholders in telegraf conf file for replicaset" + #Set environment variables for telemetry + file = File.open("telemetry_prom_config_env_var", "w") + if !file.nil? 
+ file.write("export TELEMETRY_RS_PROM_INTERVAL=\"#{interval}\"\n") + #Setting array lengths as environment variables for telemetry purposes + file.write("export TELEMETRY_RS_PROM_FIELDPASS_LENGTH=\"#{fieldPass.length}\"\n") + file.write("export TELEMETRY_RS_PROM_FIELDDROP_LENGTH=\"#{fieldDrop.length}\"\n") + file.write("export TELEMETRY_RS_PROM_K8S_SERVICES_LENGTH=#{kubernetesServices.length}\n") + file.write("export TELEMETRY_RS_PROM_URLS_LENGTH=#{urls.length}\n") + file.write("export TELEMETRY_RS_PROM_MONITOR_PODS=\"#{monitorKubernetesPods}\"\n") + # Close file after writing all environment variables + file.close + puts "config::Successfully created telemetry file for replicaset" + end + else + puts "config::Typecheck failed for prometheus config settings for replicaset, using defaults" + end # end of type check condition + rescue => errorStr + puts "config::error::Exception while parsing config file for prometheus config for replicaset: #{errorStr}, using defaults" + setRsPromDefaults + puts "****************End Prometheus Config Processing********************" + end + elsif controller.casecmp(@daemonset) == 0 && !parsedConfig[:prometheus_data_collection_settings][:node].nil? + #Get prometheus daemonset custom config settings + begin + interval = parsedConfig[:prometheus_data_collection_settings][:node][:interval] + fieldPass = parsedConfig[:prometheus_data_collection_settings][:node][:fieldpass] + fieldDrop = parsedConfig[:prometheus_data_collection_settings][:node][:fielddrop] + urls = parsedConfig[:prometheus_data_collection_settings][:node][:urls] + + # Check for the right datattypes to enforce right setting values + if checkForType(interval, String) && + checkForTypeArray(fieldPass, String) && + checkForTypeArray(fieldDrop, String) && + checkForTypeArray(urls, String) + puts "config::Successfully passed typecheck for config settings for daemonset" + + #if setting is nil assign default values + interval = (interval.nil?) ? 
@defaultDsInterval : interval + fieldPass = (fieldPass.nil?) ? @defaultDsFieldPass : fieldPass + fieldDrop = (fieldDrop.nil?) ? @defaultDsFieldDrop : fieldDrop + urls = (urls.nil?) ? @defaultDsPromUrls : urls + + file_name = "/opt/telegraf-test.conf" + # Copy the telegraf config file to a temp file to run telegraf in test mode with this config + FileUtils.cp("/etc/opt/microsoft/docker-cimprov/telegraf.conf", file_name) + + puts "config::Starting to substitute the placeholders in telegraf conf copy file for daemonset" + #Replace the placeholder config values with values from custom config + text = File.read(file_name) + new_contents = text.gsub("$AZMON_DS_PROM_INTERVAL", interval) + new_contents = new_contents.gsub("$AZMON_DS_PROM_FIELDPASS", ((fieldPass.length > 0) ? ("[\"" + fieldPass.join("\",\"") + "\"]") : "[]")) + new_contents = new_contents.gsub("$AZMON_DS_PROM_FIELDDROP", ((fieldDrop.length > 0) ? ("[\"" + fieldDrop.join("\",\"") + "\"]") : "[]")) + new_contents = new_contents.gsub("$AZMON_DS_PROM_URLS", ((urls.length > 0) ? ("[\"" + urls.join("\",\"") + "\"]") : "[]")) + File.open(file_name, "w") { |file| file.puts new_contents } + puts "config::Successfully substituted the placeholders in telegraf conf file for daemonset" + + #Set environment variables for telemetry + file = File.open("telemetry_prom_config_env_var", "w") + if !file.nil? 
+ file.write("export TELEMETRY_DS_PROM_INTERVAL=\"#{interval}\"\n") + #Setting array lengths as environment variables for telemetry purposes + file.write("export TELEMETRY_DS_PROM_FIELDPASS_LENGTH=\"#{fieldPass.length}\"\n") + file.write("export TELEMETRY_DS_PROM_FIELDDROP_LENGTH=\"#{fieldDrop.length}\"\n") + file.write("export TELEMETRY_DS_PROM_URLS_LENGTH=#{urls.length}\n") + # Close file after writing all environment variables + file.close + puts "config::Successfully created telemetry file for daemonset" + end + else + puts "config::Typecheck failed for prometheus config settings for daemonset, using defaults" + end # end of type check condition + rescue => errorStr + puts "config::error::Exception while parsing config file for prometheus config for daemonset: #{errorStr}, using defaults" + puts "****************End Prometheus Config Processing********************" + end + end # end of controller type check + end + else + puts "config::error:: Controller undefined while processing prometheus config, using defaults" + end +end + +@configSchemaVersion = ENV["AZMON_AGENT_CFG_SCHEMA_VERSION"] +puts "****************Start Prometheus Config Processing********************" +if !@configSchemaVersion.nil? && !@configSchemaVersion.empty? && @configSchemaVersion.strip.casecmp("v1") == 0 #note v1 is the only supported schema version , so hardcoding it + configMapSettings = parseConfigMap + if !configMapSettings.nil? 
+ populateSettingValuesFromConfigMap(configMapSettings) + end +else + if (File.file?(@promConfigMapMountPath)) + puts "config::unsupported/missing config schema version - '#{@configSchemaVersion}' , using defaults" + else + puts "config::No configmap mounted for prometheus custom config, using defaults" + end +end +puts "****************End Prometheus Config Processing********************" diff --git a/installer/scripts/tomlparser.rb b/installer/scripts/tomlparser.rb index 3e7f48045..c72e64127 100644 --- a/installer/scripts/tomlparser.rb +++ b/installer/scripts/tomlparser.rb @@ -82,7 +82,7 @@ def populateSettingValuesFromConfigMap(parsedConfig) if @collectStderrLogs && !stderrNamespaces.nil? if stderrNamespaces.kind_of?(Array) if !@stdoutExcludeNamespaces.nil? && !@stdoutExcludeNamespaces.empty? - stdoutNamespaces = @stdoutExcludeNamespaces.split(',') + stdoutNamespaces = @stdoutExcludeNamespaces.split(",") end # Checking only for the first element to be string because toml enforces the arrays to contain elements of same type if stderrNamespaces.length > 0 && stderrNamespaces[0].kind_of?(String) @@ -119,47 +119,47 @@ def populateSettingValuesFromConfigMap(parsedConfig) end end - @configSchemaVersion = ENV['AZMON_AGENT_CFG_SCHEMA_VERSION'] - puts "****************Start Config Processing********************" - if !@configSchemaVersion.nil? && !@configSchemaVersion.empty? && @configSchemaVersion.strip.casecmp('v1') == 0 #note v1 is the only supported schema version , so hardcoding it - configMapSettings = parseConfigMap - if !configMapSettings.nil? - populateSettingValuesFromConfigMap(configMapSettings) - end - else - if (File.file?(@configMapMountPath)) - puts "config::unsupported/missing config schema version - '#{@configSchemaVersion}' , using defaults" - end - @excludePath = "*_kube-system_*.log" +@configSchemaVersion = ENV["AZMON_AGENT_CFG_SCHEMA_VERSION"] +puts "****************Start Config Processing********************" +if !@configSchemaVersion.nil? 
&& !@configSchemaVersion.empty? && @configSchemaVersion.strip.casecmp("v1") == 0 #note v1 is the only supported schema version , so hardcoding it + configMapSettings = parseConfigMap + if !configMapSettings.nil? + populateSettingValuesFromConfigMap(configMapSettings) + end +else + if (File.file?(@configMapMountPath)) + puts "config::unsupported/missing config schema version - '#{@configSchemaVersion}' , using defaults" end + @excludePath = "*_kube-system_*.log" +end - # Write the settings to file, so that they can be set as environment variables - file = File.open("config_env_var", "w") +# Write the settings to file, so that they can be set as environment variables +file = File.open("config_env_var", "w") - if !file.nil? - # This will be used in td-agent-bit.conf file to filter out logs - if (!@collectStdoutLogs && !@collectStderrLogs) - #Stop log tailing completely - @logTailPath = "/opt/nolog*.log" - @logExclusionRegexPattern = "stdout|stderr" - elsif !@collectStdoutLogs - @logExclusionRegexPattern = "stdout" - elsif !@collectStderrLogs - @logExclusionRegexPattern = "stderr" - end - file.write("export AZMON_COLLECT_STDOUT_LOGS=#{@collectStdoutLogs}\n") - file.write("export AZMON_LOG_TAIL_PATH=#{@logTailPath}\n") - file.write("export AZMON_LOG_EXCLUSION_REGEX_PATTERN=\"#{@logExclusionRegexPattern}\"\n") - file.write("export AZMON_STDOUT_EXCLUDED_NAMESPACES=#{@stdoutExcludeNamespaces}\n") - file.write("export AZMON_COLLECT_STDERR_LOGS=#{@collectStderrLogs}\n") - file.write("export AZMON_STDERR_EXCLUDED_NAMESPACES=#{@stderrExcludeNamespaces}\n") - file.write("export AZMON_CLUSTER_COLLECT_ENV_VAR=#{@collectClusterEnvVariables}\n") - file.write("export AZMON_CLUSTER_LOG_TAIL_EXCLUDE_PATH=#{@excludePath}\n") - # Close file after writing all environment variables - file.close - puts "Both stdout & stderr log collection are turned off for namespaces: '#{@excludePath}' " - puts "****************End Config Processing********************" - else - puts 
"config::error::Exception while opening file for writing config environment variables" - puts "****************End Config Processing********************" +if !file.nil? + # This will be used in td-agent-bit.conf file to filter out logs + if (!@collectStdoutLogs && !@collectStderrLogs) + #Stop log tailing completely + @logTailPath = "/opt/nolog*.log" + @logExclusionRegexPattern = "stdout|stderr" + elsif !@collectStdoutLogs + @logExclusionRegexPattern = "stdout" + elsif !@collectStderrLogs + @logExclusionRegexPattern = "stderr" end + file.write("export AZMON_COLLECT_STDOUT_LOGS=#{@collectStdoutLogs}\n") + file.write("export AZMON_LOG_TAIL_PATH=#{@logTailPath}\n") + file.write("export AZMON_LOG_EXCLUSION_REGEX_PATTERN=\"#{@logExclusionRegexPattern}\"\n") + file.write("export AZMON_STDOUT_EXCLUDED_NAMESPACES=#{@stdoutExcludeNamespaces}\n") + file.write("export AZMON_COLLECT_STDERR_LOGS=#{@collectStderrLogs}\n") + file.write("export AZMON_STDERR_EXCLUDED_NAMESPACES=#{@stderrExcludeNamespaces}\n") + file.write("export AZMON_CLUSTER_COLLECT_ENV_VAR=#{@collectClusterEnvVariables}\n") + file.write("export AZMON_CLUSTER_LOG_TAIL_EXCLUDE_PATH=#{@excludePath}\n") + # Close file after writing all environment variables + file.close + puts "Both stdout & stderr log collection are turned off for namespaces: '#{@excludePath}' " + puts "****************End Config Processing********************" +else + puts "config::error::Exception while opening file for writing config environment variables" + puts "****************End Config Processing********************" +end diff --git a/source/code/go/src/plugins/oms.go b/source/code/go/src/plugins/oms.go index a79297189..c5ad307d8 100644 --- a/source/code/go/src/plugins/oms.go +++ b/source/code/go/src/plugins/oms.go @@ -34,14 +34,12 @@ const ResourceIdEnv = "AKS_RESOURCE_ID" //env variable which has ResourceName for NON-AKS const ResourceNameEnv = "ACS_RESOURCE_NAME" -// Origin prefix for telegraf Metrics (used as prefix for origin field & 
prefix for azure monitor specific tags) +// Origin prefix for telegraf Metrics (used as prefix for origin field & prefix for azure monitor specific tags and also for custom-metrics telemetry ) const TelegrafMetricOriginPrefix = "container.azm.ms" // Origin suffix for telegraf Metrics (used as suffix for origin field) const TelegrafMetricOriginSuffix = "telegraf" -// Namespace prefix for telegraf Metrics (used as prefix for Namespace field) -//const TelegrafMetricNamespacePrefix = "plugin" // clusterName tag const TelegrafTagClusterName = "clusterName" @@ -193,7 +191,6 @@ func updateContainerImageNameMaps() { if err != nil { message := fmt.Sprintf("Error getting pods %s\nIt is ok to log here and continue, because the logs will be missing image and Name, but the logs will still have the containerID", err.Error()) Log(message) - SendException(message) continue } @@ -384,7 +381,6 @@ func PostTelegrafMetricsToLA(telegrafRecords []map[interface{}]interface{}) int if err != nil { message := fmt.Sprintf("PostTelegrafMetricsToLA::Error:(retriable) when sending %v metrics. duration:%v err:%q \n", len(laMetrics), elapsed, err.Error()) Log(message) - SendException(message) UpdateNumTelegrafMetricsSentTelemetry(0, 1) return output.FLB_RETRY } @@ -519,7 +515,8 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { if err != nil { message := fmt.Sprintf("Error when sending request %s \n", err.Error()) Log(message) - SendException(message) + // Commenting this out for now. 
TODO - Add better telemetry for ods errors using aggregation + //SendException(message) Log("Failed to flush %d records after %s", len(dataItems), elapsed) return output.FLB_RETRY diff --git a/source/code/go/src/plugins/out_oms.go b/source/code/go/src/plugins/out_oms.go index 0fa2ddd4b..e9e7124b7 100644 --- a/source/code/go/src/plugins/out_oms.go +++ b/source/code/go/src/plugins/out_oms.go @@ -64,8 +64,6 @@ func FLBPluginFlush(data unsafe.Pointer, length C.int, tag *C.char) int { return PushToAppInsightsTraces(records, appinsights.Information, incomingTag) } else if strings.Contains(incomingTag, "oms.container.perf.telegraf") { return PostTelegrafMetricsToLA(records) - } else if strings.Contains(incomingTag, "oms.container.log.telegraf.err") { - return PushToAppInsightsTraces(records, appinsights.Error, incomingTag) } return PostDataHelper(records) diff --git a/source/code/plugin/CAdvisorMetricsAPIClient.rb b/source/code/plugin/CAdvisorMetricsAPIClient.rb index b842edb29..ec38bcbb5 100644 --- a/source/code/plugin/CAdvisorMetricsAPIClient.rb +++ b/source/code/plugin/CAdvisorMetricsAPIClient.rb @@ -14,12 +14,31 @@ class CAdvisorMetricsAPIClient require_relative "ApplicationInsightsUtility" @configMapMountPath = "/etc/config/settings/log-data-collection-settings" + @promConfigMountPath = "/etc/config/settings/prometheus-data-collection-settings" @clusterEnvVarCollectionEnabled = ENV["AZMON_CLUSTER_COLLECT_ENV_VAR"] @clusterStdErrLogCollectionEnabled = ENV["AZMON_COLLECT_STDERR_LOGS"] @clusterStdOutLogCollectionEnabled = ENV["AZMON_COLLECT_STDOUT_LOGS"] @clusterLogTailExcludPath = ENV["AZMON_CLUSTER_LOG_TAIL_EXCLUDE_PATH"] @clusterLogTailPath = ENV["AZMON_LOG_TAIL_PATH"] @clusterAgentSchemaVersion = ENV["AZMON_AGENT_CFG_SCHEMA_VERSION"] + + @rsPromInterval = ENV["TELEMETRY_RS_PROM_INTERVAL"] + @dsPromInterval = ENV["TELEMETRY_DS_PROM_INTERVAL"] + + @rsPromFieldPassCount = ENV["TELEMETRY_RS_PROM_FIELDPASS_LENGTH"] + @dsPromFieldPassCount = 
ENV["TELEMETRY_DS_PROM_FIELDPASS_LENGTH"] + + @rsPromFieldDropCount = ENV["TELEMETRY_RS_PROM_FIELDDROP_LENGTH"] + @dsPromFieldDropCount = ENV["TELEMETRY_DS_PROM_FIELDDROP_LENGTH"] + + @rsPromK8sServiceCount = ENV["TELEMETRY_RS_PROM_K8S_SERVICES_LENGTH"] + + @rsPromUrlCount = ENV["TELEMETRY_RS_PROM_URLS_LENGTH"] + @dsPromUrlCount = ENV["TELEMETRY_DS_PROM_URLS_LENGTH"] + + @rsPromMonitorPods = ENV["TELEMETRY_RS_PROM_MONITOR_PODS"] + + @LogPath = "/var/opt/microsoft/docker-cimprov/log/kubernetes_perf_log.txt" @Log = Logger.new(@LogPath, 2, 10 * 1048576) #keep last 2 files, max log file size = 10M # @@rxBytesLast = nil @@ -199,7 +218,7 @@ def getContainerCpuMetricItems(metricJSON, hostName, cpuMetricNameToCollect, met telemetryProps["PodName"] = podName telemetryProps["ContainerName"] = containerName telemetryProps["Computer"] = hostName - #telemetry about custom log collections setting + #telemetry about log collections settings if (File.file?(@configMapMountPath)) telemetryProps["clustercustomsettings"] = true telemetryProps["clusterenvvars"] = @clusterEnvVarCollectionEnabled @@ -209,6 +228,19 @@ def getContainerCpuMetricItems(metricJSON, hostName, cpuMetricNameToCollect, met telemetryProps["clusterLogTailPath"] = @clusterLogTailPath telemetryProps["clusterAgentSchemaVersion"] = @clusterAgentSchemaVersion end + #telemetry about prometheus metric collections settings + if (File.file?(@promConfigMountPath)) + telemetryProps["rsPromInt"] = @rsPromInterval + telemetryProps["dsPromInt"] = @dsPromInterval + telemetryProps["rsPromFPC"] = @rsPromFieldPassCount + telemetryProps["dsPromFPC"] = @dsPromFieldPassCount + telemetryProps["rsPromFDC"] = @rsPromFieldDropCount + telemetryProps["dsPromFDC"] = @dsPromFieldDropCount + telemetryProps["rsPromServ"] = @rsPromK8sServiceCount + telemetryProps["rsPromUrl"] = @rsPromUrlCount + telemetryProps["dsPromUrl"] = @dsPromUrlCount + telemetryProps["rsPromMonPods"] = @rsPromMonitorPods + end 
ApplicationInsightsUtility.sendMetricTelemetry(metricNametoReturn, metricValue, telemetryProps) end end diff --git a/source/code/plugin/DockerApiClient.rb b/source/code/plugin/DockerApiClient.rb index 5a46b5fdb..eb9d74531 100644 --- a/source/code/plugin/DockerApiClient.rb +++ b/source/code/plugin/DockerApiClient.rb @@ -2,179 +2,196 @@ # frozen_string_literal: true class DockerApiClient + require "socket" + require "json" + require "timeout" + require_relative "omslog" + require_relative "DockerApiRestHelper" + require_relative "ApplicationInsightsUtility" - require 'socket' - require 'json' - require 'timeout' - require_relative 'omslog' - require_relative 'DockerApiRestHelper' - require_relative 'ApplicationInsightsUtility' + @@SocketPath = "/var/run/host/docker.sock" + @@ChunkSize = 4096 + @@TimeoutInSeconds = 5 + @@PluginName = "ContainerInventory" - @@SocketPath = "/var/run/host/docker.sock" - @@ChunkSize = 4096 - @@TimeoutInSeconds = 5 - @@PluginName = 'ContainerInventory' + def initialize + end - def initialize - end - - class << self - # Make docker socket call for requests - def getResponse(request, isMultiJson, isVersion) - begin - socket = UNIXSocket.new(@@SocketPath) - dockerResponse = "" - isTimeOut = false - socket.write(request) - # iterate through the response until the last chunk is less than the chunk size so that we can read all data in socket. - loop do - begin - responseChunk = "" - timeout(@@TimeoutInSeconds) do - responseChunk = socket.recv(@@ChunkSize) - end - dockerResponse += responseChunk - rescue Timeout::Error - $log.warn("Socket read timedout for request: #{request} @ #{Time.now.utc.iso8601}") - isTimeOut = true - break - end - break if (isVersion)? (responseChunk.length < @@ChunkSize) : (responseChunk.end_with? "0\r\n\r\n") - end - socket.close - return (isTimeOut)? 
nil : parseResponse(dockerResponse, isMultiJson) - rescue => errorStr - $log.warn("Socket call failed for request: #{request} error: #{errorStr} , isMultiJson: #{isMultiJson} @ #{Time.now.utc.iso8601}") - ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + class << self + # Make docker socket call for requests + def getResponse(request, isMultiJson, isVersion) + begin + socket = UNIXSocket.new(@@SocketPath) + dockerResponse = "" + isTimeOut = false + socket.write(request) + # iterate through the response until the last chunk is less than the chunk size so that we can read all data in socket. + loop do + begin + responseChunk = "" + timeout(@@TimeoutInSeconds) do + responseChunk = socket.recv(@@ChunkSize) end + dockerResponse += responseChunk + rescue Timeout::Error + $log.warn("Socket read timedout for request: #{request} @ #{Time.now.utc.iso8601}") + isTimeOut = true + break + end + break if (isVersion) ? (responseChunk.length < @@ChunkSize) : (responseChunk.end_with? "0\r\n\r\n") end + socket.close + return (isTimeOut) ? nil : parseResponse(dockerResponse, isMultiJson) + rescue => errorStr + $log.warn("Socket call failed for request: #{request} error: #{errorStr} , isMultiJson: #{isMultiJson} @ #{Time.now.utc.iso8601}") + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + end - def parseResponse(dockerResponse, isMultiJson) - # Doing this because the response is in the raw format and includes headers. - # Need to do a regex match to extract the json part of the response - Anything between [{}] in response - parsedJsonResponse = nil - begin - jsonResponse = isMultiJson ? 
dockerResponse[/\[{.+}\]/] : dockerResponse[/{.+}/] - rescue => errorStr - $log.warn("Regex match for docker response failed: #{errorStr} , isMultiJson: #{isMultiJson} @ #{Time.now.utc.iso8601}") - end - begin - if jsonResponse != nil - parsedJsonResponse = JSON.parse(jsonResponse) - end - rescue => errorStr - $log.warn("Json parsing for docker response failed: #{errorStr} , isMultiJson: #{isMultiJson} @ #{Time.now.utc.iso8601}") - ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) - end - return parsedJsonResponse - end + def parseResponse(dockerResponse, isMultiJson) + # Doing this because the response is in the raw format and includes headers. + # Need to do a regex match to extract the json part of the response - Anything between [{}] in response + parsedJsonResponse = nil + begin + jsonResponse = isMultiJson ? dockerResponse[/\[{.+}\]/] : dockerResponse[/{.+}/] + rescue => errorStr + $log.warn("Regex match for docker response failed: #{errorStr} , isMultiJson: #{isMultiJson} @ #{Time.now.utc.iso8601}") + end + begin + if jsonResponse != nil + parsedJsonResponse = JSON.parse(jsonResponse) + end + rescue => errorStr + $log.warn("Json parsing for docker response failed: #{errorStr} , isMultiJson: #{isMultiJson} @ #{Time.now.utc.iso8601}") + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + return parsedJsonResponse + end + def getDockerHostName() + dockerHostName = "" + request = DockerApiRestHelper.restDockerInfo + response = getResponse(request, false, false) + if (response != nil) + dockerHostName = response["Name"] + end + return dockerHostName + end - def getDockerHostName() - dockerHostName = "" - request = DockerApiRestHelper.restDockerInfo - response = getResponse(request, false, false) - if (response != nil) - dockerHostName = response['Name'] + def listContainers() + ids = [] + request = DockerApiRestHelper.restDockerPs + containers = getResponse(request, true, false) + if !containers.nil? && !containers.empty? 
+ containers.each do |container| + labels = (!container["Labels"].nil?) ? container["Labels"] : container["labels"] + if !labels.nil? + labelKeys = labels.keys + dockerTypeLabel = labelKeys.find { |k| "io.kubernetes.docker.type".downcase == k.downcase } + if !dockerTypeLabel.nil? + dockerTypeLabelValue = labels[dockerTypeLabel] + # Checking for 'io.kubernetes.docker.type' label for docker containers to exclude the pause-amd64 containers + if !(dockerTypeLabelValue.downcase == "podsandbox".downcase) + # Case insensitive lookup for pod uid label - This is to exclude containers created using docker run and only include containers that + # are created in the pods for ContainerInventory + keyValue = labelKeys.find { |k| "io.kubernetes.pod.uid".downcase == k.downcase } + if !labels[keyValue].nil? + ids.push(container["Id"]) + end + end end - return dockerHostName + end end + end + return ids + end - def listContainers() - ids = [] - request = DockerApiRestHelper.restDockerPs - containers = getResponse(request, true, false) - if !containers.nil? && !containers.empty? - containers.each do |container| - labels = (!container['Labels'].nil?)? container['Labels'] : container['labels'] - if !labels.nil? - labelKeys = labels.keys - dockerTypeLabel = labelKeys.find {|k| 'io.kubernetes.docker.type'.downcase == k.downcase} - if !dockerTypeLabel.nil? - dockerTypeLabelValue = labels[dockerTypeLabel] - # Checking for 'io.kubernetes.docker.type' label for docker containers to exclude the pause-amd64 containers - if !(dockerTypeLabelValue.downcase == "podsandbox".downcase) - # Case insensitive lookup for pod uid label - This is to exclude containers created using docker run and only include containers that - # are created in the pods for ContainerInventory - keyValue = labelKeys.find {|k| 'io.kubernetes.pod.uid'.downcase == k.downcase} - if !labels[keyValue].nil? 
- ids.push(container['Id']) - end - end - end - end - end - end - return ids + # This method splits the tag value into an array - repository, image, tag, repodigest-imageid + def getImageRepositoryImageTag(tagValue, digestValue) + result = ["", "", "", ""] + atLocation = nil + begin + if !digestValue.empty? + # digest is of the format - repo@sha256:imageid + atLocation = digestValue.index("@") + if !atLocation.nil? + result[3] = digestValue[(atLocation + 1)..-1] + end end - # This method splits the tag value into an array - repository, image and tag - def getImageRepositoryImageTag(tagValue) - result = ["", "", ""] - begin - if !tagValue.empty? - # Find delimiters in the string of format repository/image:imagetag - slashLocation = tagValue.index('/') - colonLocation = tagValue.index(':') - if !colonLocation.nil? - if slashLocation.nil? - # image:imagetag - result[1] = tagValue[0..(colonLocation-1)] - else - # repository/image:imagetag - result[0] = tagValue[0..(slashLocation-1)] - result[1] = tagValue[(slashLocation + 1)..(colonLocation - 1)] - end - result[2] = tagValue[(colonLocation + 1)..-1] - end - end - rescue => errorStr - $log.warn("Exception at getImageRepositoryImageTag: #{errorStr} @ #{Time.now.utc.iso8601}") + if !tagValue.empty? + # Find delimiters in the string of format repository/image:imagetag + slashLocation = tagValue.index("/") + colonLocation = tagValue.index(":") + if !colonLocation.nil? + if slashLocation.nil? + # image:imagetag + result[1] = tagValue[0..(colonLocation - 1)] + else + # repository/image:imagetag + result[0] = tagValue[0..(slashLocation - 1)] + result[1] = tagValue[(slashLocation + 1)..(colonLocation - 1)] end - return result + result[2] = tagValue[(colonLocation + 1)..-1] + end + elsif !digestValue.empty? + # Getting repo information from repodigests when repotags is empty + if !atLocation.nil? 
+ result[0] = digestValue[0..(atLocation - 1)] + end end + rescue => errorStr + $log.warn("Exception at getImageRepositoryImageTag: #{errorStr} @ #{Time.now.utc.iso8601}") + end + return result + end - # Image is in the format repository/image:imagetag - This method creates a hash of image id and repository, image and tag - def getImageIdMap() - result = nil - begin - request = DockerApiRestHelper.restDockerImages - images = getResponse(request, true, false) - if !images.nil? && !images.empty? - result = {} - images.each do |image| - tagValue = "" - tags = image['RepoTags'] - if !tags.nil? && tags.kind_of?(Array) && tags.length > 0 - tagValue = tags[0] - end - idValue = image['Id'] - if !idValue.nil? - result[idValue] = getImageRepositoryImageTag(tagValue) - end - end - end - rescue => errorStr - $log.warn("Exception at getImageIdMap: #{errorStr} @ #{Time.now.utc.iso8601}") + # Image is in the format repository/image:imagetag - This method creates a hash of image id and repository, image and tag + def getImageIdMap() + result = nil + begin + request = DockerApiRestHelper.restDockerImages + images = getResponse(request, true, false) + if !images.nil? && !images.empty? + result = {} + images.each do |image| + tagValue = "" + tags = image["RepoTags"] + if !tags.nil? && tags.kind_of?(Array) && tags.length > 0 + tagValue = tags[0] + end + digestValue = "" + digests = image["RepoDigests"] + if !digests.nil? && digests.kind_of?(Array) && digests.length > 0 + digestValue = digests[0] + end + idValue = image["Id"] + if !idValue.nil? 
+ result[idValue] = getImageRepositoryImageTag(tagValue, digestValue) end - return result + end end + rescue => errorStr + $log.warn("Exception at getImageIdMap: #{errorStr} @ #{Time.now.utc.iso8601}") + end + return result + end - def dockerInspectContainer(id) - request = DockerApiRestHelper.restDockerInspect(id) - return getResponse(request, false, false) - end + def dockerInspectContainer(id) + request = DockerApiRestHelper.restDockerInspect(id) + return getResponse(request, false, false) + end - # This method returns docker version and docker api version for telemetry - def dockerInfo() - request = DockerApiRestHelper.restDockerVersion - response = getResponse(request, false, true) - dockerInfo = {} - if (response != nil) - dockerInfo['Version'] = response['Version'] - dockerInfo['ApiVersion'] = response['ApiVersion'] - end - return dockerInfo - end + # This method returns docker version and docker api version for telemetry + def dockerInfo() + request = DockerApiRestHelper.restDockerVersion + response = getResponse(request, false, true) + dockerInfo = {} + if (response != nil) + dockerInfo["Version"] = response["Version"] + dockerInfo["ApiVersion"] = response["ApiVersion"] + end + return dockerInfo end + end end diff --git a/source/code/plugin/KubernetesApiClient.rb b/source/code/plugin/KubernetesApiClient.rb index 3c6b4f203..58a276cfd 100644 --- a/source/code/plugin/KubernetesApiClient.rb +++ b/source/code/plugin/KubernetesApiClient.rb @@ -57,7 +57,7 @@ def getKubeResourceInfo(resource) rescue => error @Log.warn("kubernetes api request failed: #{error} for #{resource} @ #{Time.now.utc.iso8601}") end - if (response.body.empty?) + if (!response.nil? && !response.body.nil? && response.body.empty?) 
@Log.warn("KubernetesAPIClient::getKubeResourceInfo : Got empty response from Kube API for #{resource} @ #{Time.now.utc.iso8601}") end return response diff --git a/source/code/plugin/in_containerinventory.rb b/source/code/plugin/in_containerinventory.rb index 05e5bc9ea..4392de280 100644 --- a/source/code/plugin/in_containerinventory.rb +++ b/source/code/plugin/in_containerinventory.rb @@ -170,12 +170,13 @@ def inspectContainer(id, nameMap, clusterCollectEnvironmentVar) end imageValue = container["Image"] if !imageValue.nil? && !imageValue.empty? - containerInstance["ImageId"] = imageValue repoImageTagArray = nameMap[imageValue] if nameMap.has_key? imageValue containerInstance["Repository"] = repoImageTagArray[0] containerInstance["Image"] = repoImageTagArray[1] containerInstance["ImageTag"] = repoImageTagArray[2] + # Setting the image id to the id in the remote repository + containerInstance["ImageId"] = repoImageTagArray[3] end end obtainContainerConfig(containerInstance, container, clusterCollectEnvironmentVar) @@ -200,7 +201,7 @@ def enumerate if !containerIds.empty? eventStream = MultiEventStream.new nameMap = DockerApiClient.getImageIdMap - clusterCollectEnvironmentVar = ENV['AZMON_CLUSTER_COLLECT_ENV_VAR'] + clusterCollectEnvironmentVar = ENV["AZMON_CLUSTER_COLLECT_ENV_VAR"] if !clusterCollectEnvironmentVar.nil? && !clusterCollectEnvironmentVar.empty? 
&& clusterCollectEnvironmentVar.casecmp("false") == 0 $log.warn("Environment Variable collection disabled for cluster") end diff --git a/source/code/plugin/in_kube_events.rb b/source/code/plugin/in_kube_events.rb index 309dd8034..3a0e04c67 100644 --- a/source/code/plugin/in_kube_events.rb +++ b/source/code/plugin/in_kube_events.rb @@ -2,27 +2,25 @@ # frozen_string_literal: true module Fluent - class Kube_Event_Input < Input - Plugin.register_input('kubeevents', self) + Plugin.register_input("kubeevents", self) @@KubeEventsStateFile = "/var/opt/microsoft/docker-cimprov/state/KubeEventQueryState.yaml" def initialize super - require 'json' - - require_relative 'KubernetesApiClient' - require_relative 'oms_common' - require_relative 'omslog' - require_relative 'ApplicationInsightsUtility' + require "json" + require_relative "KubernetesApiClient" + require_relative "oms_common" + require_relative "omslog" + require_relative "ApplicationInsightsUtility" end - config_param :run_interval, :time, :default => '1m' + config_param :run_interval, :time, :default => "1m" config_param :tag, :string, :default => "oms.containerinsights.KubeEvents" - def configure (conf) + def configure(conf) super end @@ -46,63 +44,62 @@ def shutdown end def enumerate(eventList = nil) - currentTime = Time.now - emitTime = currentTime.to_f - batchTime = currentTime.utc.iso8601 - if eventList.nil? - $log.info("in_kube_events::enumerate : Getting events from Kube API @ #{Time.now.utc.iso8601}") - events = JSON.parse(KubernetesApiClient.getKubeResourceInfo('events').body) - $log.info("in_kube_events::enumerate : Done getting events from Kube API @ #{Time.now.utc.iso8601}") - else - events = eventList + currentTime = Time.now + emitTime = currentTime.to_f + batchTime = currentTime.utc.iso8601 + if eventList.nil? 
+ $log.info("in_kube_events::enumerate : Getting events from Kube API @ #{Time.now.utc.iso8601}") + events = JSON.parse(KubernetesApiClient.getKubeResourceInfo("events").body) + $log.info("in_kube_events::enumerate : Done getting events from Kube API @ #{Time.now.utc.iso8601}") + else + events = eventList + end + eventQueryState = getEventQueryState + newEventQueryState = [] + begin + if (!events.empty? && !events["items"].nil?) + eventStream = MultiEventStream.new + events["items"].each do |items| + record = {} + # - Not sure if ingestion has the below mapping for this custom type. Fix it as part of fixed type conversion + record["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated + eventId = items["metadata"]["uid"] + "/" + items["count"].to_s + newEventQueryState.push(eventId) + if !eventQueryState.empty? && eventQueryState.include?(eventId) + next + end + record["ObjectKind"] = items["involvedObject"]["kind"] + record["Namespace"] = items["involvedObject"]["namespace"] + record["Name"] = items["involvedObject"]["name"] + record["Reason"] = items["reason"] + record["Message"] = items["message"] + record["Type"] = items["type"] + record["TimeGenerated"] = items["metadata"]["creationTimestamp"] + record["SourceComponent"] = items["source"]["component"] + record["FirstSeen"] = items["firstTimestamp"] + record["LastSeen"] = items["lastTimestamp"] + record["Count"] = items["count"] + if items["source"].key?("host") + record["Computer"] = items["source"]["host"] + else + record["Computer"] = (OMS::Common.get_hostname) + end + record["ClusterName"] = KubernetesApiClient.getClusterName + record["ClusterId"] = KubernetesApiClient.getClusterId + wrapper = { + "DataType" => "KUBE_EVENTS_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [record.each { |k, v| record[k] = v }], + } + eventStream.add(emitTime, wrapper) if wrapper end - eventQueryState = getEventQueryState - newEventQueryState = [] - begin - if(!events.empty?) 
- eventStream = MultiEventStream.new - events['items'].each do |items| - record = {} - # - Not sure if ingestion has the below mapping for this custom type. Fix it as part of fixed type conversion - record['CollectionTime'] = batchTime #This is the time that is mapped to become TimeGenerated - eventId = items['metadata']['uid'] + "/" + items['count'].to_s - newEventQueryState.push(eventId) - if !eventQueryState.empty? && eventQueryState.include?(eventId) - next - end - record['ObjectKind']= items['involvedObject']['kind'] - record['Namespace'] = items['involvedObject']['namespace'] - record['Name'] = items['involvedObject']['name'] - record['Reason'] = items['reason'] - record['Message'] = items['message'] - record['Type'] = items['type'] - record['TimeGenerated'] = items['metadata']['creationTimestamp'] - record['SourceComponent'] = items['source']['component'] - record['FirstSeen'] = items['firstTimestamp'] - record['LastSeen'] = items['lastTimestamp'] - record['Count'] = items['count'] - if items['source'].key?('host') - record['Computer'] = items['source']['host'] - else - record['Computer'] = (OMS::Common.get_hostname) - end - record['ClusterName'] = KubernetesApiClient.getClusterName - record['ClusterId'] = KubernetesApiClient.getClusterId - wrapper = { - "DataType"=>"KUBE_EVENTS_BLOB", - "IPName"=>"ContainerInsights", - "DataItems"=>[record.each{|k,v| record[k]=v}] - } - eventStream.add(emitTime, wrapper) if wrapper - end - router.emit_stream(@tag, eventStream) if eventStream - end - writeEventQueryState(newEventQueryState) - rescue => errorStr - $log.warn line.dump, error: errorStr.to_s - $log.debug_backtrace(errorStr.backtrace) - ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) - end + router.emit_stream(@tag, eventStream) if eventStream + end + writeEventQueryState(newEventQueryState) + rescue => errorStr + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end end def run_periodic @@ -135,7 
+132,7 @@ def getEventQueryState eventQueryState.push(line.chomp) #puts will append newline which needs to be removed end end - rescue => errorStr + rescue => errorStr $log.warn $log.warn line.dump, error: errorStr.to_s $log.debug_backtrace(errorStr.backtrace) ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) @@ -145,20 +142,17 @@ def getEventQueryState def writeEventQueryState(eventQueryState) begin - if(!eventQueryState.nil? && !eventQueryState.empty?) + if (!eventQueryState.nil? && !eventQueryState.empty?) # No need to close file handle (f) due to block scope File.open(@@KubeEventsStateFile, "w") do |f| f.puts(eventQueryState) end end - rescue => errorStr + rescue => errorStr $log.warn $log.warn line.dump, error: errorStr.to_s $log.debug_backtrace(errorStr.backtrace) ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) end end - end # Kube_Event_Input - end # module - diff --git a/source/code/plugin/in_kube_nodes.rb b/source/code/plugin/in_kube_nodes.rb index aabda441e..0310fa419 100644 --- a/source/code/plugin/in_kube_nodes.rb +++ b/source/code/plugin/in_kube_nodes.rb @@ -58,81 +58,83 @@ def enumerate if (!nodeInventory.empty?) eventStream = MultiEventStream.new containerNodeInventoryEventStream = MultiEventStream.new - #get node inventory - nodeInventory["items"].each do |items| - record = {} - # Sending records for ContainerNodeInventory - containerNodeInventoryRecord = {} - containerNodeInventoryRecord["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated - containerNodeInventoryRecord["Computer"] = items["metadata"]["name"] + if !nodeInventory["items"].nil? 
+ #get node inventory + nodeInventory["items"].each do |items| + record = {} + # Sending records for ContainerNodeInventory + containerNodeInventoryRecord = {} + containerNodeInventoryRecord["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated + containerNodeInventoryRecord["Computer"] = items["metadata"]["name"] - record["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated - record["Computer"] = items["metadata"]["name"] - record["ClusterName"] = KubernetesApiClient.getClusterName - record["ClusterId"] = KubernetesApiClient.getClusterId - record["CreationTimeStamp"] = items["metadata"]["creationTimestamp"] - record["Labels"] = [items["metadata"]["labels"]] - record["Status"] = "" + record["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated + record["Computer"] = items["metadata"]["name"] + record["ClusterName"] = KubernetesApiClient.getClusterName + record["ClusterId"] = KubernetesApiClient.getClusterId + record["CreationTimeStamp"] = items["metadata"]["creationTimestamp"] + record["Labels"] = [items["metadata"]["labels"]] + record["Status"] = "" - # Refer to https://kubernetes.io/docs/concepts/architecture/nodes/#condition for possible node conditions. - # We check the status of each condition e.g. {"type": "OutOfDisk","status": "False"} . Based on this we - # populate the KubeNodeInventory Status field. A possible value for this field could be "Ready OutofDisk" - # implying that the node is ready for hosting pods, however its out of disk. + # Refer to https://kubernetes.io/docs/concepts/architecture/nodes/#condition for possible node conditions. + # We check the status of each condition e.g. {"type": "OutOfDisk","status": "False"} . Based on this we + # populate the KubeNodeInventory Status field. A possible value for this field could be "Ready OutofDisk" + # implying that the node is ready for hosting pods, however its out of disk. 
- if items["status"].key?("conditions") && !items["status"]["conditions"].empty? - allNodeConditions = "" - items["status"]["conditions"].each do |condition| - if condition["status"] == "True" - if !allNodeConditions.empty? - allNodeConditions = allNodeConditions + "," + condition["type"] - else - allNodeConditions = condition["type"] + if items["status"].key?("conditions") && !items["status"]["conditions"].empty? + allNodeConditions = "" + items["status"]["conditions"].each do |condition| + if condition["status"] == "True" + if !allNodeConditions.empty? + allNodeConditions = allNodeConditions + "," + condition["type"] + else + allNodeConditions = condition["type"] + end + end + #collect last transition to/from ready (no matter ready is true/false) + if condition["type"] == "Ready" && !condition["lastTransitionTime"].nil? + record["LastTransitionTimeReady"] = condition["lastTransitionTime"] end end - #collect last transition to/from ready (no matter ready is true/false) - if condition["type"] == "Ready" && !condition["lastTransitionTime"].nil? - record["LastTransitionTimeReady"] = condition["lastTransitionTime"] + if !allNodeConditions.empty? + record["Status"] = allNodeConditions end end - if !allNodeConditions.empty? - record["Status"] = allNodeConditions - end - end - nodeInfo = items["status"]["nodeInfo"] - record["KubeletVersion"] = nodeInfo["kubeletVersion"] - record["KubeProxyVersion"] = nodeInfo["kubeProxyVersion"] - containerNodeInventoryRecord["OperatingSystem"] = nodeInfo["osImage"] - dockerVersion = nodeInfo["containerRuntimeVersion"] - dockerVersion.slice! "docker://" - containerNodeInventoryRecord["DockerVersion"] = dockerVersion - # ContainerNodeInventory data for docker version and operating system. 
- containerNodeInventoryWrapper = { - "DataType" => "CONTAINER_NODE_INVENTORY_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [containerNodeInventoryRecord.each { |k, v| containerNodeInventoryRecord[k] = v }], - } - containerNodeInventoryEventStream.add(emitTime, containerNodeInventoryWrapper) if containerNodeInventoryWrapper + nodeInfo = items["status"]["nodeInfo"] + record["KubeletVersion"] = nodeInfo["kubeletVersion"] + record["KubeProxyVersion"] = nodeInfo["kubeProxyVersion"] + containerNodeInventoryRecord["OperatingSystem"] = nodeInfo["osImage"] + dockerVersion = nodeInfo["containerRuntimeVersion"] + dockerVersion.slice! "docker://" + containerNodeInventoryRecord["DockerVersion"] = dockerVersion + # ContainerNodeInventory data for docker version and operating system. + containerNodeInventoryWrapper = { + "DataType" => "CONTAINER_NODE_INVENTORY_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [containerNodeInventoryRecord.each { |k, v| containerNodeInventoryRecord[k] = v }], + } + containerNodeInventoryEventStream.add(emitTime, containerNodeInventoryWrapper) if containerNodeInventoryWrapper - wrapper = { - "DataType" => "KUBE_NODE_INVENTORY_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [record.each { |k, v| record[k] = v }], - } - eventStream.add(emitTime, wrapper) if wrapper - # Adding telemetry to send node telemetry every 5 minutes - timeDifference = (DateTime.now.to_time.to_i - @@nodeTelemetryTimeTracker).abs - timeDifferenceInMinutes = timeDifference / 60 - if (timeDifferenceInMinutes >= 5) - properties = {} - properties["Computer"] = record["Computer"] - properties["KubeletVersion"] = record["KubeletVersion"] - properties["OperatingSystem"] = nodeInfo["operatingSystem"] - properties["DockerVersion"] = dockerVersion - capacityInfo = items["status"]["capacity"] - ApplicationInsightsUtility.sendMetricTelemetry("NodeCoreCapacity", capacityInfo["cpu"], properties) - ApplicationInsightsUtility.sendMetricTelemetry("NodeMemory", 
capacityInfo["memory"], properties) - telemetrySent = true + wrapper = { + "DataType" => "KUBE_NODE_INVENTORY_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [record.each { |k, v| record[k] = v }], + } + eventStream.add(emitTime, wrapper) if wrapper + # Adding telemetry to send node telemetry every 5 minutes + timeDifference = (DateTime.now.to_time.to_i - @@nodeTelemetryTimeTracker).abs + timeDifferenceInMinutes = timeDifference / 60 + if (timeDifferenceInMinutes >= 5) + properties = {} + properties["Computer"] = record["Computer"] + properties["KubeletVersion"] = record["KubeletVersion"] + properties["OperatingSystem"] = nodeInfo["operatingSystem"] + properties["DockerVersion"] = dockerVersion + capacityInfo = items["status"]["capacity"] + ApplicationInsightsUtility.sendMetricTelemetry("NodeCoreCapacity", capacityInfo["cpu"], properties) + ApplicationInsightsUtility.sendMetricTelemetry("NodeMemory", capacityInfo["memory"], properties) + telemetrySent = true + end end end router.emit_stream(@tag, eventStream) if eventStream diff --git a/source/code/plugin/in_kube_podinventory.rb b/source/code/plugin/in_kube_podinventory.rb index 79490ba7d..d0056fb14 100644 --- a/source/code/plugin/in_kube_podinventory.rb +++ b/source/code/plugin/in_kube_podinventory.rb @@ -152,8 +152,10 @@ def getContainerEnvironmentVariables(pod, clusterCollectEnvironmentVar) containerEnvArray.each do |envVarHash| envName = envVarHash["name"] envValue = envVarHash["value"] - envArrayElement = envName + "=" + envValue - envVarsArray.push(envArrayElement) + if !envName.nil? && !envValue.nil? + envArrayElement = envName + "=" + envValue + envVarsArray.push(envArrayElement) + end end end # Skip environment variable processing if it contains the flag AZMON_COLLECT_ENV=FALSE @@ -201,7 +203,11 @@ def parse_and_emit_records(podInventory, serviceList) # instead of the actual poduid. Since this uid is not being surface into the UX # its ok to use this. 
# Use kubernetes.io/config.hash to be able to correlate with cadvisor data - podUid = items["metadata"]["annotations"]["kubernetes.io/config.hash"] + if items["metadata"]["annotations"].nil? + next + else + podUid = items["metadata"]["annotations"]["kubernetes.io/config.hash"] + end else podUid = items["metadata"]["uid"] end @@ -287,7 +293,11 @@ def parse_and_emit_records(podInventory, serviceList) record["ContainerID"] = "" end #keeping this as which is same as InstanceName in perf table - record["ContainerName"] = podUid + "/" + container["name"] + if podUid.nil? || container["name"].nil? + next + else + record["ContainerName"] = podUid + "/" + container["name"] + end #Pod restart count is a sumtotal of restart counts of individual containers #within the pod. The restart count of a container is maintained by kubernetes #itself in the form of a container label. diff --git a/source/code/plugin/in_kube_services.rb b/source/code/plugin/in_kube_services.rb index e1bb93f30..8b0a013e4 100644 --- a/source/code/plugin/in_kube_services.rb +++ b/source/code/plugin/in_kube_services.rb @@ -2,108 +2,101 @@ # frozen_string_literal: true module Fluent - - class Kube_Services_Input < Input - Plugin.register_input('kubeservices', self) - - def initialize - super - require 'yaml' - require 'json' - - require_relative 'KubernetesApiClient' - require_relative 'oms_common' - require_relative 'omslog' - require_relative 'ApplicationInsightsUtility' + class Kube_Services_Input < Input + Plugin.register_input("kubeservices", self) - end - - config_param :run_interval, :time, :default => '1m' - config_param :tag, :string, :default => "oms.containerinsights.KubeServices" - - def configure (conf) - super - end - - def start - if @run_interval - @finished = false - @condition = ConditionVariable.new - @mutex = Mutex.new - @thread = Thread.new(&method(:run_periodic)) - end - end - - def shutdown - if @run_interval - @mutex.synchronize { - @finished = true - @condition.signal + def initialize 
+ super + require "yaml" + require "json" + + require_relative "KubernetesApiClient" + require_relative "oms_common" + require_relative "omslog" + require_relative "ApplicationInsightsUtility" + end + + config_param :run_interval, :time, :default => "1m" + config_param :tag, :string, :default => "oms.containerinsights.KubeServices" + + def configure(conf) + super + end + + def start + if @run_interval + @finished = false + @condition = ConditionVariable.new + @mutex = Mutex.new + @thread = Thread.new(&method(:run_periodic)) + end + end + + def shutdown + if @run_interval + @mutex.synchronize { + @finished = true + @condition.signal + } + @thread.join + end + end + + def enumerate + currentTime = Time.now + emitTime = currentTime.to_f + batchTime = currentTime.utc.iso8601 + $log.info("in_kube_services::enumerate : Getting services from Kube API @ #{Time.now.utc.iso8601}") + serviceList = JSON.parse(KubernetesApiClient.getKubeResourceInfo("services").body) + $log.info("in_kube_services::enumerate : Done getting services from Kube API @ #{Time.now.utc.iso8601}") + begin + if (!serviceList.empty?) 
+ eventStream = MultiEventStream.new + serviceList["items"].each do |items| + record = {} + record["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated + record["ServiceName"] = items["metadata"]["name"] + record["Namespace"] = items["metadata"]["namespace"] + record["SelectorLabels"] = [items["spec"]["selector"]] + record["ClusterId"] = KubernetesApiClient.getClusterId + record["ClusterName"] = KubernetesApiClient.getClusterName + record["ClusterIP"] = items["spec"]["clusterIP"] + record["ServiceType"] = items["spec"]["type"] + # : Add ports and status fields + wrapper = { + "DataType" => "KUBE_SERVICES_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [record.each { |k, v| record[k] = v }], } - @thread.join + eventStream.add(emitTime, wrapper) if wrapper end + router.emit_stream(@tag, eventStream) if eventStream end - - def enumerate - currentTime = Time.now - emitTime = currentTime.to_f - batchTime = currentTime.utc.iso8601 - $log.info("in_kube_services::enumerate : Getting services from Kube API @ #{Time.now.utc.iso8601}") - serviceList = JSON.parse(KubernetesApiClient.getKubeResourceInfo('services').body) - $log.info("in_kube_services::enumerate : Done getting services from Kube API @ #{Time.now.utc.iso8601}") - begin - if(!serviceList.empty?) 
- eventStream = MultiEventStream.new - serviceList['items'].each do |items| - record = {} - record['CollectionTime'] = batchTime #This is the time that is mapped to become TimeGenerated - record['ServiceName'] = items['metadata']['name'] - record['Namespace'] = items['metadata']['namespace'] - record['SelectorLabels'] = [items['spec']['selector']] - record['ClusterId'] = KubernetesApiClient.getClusterId - record['ClusterName'] = KubernetesApiClient.getClusterName - record['ClusterIP'] = items['spec']['clusterIP'] - record['ServiceType'] = items['spec']['type'] - # : Add ports and status fields - wrapper = { - "DataType"=>"KUBE_SERVICES_BLOB", - "IPName"=>"ContainerInsights", - "DataItems"=>[record.each{|k,v| record[k]=v}] - } - eventStream.add(emitTime, wrapper) if wrapper - end - router.emit_stream(@tag, eventStream) if eventStream - end - rescue => errorStr - $log.warn line.dump, error: errorStr.to_s - $log.debug_backtrace(e.backtrace) - ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) - end - end - - def run_periodic - @mutex.lock - done = @finished - until done - @condition.wait(@mutex, @run_interval) - done = @finished - @mutex.unlock - if !done - begin - $log.info("in_kube_services::run_periodic @ #{Time.now.utc.iso8601}") - enumerate - rescue => errorStr - $log.warn "in_kube_services::run_periodic: enumerate Failed to kube services: #{errorStr}" - ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) - end - end - @mutex.lock + rescue => errorStr + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + end + + def run_periodic + @mutex.lock + done = @finished + until done + @condition.wait(@mutex, @run_interval) + done = @finished + @mutex.unlock + if !done + begin + $log.info("in_kube_services::run_periodic @ #{Time.now.utc.iso8601}") + enumerate + rescue => errorStr + $log.warn "in_kube_services::run_periodic: enumerate Failed to kube services: #{errorStr}" + 
ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) end - @mutex.unlock end - - end # Kube_Services_Input - - end # module - - \ No newline at end of file + @mutex.lock + end + @mutex.unlock + end + end # Kube_Services_Input +end # module From 4b8708b13c20060794a3ed47262e2383ac56a7f9 Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Wed, 10 Jul 2019 10:44:43 -0700 Subject: [PATCH 105/160] Fix Region space error (#239) * Trim spaces in AKS_REGION This is not an issue for normal AKS Monitoring Addon Onboarding. ONLY an issue for backdoor onboarding * Fix out_mdm parsing error --- source/code/plugin/out_mdm.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/code/plugin/out_mdm.rb b/source/code/plugin/out_mdm.rb index a81da0fbc..69ef25580 100644 --- a/source/code/plugin/out_mdm.rb +++ b/source/code/plugin/out_mdm.rb @@ -69,7 +69,7 @@ def start aks_region = aks_region.gsub(" ","") - @@post_request_url = @@post_request_url_template % {aks_region: aks_region), aks_resource_id: aks_resource_id} + @@post_request_url = @@post_request_url_template % {aks_region: aks_region, aks_resource_id: aks_resource_id} @post_request_uri = URI.parse(@@post_request_url) @http_client = Net::HTTP.new(@post_request_uri.host, @post_request_uri.port) @http_client.use_ssl = true From 1cd9eee6027fb6c2f131336800caa595f4bbedf0 Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Wed, 10 Jul 2019 13:45:04 -0700 Subject: [PATCH 106/160] Removing buffer chunk size and buffer max size from fluentbit conf (#240) * hard code config for UST CCP team * fix config * fix config after discussion * fix error log to get errros * fix config * update config * Add telemetry * Rashmi/promcustomconfig (#231) * changes * formatting changes * changes * changes * changes * changes * changes * changes * changes * changes * adding telemetry * changes * changes * changes * changes * changes * changes * changes * cahnges * changes * Rashmi/promcustomconfig (#236) * changes * 
changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * fix exceptions * changes to remove some exceptions * exception fixes * changes * changes for poduid nil check * removing buffer chunk size and buffer max size from fluentbit conf --- installer/conf/td-agent-bit.conf | 2 -- 1 file changed, 2 deletions(-) diff --git a/installer/conf/td-agent-bit.conf b/installer/conf/td-agent-bit.conf index e7aabd242..ab79710c7 100644 --- a/installer/conf/td-agent-bit.conf +++ b/installer/conf/td-agent-bit.conf @@ -12,8 +12,6 @@ DB.Sync Off Parser docker Mem_Buf_Limit 10m - Buffer_Chunk_Size 1m - Buffer_Max_Size 1m Rotate_Wait 20 Refresh_Interval 30 Path_Key filepath From 788ab8bfb5eede90578ad1655883692cf211b349 Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Wed, 10 Jul 2019 17:44:53 -0700 Subject: [PATCH 107/160] changes (#243) --- source/code/plugin/KubernetesApiClient.rb | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/source/code/plugin/KubernetesApiClient.rb b/source/code/plugin/KubernetesApiClient.rb index 58a276cfd..4cbf8bb40 100644 --- a/source/code/plugin/KubernetesApiClient.rb +++ b/source/code/plugin/KubernetesApiClient.rb @@ -333,7 +333,11 @@ def getContainerResourceRequestsAndLimits(metricJSON, metricCategory, metricName # instead of the actual poduid. Since this uid is not being surface into the UX # its ok to use this. # Use kubernetes.io/config.hash to be able to correlate with cadvisor data - podUid = pod["metadata"]["annotations"]["kubernetes.io/config.hash"] + if pod["metadata"]["annotations"].nil? 
+ next + else + podUid = pod["metadata"]["annotations"]["kubernetes.io/config.hash"] + end else podUid = pod["metadata"]["uid"] end From 5ee482b09dfd7311ce2e3f164788d6b13919fe8a Mon Sep 17 00:00:00 2001 From: David Michelman Date: Mon, 15 Jul 2019 11:01:54 -0700 Subject: [PATCH 108/160] Collect container last state (#235) * updating the OMS agent to also collect container last state * changed a comment * git surrounded ContainerLastStatus code in a begin/rescue block * added a lot of error checking and logging --- source/code/plugin/in_kube_podinventory.rb | 32 ++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/source/code/plugin/in_kube_podinventory.rb b/source/code/plugin/in_kube_podinventory.rb index d0056fb14..9991c13e3 100644 --- a/source/code/plugin/in_kube_podinventory.rb +++ b/source/code/plugin/in_kube_podinventory.rb @@ -327,6 +327,38 @@ def parse_and_emit_records(podInventory, serviceList) record["ContainerStatusReason"] = containerStatus[containerStatus.keys[0]]["reason"] end end + + # Record the last state of the container. This may have information on why a container was killed. + begin + if !container["lastState"].nil? && container["lastState"].keys.length == 1 + lastStateName = container["lastState"].keys[0] + lastStateObject = container["lastState"][lastStateName] + if !lastStateObject.is_a?(Hash) + raise "expected a hash object. 
This could signify a bug or a kubernetes API change" + end + + if lastStateObject.key?("reason") && lastStateObject.key?("startedAt") && lastStateObject.key?("finishedAt") + newRecord = Hash.new + newRecord["lastState"] = lastStateName # get the name of the last state (ex: terminated) + newRecord["reason"] = lastStateObject["reason"] # (ex: OOMKilled) + newRecord["startedAt"] = lastStateObject["startedAt"] # (ex: 2019-07-02T14:58:51Z) + newRecord["finishedAt"] = lastStateObject["finishedAt"] # (ex: 2019-07-02T14:58:52Z) + + # only write to the output field if everything previously ran without error + record["ContainerLastStatus"] = newRecord + else + record["ContainerLastStatus"] = Hash.new + end + else + record["ContainerLastStatus"] = Hash.new + end + rescue => errorStr + $log.warn "Failed in parse_and_emit_record pod inventory while processing ContainerLastStatus: #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + record["ContainerLastStatus"] = Hash.new + end + podRestartCount += containerRestartCount records.push(record.dup) From 378cc93a1307227cd154f08d6dabe7f6e6bec9fd Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Mon, 12 Aug 2019 11:36:09 -0700 Subject: [PATCH 109/160] Rashmi/fix prom telemetry (#247) * fix prom telemetry * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes --- installer/conf/td-agent-bit.conf | 5 +- installer/datafiles/base_container.data | 1 + .../scripts/td-agent-bit-conf-customizer.rb | 47 +++++++++++++++++++ .../code/plugin/CAdvisorMetricsAPIClient.rb | 34 ++++---------- source/code/plugin/KubernetesApiClient.rb | 6 +++ source/code/plugin/in_kube_nodes.rb | 23 ++++++++- source/code/plugin/in_kube_podinventory.rb | 15 ++++-- 7 files changed, 101 insertions(+), 30 deletions(-) create mode 100644 installer/scripts/td-agent-bit-conf-customizer.rb diff --git 
a/installer/conf/td-agent-bit.conf b/installer/conf/td-agent-bit.conf index ab79710c7..4e3de6c46 100644 --- a/installer/conf/td-agent-bit.conf +++ b/installer/conf/td-agent-bit.conf @@ -1,5 +1,6 @@ [SERVICE] - Flush 15 + #Default service flush interval is 15 seconds + ${SERVICE_FLUSH_INTERVAL} Log_Level info Parsers_File /etc/td-agent-bit/parsers.conf Log_File /var/opt/microsoft/docker-cimprov/log/fluent-bit.log @@ -12,6 +13,8 @@ DB.Sync Off Parser docker Mem_Buf_Limit 10m + ${TAIL_BUFFER_CHUNK_SIZE} + ${TAIL_BUFFER_MAX_SIZE} Rotate_Wait 20 Refresh_Interval 30 Path_Key filepath diff --git a/installer/datafiles/base_container.data b/installer/datafiles/base_container.data index fe1635335..62a6f6885 100644 --- a/installer/datafiles/base_container.data +++ b/installer/datafiles/base_container.data @@ -114,6 +114,7 @@ MAINTAINER: 'Microsoft Corporation' /opt/livenessprobe.sh; installer/scripts/livenessprobe.sh; 755; root; root /opt/tomlparser.rb; installer/scripts/tomlparser.rb; 755; root; root /opt/tomlparser-prom-customconfig.rb; installer/scripts/tomlparser-prom-customconfig.rb; 755; root; root +/opt/td-agent-bit-conf-customizer.rb; installer/scripts/td-agent-bit-conf-customizer.rb; 755; root; root %Links /opt/omi/lib/libcontainer.${{SHLIB_EXT}}; /opt/microsoft/docker-cimprov/lib/libcontainer.${{SHLIB_EXT}}; 644; root; root diff --git a/installer/scripts/td-agent-bit-conf-customizer.rb b/installer/scripts/td-agent-bit-conf-customizer.rb new file mode 100644 index 000000000..1e62e3cc2 --- /dev/null +++ b/installer/scripts/td-agent-bit-conf-customizer.rb @@ -0,0 +1,47 @@ +#!/usr/local/bin/ruby + +@td_agent_bit_conf_path = "/etc/opt/microsoft/docker-cimprov/td-agent-bit.conf" + +@default_service_interval = "15" + +def is_number?(value) + true if Integer(value) rescue false +end + +def substituteFluentBitPlaceHolders + begin + # Replace the fluentbit config file with custom values if present + puts "config::Starting to substitute the placeholders in td-agent-bit.conf 
file for log collection" + + interval = ENV["FBIT_SERVICE_FLUSH_INTERVAL"] + bufferChunkSize = ENV["FBIT_TAIL_BUFFER_CHUNK_SIZE"] + bufferMaxSize = ENV["FBIT_TAIL_BUFFER_MAX_SIZE"] + + serviceInterval = (!interval.nil? && is_number?(interval)) ? interval : @default_service_interval + serviceIntervalSetting = "Flush " + serviceInterval + + tailBufferChunkSize = (!bufferChunkSize.nil? && is_number?(bufferChunkSize)) ? bufferChunkSize : nil + + tailBufferMaxSize = (!bufferMaxSize.nil? && is_number?(bufferMaxSize)) ? bufferMaxSize : nil + + text = File.read(@td_agent_bit_conf_path) + new_contents = text.gsub("${SERVICE_FLUSH_INTERVAL}", serviceIntervalSetting) + if !tailBufferChunkSize.nil? + new_contents = new_contents.gsub("${TAIL_BUFFER_CHUNK_SIZE}", "Buffer_Chunk_Size " + tailBufferChunkSize + "m") + else + new_contents = new_contents.gsub("\n ${TAIL_BUFFER_CHUNK_SIZE}\n", "\n") + end + if !tailBufferMaxSize.nil? + new_contents = new_contents.gsub("${TAIL_BUFFER_MAX_SIZE}", "Buffer_Max_Size " + tailBufferMaxSize + "m") + else + new_contents = new_contents.gsub("\n ${TAIL_BUFFER_MAX_SIZE}\n", "\n") + end + + File.open(@td_agent_bit_conf_path, "w") { |file| file.puts new_contents } + puts "config::Successfully substituted the placeholders in td-agent-bit.conf file" + rescue => errorStr + puts "td-agent-bit-config-customizer: error while substituting values: #{errorStr}" + end +end + +substituteFluentBitPlaceHolders diff --git a/source/code/plugin/CAdvisorMetricsAPIClient.rb b/source/code/plugin/CAdvisorMetricsAPIClient.rb index ec38bcbb5..09499b4cf 100644 --- a/source/code/plugin/CAdvisorMetricsAPIClient.rb +++ b/source/code/plugin/CAdvisorMetricsAPIClient.rb @@ -22,23 +22,11 @@ class CAdvisorMetricsAPIClient @clusterLogTailPath = ENV["AZMON_LOG_TAIL_PATH"] @clusterAgentSchemaVersion = ENV["AZMON_AGENT_CFG_SCHEMA_VERSION"] - @rsPromInterval = ENV["TELEMETRY_RS_PROM_INTERVAL"] @dsPromInterval = ENV["TELEMETRY_DS_PROM_INTERVAL"] - - @rsPromFieldPassCount = 
ENV["TELEMETRY_RS_PROM_FIELDPASS_LENGTH"] @dsPromFieldPassCount = ENV["TELEMETRY_DS_PROM_FIELDPASS_LENGTH"] - - @rsPromFieldDropCount = ENV["TELEMETRY_RS_PROM_FIELDDROP_LENGTH"] @dsPromFieldDropCount = ENV["TELEMETRY_DS_PROM_FIELDDROP_LENGTH"] - - @rsPromK8sServiceCount = ENV["TELEMETRY_RS_PROM_K8S_SERVICES_LENGTH"] - - @rsPromUrlCount = ENV["TELEMETRY_RS_PROM_URLS_LENGTH"] @dsPromUrlCount = ENV["TELEMETRY_DS_PROM_URLS_LENGTH"] - @rsPromMonitorPods = ENV["TELEMETRY_RS_PROM_MONITOR_PODS"] - - @LogPath = "/var/opt/microsoft/docker-cimprov/log/kubernetes_perf_log.txt" @Log = Logger.new(@LogPath, 2, 10 * 1048576) #keep last 2 files, max log file size = 10M # @@rxBytesLast = nil @@ -118,17 +106,21 @@ def getCAdvisorUri(winNode) def getMetrics(winNode = nil) metricDataItems = [] begin + cAdvisorStats = getSummaryStatsFromCAdvisor(winNode) + if !cAdvisorStats.nil? + metricInfo = JSON.parse(cAdvisorStats.body) + end if !winNode.nil? hostName = winNode["Hostname"] operatingSystem = "Windows" else - hostName = (OMS::Common.get_hostname) + if !metricInfo.nil? && !metricInfo["node"].nil? && !metricInfo["node"]["nodeName"].nil? + hostName = metricInfo["node"]["nodeName"] + else + hostName = (OMS::Common.get_hostname) + end operatingSystem = "Linux" end - cAdvisorStats = getSummaryStatsFromCAdvisor(winNode) - if !cAdvisorStats.nil? - metricInfo = JSON.parse(cAdvisorStats.body) - end if !metricInfo.nil? 
metricDataItems.concat(getContainerMemoryMetricItems(metricInfo, hostName, "workingSetBytes", "memoryWorkingSetBytes")) metricDataItems.concat(getContainerStartTimeMetricItems(metricInfo, hostName, "restartTimeEpoch")) @@ -228,18 +220,12 @@ def getContainerCpuMetricItems(metricJSON, hostName, cpuMetricNameToCollect, met telemetryProps["clusterLogTailPath"] = @clusterLogTailPath telemetryProps["clusterAgentSchemaVersion"] = @clusterAgentSchemaVersion end - #telemetry about prometheus metric collections settings + #telemetry about prometheus metric collections settings for daemonset if (File.file?(@promConfigMountPath)) - telemetryProps["rsPromInt"] = @rsPromInterval telemetryProps["dsPromInt"] = @dsPromInterval - telemetryProps["rsPromFPC"] = @rsPromFieldPassCount telemetryProps["dsPromFPC"] = @dsPromFieldPassCount - telemetryProps["rsPromFDC"] = @rsPromFieldDropCount telemetryProps["dsPromFDC"] = @dsPromFieldDropCount - telemetryProps["rsPromServ"] = @rsPromK8sServiceCount - telemetryProps["rsPromUrl"] = @rsPromUrlCount telemetryProps["dsPromUrl"] = @dsPromUrlCount - telemetryProps["rsPromMonPods"] = @rsPromMonitorPods end ApplicationInsightsUtility.sendMetricTelemetry(metricNametoReturn, metricValue, telemetryProps) end diff --git a/source/code/plugin/KubernetesApiClient.rb b/source/code/plugin/KubernetesApiClient.rb index 4cbf8bb40..61cbaea00 100644 --- a/source/code/plugin/KubernetesApiClient.rb +++ b/source/code/plugin/KubernetesApiClient.rb @@ -355,6 +355,8 @@ def getContainerResourceRequestsAndLimits(metricJSON, metricCategory, metricName metricProps = {} metricProps["Timestamp"] = metricTime metricProps["Host"] = nodeName + # Adding this so that it is not set by base omsagent since it was not set earlier and being set by base omsagent + metricProps["Computer"] = nodeName metricProps["ObjectName"] = "K8SContainer" metricProps["InstanceName"] = clusterId + "/" + podUid + "/" + containerName @@ -378,6 +380,8 @@ def 
getContainerResourceRequestsAndLimits(metricJSON, metricCategory, metricName metricProps = {} metricProps["Timestamp"] = metricTime metricProps["Host"] = nodeName + # Adding this so that it is not set by base omsagent since it was not set earlier and being set by base omsagent + metricProps["Computer"] = nodeName metricProps["ObjectName"] = "K8SContainer" metricProps["InstanceName"] = clusterId + "/" + podUid + "/" + containerName @@ -420,6 +424,8 @@ def parseNodeLimits(metricJSON, metricCategory, metricNameToCollect, metricNamet metricProps = {} metricProps["Timestamp"] = metricTime metricProps["Host"] = node["metadata"]["name"] + # Adding this so that it is not set by base omsagent since it was not set earlier and being set by base omsagent + metricProps["Computer"] = node["metadata"]["name"] metricProps["ObjectName"] = "K8SNode" metricProps["InstanceName"] = clusterId + "/" + node["metadata"]["name"] metricProps["Collections"] = [] diff --git a/source/code/plugin/in_kube_nodes.rb b/source/code/plugin/in_kube_nodes.rb index 0310fa419..24ab51d4c 100644 --- a/source/code/plugin/in_kube_nodes.rb +++ b/source/code/plugin/in_kube_nodes.rb @@ -7,6 +7,14 @@ class Kube_nodeInventory_Input < Input @@ContainerNodeInventoryTag = "oms.containerinsights.ContainerNodeInventory" @@MDMKubeNodeInventoryTag = "mdm.kubenodeinventory" + @@promConfigMountPath = "/etc/config/settings/prometheus-data-collection-settings" + + @@rsPromInterval = ENV["TELEMETRY_RS_PROM_INTERVAL"] + @@rsPromFieldPassCount = ENV["TELEMETRY_RS_PROM_FIELDPASS_LENGTH"] + @@rsPromFieldDropCount = ENV["TELEMETRY_RS_PROM_FIELDDROP_LENGTH"] + @@rsPromK8sServiceCount = ENV["TELEMETRY_RS_PROM_K8S_SERVICES_LENGTH"] + @@rsPromUrlCount = ENV["TELEMETRY_RS_PROM_URLS_LENGTH"] + @@rsPromMonitorPods = ENV["TELEMETRY_RS_PROM_MONITOR_PODS"] def initialize super @@ -124,15 +132,26 @@ def enumerate # Adding telemetry to send node telemetry every 5 minutes timeDifference = (DateTime.now.to_time.to_i - 
@@nodeTelemetryTimeTracker).abs timeDifferenceInMinutes = timeDifference / 60 - if (timeDifferenceInMinutes >= 5) + if (timeDifferenceInMinutes >= 10) properties = {} properties["Computer"] = record["Computer"] properties["KubeletVersion"] = record["KubeletVersion"] properties["OperatingSystem"] = nodeInfo["operatingSystem"] properties["DockerVersion"] = dockerVersion + capacityInfo = items["status"]["capacity"] - ApplicationInsightsUtility.sendMetricTelemetry("NodeCoreCapacity", capacityInfo["cpu"], properties) ApplicationInsightsUtility.sendMetricTelemetry("NodeMemory", capacityInfo["memory"], properties) + + #telemetry about prometheus metric collections settings for replicaset + if (File.file?(@@promConfigMountPath)) + properties["rsPromInt"] = @@rsPromInterval + properties["rsPromFPC"] = @@rsPromFieldPassCount + properties["rsPromFDC"] = @@rsPromFieldDropCount + properties["rsPromServ"] = @@rsPromK8sServiceCount + properties["rsPromUrl"] = @@rsPromUrlCount + properties["rsPromMonPods"] = @@rsPromMonitorPods + end + ApplicationInsightsUtility.sendMetricTelemetry("NodeCoreCapacity", capacityInfo["cpu"], properties) telemetrySent = true end end diff --git a/source/code/plugin/in_kube_podinventory.rb b/source/code/plugin/in_kube_podinventory.rb index 9991c13e3..f41ce9095 100644 --- a/source/code/plugin/in_kube_podinventory.rb +++ b/source/code/plugin/in_kube_podinventory.rb @@ -182,6 +182,7 @@ def parse_and_emit_records(podInventory, serviceList) batchTime = currentTime.utc.iso8601 eventStream = MultiEventStream.new controllerSet = Set.new [] + controllerData = {} telemetryFlush = false winContainerCount = 0 begin #begin block start @@ -277,6 +278,13 @@ def parse_and_emit_records(podInventory, serviceList) record["ControllerName"] = items["metadata"]["ownerReferences"][0]["name"] if telemetryFlush == true controllerSet.add(record["ControllerKind"] + record["ControllerName"]) + #Adding controller kind to telemetry to get information about customer workload + if 
(controllerData[record["ControllerKind"]].nil?) + controllerData[record["ControllerKind"]] = 1 + else + controllerValue = controllerData[record["ControllerKind"]] + controllerData[record["ControllerKind"]] += 1 + end end end podRestartCount = 0 @@ -329,7 +337,7 @@ def parse_and_emit_records(podInventory, serviceList) end # Record the last state of the container. This may have information on why a container was killed. - begin + begin if !container["lastState"].nil? && container["lastState"].keys.length == 1 lastStateName = container["lastState"].keys[0] lastStateObject = container["lastState"][lastStateName] @@ -338,7 +346,7 @@ def parse_and_emit_records(podInventory, serviceList) end if lastStateObject.key?("reason") && lastStateObject.key?("startedAt") && lastStateObject.key?("finishedAt") - newRecord = Hash.new + newRecord = Hash.new newRecord["lastState"] = lastStateName # get the name of the last state (ex: terminated) newRecord["reason"] = lastStateObject["reason"] # (ex: OOMKilled) newRecord["startedAt"] = lastStateObject["startedAt"] # (ex: 2019-07-02T14:58:51Z) @@ -403,7 +411,8 @@ def parse_and_emit_records(podInventory, serviceList) telemetryProperties["Computer"] = @@hostName ApplicationInsightsUtility.sendCustomEvent("KubePodInventoryHeartBeatEvent", telemetryProperties) ApplicationInsightsUtility.sendMetricTelemetry("PodCount", podInventory["items"].length, {}) - ApplicationInsightsUtility.sendMetricTelemetry("ControllerCount", controllerSet.length, {}) + telemetryProperties["ControllerData"] = controllerData.to_json + ApplicationInsightsUtility.sendMetricTelemetry("ControllerCount", controllerSet.length, telemetryProperties) if winContainerCount > 0 telemetryProperties["ClusterWideWindowsContainersCount"] = winContainerCount ApplicationInsightsUtility.sendCustomEvent("WindowsContainerInventoryEvent", telemetryProperties) From df60197b920e4b2641ad2746dc521fe0e643966b Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Wed, 14 Aug 2019 16:55:59 -0700 
Subject: [PATCH 110/160] Merge Health Model work into ci_feature behind a feature flag Pending perf testing (#246) Merge Health to ci_feature --- Rakefile | 9 + build/Makefile | 13 +- installer/conf/container.conf | 33 +- installer/conf/health_model_definition.json | 248 ++++++++++++ installer/conf/healthmonitorconfig.json | 31 ++ installer/conf/kube.conf | 36 +- installer/datafiles/base_container.data | 55 ++- installer/scripts/tomlparser.rb | 40 +- source/code/plugin/KubernetesApiClient.rb | 25 +- .../filter_cadvisor_health_container.rb | 263 +++++++++++++ .../plugin/filter_cadvisor_health_node.rb | 267 +++++++++++++ .../plugin/filter_health_model_builder.rb | 233 +++++++++++ .../plugin/health/agg_monitor_id_labels.rb | 26 ++ .../code/plugin/health/aggregate_monitor.rb | 193 +++++++++ .../aggregate_monitor_state_finalizer.rb | 33 ++ .../plugin/health/cluster_health_state.rb | 115 ++++++ .../plugin/health/health_hierarchy_builder.rb | 76 ++++ .../health/health_kube_api_down_handler.rb | 27 ++ .../health/health_kubernetes_resources.rb | 102 +++++ .../health/health_missing_signal_generator.rb | 142 +++++++ .../code/plugin/health/health_model_buffer.rb | 29 ++ .../plugin/health/health_model_builder.rb | 37 ++ .../plugin/health/health_model_constants.rb | 81 ++++ .../health/health_model_definition_parser.rb | 50 +++ .../plugin/health/health_monitor_helpers.rb | 36 ++ .../plugin/health/health_monitor_optimizer.rb | 52 +++ .../plugin/health/health_monitor_provider.rb | 123 ++++++ .../plugin/health/health_monitor_record.rb | 10 + .../plugin/health/health_monitor_state.rb | 214 ++++++++++ .../plugin/health/health_monitor_utils.rb | 369 ++++++++++++++++++ .../plugin/health/health_signal_reducer.rb | 51 +++ source/code/plugin/health/monitor_factory.rb | 28 ++ source/code/plugin/health/monitor_set.rb | 44 +++ .../health/node_monitor_hierarchy_reducer.rb | 33 ++ .../plugin/health/parent_monitor_provider.rb | 86 ++++ source/code/plugin/health/unit_monitor.rb | 26 ++ 
source/code/plugin/in_cadvisor_perf.rb | 10 +- source/code/plugin/in_kube_events.rb | 6 +- source/code/plugin/in_kube_health.rb | 307 +++++++++++++++ .../filter_health_model_builder_test.rb | 54 +++ .../plugin/health/aggregate_monitor_spec.rb | 256 ++++++++++++ .../aggregate_monitor_state_finalizer_spec.rb | 59 +++ test/code/plugin/health/ca.crt | 1 + .../health/cluster_health_state_spec.rb | 37 ++ .../health/health_hierarchy_builder_spec.rb | 11 + .../health/health_kubernetes_resource_spec.rb | 222 +++++++++++ .../health_missing_signal_generator_spec.rb | 79 ++++ .../plugin/health/health_model_buffer_spec.rb | 25 ++ .../health/health_model_builder_spec.rb | 37 ++ .../health/health_model_builder_test.rb | 337 ++++++++++++++++ .../health_model_definition_parser_spec.rb | 24 ++ .../health/health_monitor_state_spec.rb | 176 +++++++++ .../health/health_signal_reducer_spec.rb | 96 +++++ .../health/kube_api_down_handler_spec.rb | 26 ++ .../plugin/health/monitor_factory_spec.rb | 28 ++ test/code/plugin/health/monitor_set_spec.rb | 58 +++ .../health/parent_monitor_provider_spec.rb | 144 +++++++ .../health/test_health_model_definition.json | 42 ++ test/code/plugin/health/unit_monitor_spec.rb | 20 + test/code/plugin/health/unit_monitor_test.rb | 16 + test/code/plugin/test_helpers.rb | 3 + 61 files changed, 5278 insertions(+), 32 deletions(-) create mode 100644 Rakefile create mode 100644 installer/conf/health_model_definition.json create mode 100644 installer/conf/healthmonitorconfig.json create mode 100644 source/code/plugin/filter_cadvisor_health_container.rb create mode 100644 source/code/plugin/filter_cadvisor_health_node.rb create mode 100644 source/code/plugin/filter_health_model_builder.rb create mode 100644 source/code/plugin/health/agg_monitor_id_labels.rb create mode 100644 source/code/plugin/health/aggregate_monitor.rb create mode 100644 source/code/plugin/health/aggregate_monitor_state_finalizer.rb create mode 100644 
source/code/plugin/health/cluster_health_state.rb create mode 100644 source/code/plugin/health/health_hierarchy_builder.rb create mode 100644 source/code/plugin/health/health_kube_api_down_handler.rb create mode 100644 source/code/plugin/health/health_kubernetes_resources.rb create mode 100644 source/code/plugin/health/health_missing_signal_generator.rb create mode 100644 source/code/plugin/health/health_model_buffer.rb create mode 100644 source/code/plugin/health/health_model_builder.rb create mode 100644 source/code/plugin/health/health_model_constants.rb create mode 100644 source/code/plugin/health/health_model_definition_parser.rb create mode 100644 source/code/plugin/health/health_monitor_helpers.rb create mode 100644 source/code/plugin/health/health_monitor_optimizer.rb create mode 100644 source/code/plugin/health/health_monitor_provider.rb create mode 100644 source/code/plugin/health/health_monitor_record.rb create mode 100644 source/code/plugin/health/health_monitor_state.rb create mode 100644 source/code/plugin/health/health_monitor_utils.rb create mode 100644 source/code/plugin/health/health_signal_reducer.rb create mode 100644 source/code/plugin/health/monitor_factory.rb create mode 100644 source/code/plugin/health/monitor_set.rb create mode 100644 source/code/plugin/health/node_monitor_hierarchy_reducer.rb create mode 100644 source/code/plugin/health/parent_monitor_provider.rb create mode 100644 source/code/plugin/health/unit_monitor.rb create mode 100644 source/code/plugin/in_kube_health.rb create mode 100644 test/code/plugin/filter_health_model_builder_test.rb create mode 100644 test/code/plugin/health/aggregate_monitor_spec.rb create mode 100644 test/code/plugin/health/aggregate_monitor_state_finalizer_spec.rb create mode 100644 test/code/plugin/health/ca.crt create mode 100644 test/code/plugin/health/cluster_health_state_spec.rb create mode 100644 test/code/plugin/health/health_hierarchy_builder_spec.rb create mode 100644 
test/code/plugin/health/health_kubernetes_resource_spec.rb create mode 100644 test/code/plugin/health/health_missing_signal_generator_spec.rb create mode 100644 test/code/plugin/health/health_model_buffer_spec.rb create mode 100644 test/code/plugin/health/health_model_builder_spec.rb create mode 100644 test/code/plugin/health/health_model_builder_test.rb create mode 100644 test/code/plugin/health/health_model_definition_parser_spec.rb create mode 100644 test/code/plugin/health/health_monitor_state_spec.rb create mode 100644 test/code/plugin/health/health_signal_reducer_spec.rb create mode 100644 test/code/plugin/health/kube_api_down_handler_spec.rb create mode 100644 test/code/plugin/health/monitor_factory_spec.rb create mode 100644 test/code/plugin/health/monitor_set_spec.rb create mode 100644 test/code/plugin/health/parent_monitor_provider_spec.rb create mode 100644 test/code/plugin/health/test_health_model_definition.json create mode 100644 test/code/plugin/health/unit_monitor_spec.rb create mode 100644 test/code/plugin/health/unit_monitor_test.rb create mode 100644 test/code/plugin/test_helpers.rb diff --git a/Rakefile b/Rakefile new file mode 100644 index 000000000..3733e71a3 --- /dev/null +++ b/Rakefile @@ -0,0 +1,9 @@ +require 'rake/testtask' + +task default: "test" + +Rake::TestTask.new do |task| + task.libs << "test" + task.pattern = './test/code/plugin/health/*_spec.rb' + task.warning = false +end \ No newline at end of file diff --git a/build/Makefile b/build/Makefile index b5312cfe3..257980160 100644 --- a/build/Makefile +++ b/build/Makefile @@ -91,9 +91,9 @@ CXXFLAGS = $(COMPILE_FLAGS) # Build targets ifeq ($(ULINUX),1) -all : $(OMI_ROOT)/output $(SCXPAL_INTERMEDIATE_DIR) PROVIDER_STATUS $(PROVIDER_LIBRARY) KIT_STATUS kit fluentbitplugin +all : $(OMI_ROOT)/output $(SCXPAL_INTERMEDIATE_DIR) PROVIDER_STATUS $(PROVIDER_LIBRARY) KIT_STATUS kit fluentbitplugin rubypluginstests else -all : $(OMI_ROOT)/output $(SCXPAL_INTERMEDIATE_DIR) PROVIDER_STATUS 
$(PROVIDER_LIBRARY) fluentbitplugin +all : $(OMI_ROOT)/output $(SCXPAL_INTERMEDIATE_DIR) PROVIDER_STATUS $(PROVIDER_LIBRARY) fluentbitplugin rubypluginstests endif clean : @@ -143,6 +143,15 @@ fluentbitplugin : make -C $(GO_SOURCE_DIR) fbplugin $(COPY) $(GO_SOURCE_DIR)/out_oms.so $(INTERMEDIATE_DIR) +rubypluginstests : + @echo "========================= Installing pre-reqs for running tests" + sudo apt-add-repository ppa:brightbox/ruby-ng -y + sudo apt-get update + sudo apt-get install ruby2.4 rake -y + sudo gem install minitest + @echo "========================= Running tests..." + rake test + #-------------------------------------------------------------------------------- # PAL build # diff --git a/installer/conf/container.conf b/installer/conf/container.conf index f41bd6f98..6d810a0e2 100755 --- a/installer/conf/container.conf +++ b/installer/conf/container.conf @@ -17,16 +17,22 @@ #cadvisor perf - type cadvisorperf - tag oms.api.cadvisorperf - run_interval 60s + type cadvisorperf + tag oms.api.cadvisorperf + run_interval 60s log_level debug + + type filter_cadvisor_health_node + log_level debug + + + #custom_metrics_mdm filter plugin type filter_cadvisor2mdm - custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westEurope + custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,memoryRssBytes log_level info @@ -61,6 +67,25 @@ max_retry_wait 9m + + + @type forward + send_timeout 60s + recover_wait 10s + hard_timeout 60s + heartbeat_type tcp + + + host healthmodel-replicaset-service.kube-system + port 25227 + + + + @type file + path /var/opt/microsoft/docker-cimprov/log/fluent_forward_failed.log + + + type out_mdm log_level debug diff --git a/installer/conf/health_model_definition.json b/installer/conf/health_model_definition.json new file mode 100644 index 000000000..1112fe158 --- /dev/null +++ 
b/installer/conf/health_model_definition.json @@ -0,0 +1,248 @@ +[ + { + "monitor_id": "user_workload_pods_ready", + "parent_monitor_id": "user_workload", + "labels": [ + "container.azm.ms/namespace", + "container.azm.ms/workload-name", + "container.azm.ms/workload-kind", + "container.azm.ms/cluster-region", + "container.azm.ms/cluster-subscription-id", + "container.azm.ms/cluster-resource-group", + "container.azm.ms/cluster-name" + ] + }, + { + "monitor_id": "user_workload", + "parent_monitor_id": "namespace", + "labels": [ + "container.azm.ms/namespace", + "container.azm.ms/cluster-region", + "container.azm.ms/cluster-subscription-id", + "container.azm.ms/cluster-resource-group", + "container.azm.ms/cluster-name" + ] + }, + { + "monitor_id": "system_workload_pods_ready", + "parent_monitor_id": "system_workload", + "labels": [ + "container.azm.ms/namespace", + "container.azm.ms/workload-name", + "container.azm.ms/workload-kind", + "container.azm.ms/cluster-region", + "container.azm.ms/cluster-subscription-id", + "container.azm.ms/cluster-resource-group", + "container.azm.ms/cluster-name" + ] + }, + { + "monitor_id": "system_workload", + "parent_monitor_id": "k8s_infrastructure", + "labels": [ + "container.azm.ms/cluster-region", + "container.azm.ms/cluster-subscription-id", + "container.azm.ms/cluster-resource-group", + "container.azm.ms/cluster-name" + ] + }, + { + "monitor_id": "kube_api_status", + "parent_monitor_id": "k8s_infrastructure", + "labels": [ + "container.azm.ms/cluster-region", + "container.azm.ms/cluster-subscription-id", + "container.azm.ms/cluster-resource-group", + "container.azm.ms/cluster-name" + ] + }, + { + "monitor_id": "namespace", + "labels": [ + "container.azm.ms/namespace", + "container.azm.ms/cluster-region", + "container.azm.ms/cluster-subscription-id", + "container.azm.ms/cluster-resource-group", + "container.azm.ms/cluster-name" + ], + "parent_monitor_id": "all_namespaces" + }, + { + "monitor_id": "k8s_infrastructure", + 
"parent_monitor_id": "cluster", + "labels": [ + "container.azm.ms/cluster-region", + "container.azm.ms/cluster-subscription-id", + "container.azm.ms/cluster-resource-group", + "container.azm.ms/cluster-name" + ] + }, + { + "monitor_id": "all_namespaces", + "parent_monitor_id": "all_workloads", + "labels": [ + "container.azm.ms/cluster-region", + "container.azm.ms/cluster-subscription-id", + "container.azm.ms/cluster-resource-group", + "container.azm.ms/cluster-name" + ] + }, + { + "monitor_id": "all_workloads", + "parent_monitor_id": "cluster", + "labels": [ + "container.azm.ms/cluster-region", + "container.azm.ms/cluster-subscription-id", + "container.azm.ms/cluster-resource-group", + "container.azm.ms/cluster-name" + ] + }, + { + "monitor_id": "node_cpu_utilization", + "parent_monitor_id": "node", + "labels": [ + "kubernetes.io/hostname", + "agentpool", + "kubernetes.io/role", + "container.azm.ms/cluster-region", + "container.azm.ms/cluster-subscription-id", + "container.azm.ms/cluster-resource-group", + "container.azm.ms/cluster-name" + ] + }, + { + "monitor_id": "node_memory_utilization", + "parent_monitor_id": "node", + "labels": [ + "kubernetes.io/hostname", + "agentpool", + "kubernetes.io/role", + "container.azm.ms/cluster-region", + "container.azm.ms/cluster-subscription-id", + "container.azm.ms/cluster-resource-group", + "container.azm.ms/cluster-name" + ] + }, + { + "monitor_id": "node_condition", + "parent_monitor_id": "node", + "labels": [ + "kubernetes.io/hostname", + "agentpool", + "kubernetes.io/role", + "container.azm.ms/cluster-region", + "container.azm.ms/cluster-subscription-id", + "container.azm.ms/cluster-resource-group", + "container.azm.ms/cluster-name" + ] + }, + { + "monitor_id": "node", + "aggregation_algorithm": "worstOf", + "labels": [ + "kubernetes.io/hostname", + "agentpool", + "kubernetes.io/role", + "container.azm.ms/cluster-region", + "container.azm.ms/cluster-subscription-id", + "container.azm.ms/cluster-resource-group", + 
"container.azm.ms/cluster-name" + ], + "parent_monitor_id": [ + { + "label": "kubernetes.io/role", + "operator": "==", + "value": "master", + "id": "master_node_pool" + }, + { + "label": "kubernetes.io/role", + "operator": "==", + "value": "agent", + "id": "agent_node_pool" + } + ] + }, + { + "monitor_id": "master_node_pool", + "aggregation_algorithm": "percentage", + "aggregation_algorithm_params": { + "critical_threshold": 80.0, + "warning_threshold": 90.0 + }, + "parent_monitor_id": "all_nodes", + "labels": [ + "container.azm.ms/cluster-region", + "container.azm.ms/cluster-subscription-id", + "container.azm.ms/cluster-resource-group", + "container.azm.ms/cluster-name" + ] + }, + { + "monitor_id": "agent_node_pool", + "aggregation_algorithm": "percentage", + "aggregation_algorithm_params": { + "state_threshold": 80.0 + }, + "labels": [ + "agentpool", + "container.azm.ms/cluster-region", + "container.azm.ms/cluster-subscription-id", + "container.azm.ms/cluster-resource-group", + "container.azm.ms/cluster-name" + ], + "parent_monitor_id": "all_nodes" + }, + { + "monitor_id": "all_nodes", + "aggregation_algorithm": "worstOf", + "parent_monitor_id": "cluster", + "labels": [ + "container.azm.ms/cluster-region", + "container.azm.ms/cluster-subscription-id", + "container.azm.ms/cluster-resource-group", + "container.azm.ms/cluster-name" + ] + }, + { + "monitor_id": "cluster", + "aggregation_algorithm": "worstOf", + "parent_monitor_id": null, + "labels": [ + "container.azm.ms/cluster-region", + "container.azm.ms/cluster-subscription-id", + "container.azm.ms/cluster-resource-group", + "container.azm.ms/cluster-name" + ] + }, + { + "monitor_id": "subscribed_capacity_cpu", + "parent_monitor_id": "capacity", + "labels": [ + "container.azm.ms/cluster-region", + "container.azm.ms/cluster-subscription-id", + "container.azm.ms/cluster-resource-group", + "container.azm.ms/cluster-name" + ] + }, + { + "monitor_id": "subscribed_capacity_memory", + "parent_monitor_id": "capacity", + 
"labels": [ + "container.azm.ms/cluster-region", + "container.azm.ms/cluster-subscription-id", + "container.azm.ms/cluster-resource-group", + "container.azm.ms/cluster-name" + ] + }, + { + "monitor_id": "capacity", + "parent_monitor_id": "all_workloads", + "labels": [ + "container.azm.ms/cluster-region", + "container.azm.ms/cluster-subscription-id", + "container.azm.ms/cluster-resource-group", + "container.azm.ms/cluster-name" + ] + } +] \ No newline at end of file diff --git a/installer/conf/healthmonitorconfig.json b/installer/conf/healthmonitorconfig.json new file mode 100644 index 000000000..28d562652 --- /dev/null +++ b/installer/conf/healthmonitorconfig.json @@ -0,0 +1,31 @@ +{ + "node_cpu_utilization": { + "WarnThresholdPercentage": 80.0, + "FailThresholdPercentage": 90.0, + "ConsecutiveSamplesForStateTransition": 3 + }, + "node_memory_utilization": { + "WarnThresholdPercentage": 80.0, + "FailThresholdPercentage": 90.0, + "ConsecutiveSamplesForStateTransition": 3 + }, + "container_cpu_utilization": { + "WarnThresholdPercentage": 80.0, + "FailThresholdPercentage": 90.0, + "ConsecutiveSamplesForStateTransition": 3 + }, + "container_memory_utilization": { + "WarnThresholdPercentage": 80.0, + "FailThresholdPercentage": 90.0, + "ConsecutiveSamplesForStateTransition": 3 + }, + "user_workload_pods_ready": { + "WarnThresholdPercentage": 0.0, + "FailThresholdPercentage": 10.0, + "ConsecutiveSamplesForStateTransition": 2 + }, + "system_workload_pods_ready": { + "FailThresholdPercentage": 0.0, + "ConsecutiveSamplesForStateTransition": 2 + } +} \ No newline at end of file diff --git a/installer/conf/kube.conf b/installer/conf/kube.conf index 0dfa3710e..4b4ec09ea 100644 --- a/installer/conf/kube.conf +++ b/installer/conf/kube.conf @@ -1,4 +1,9 @@ # Fluentd config file for OMS Docker - cluster components (kubeAPI) + + type forward + port 25227 + bind 0.0.0.0 + #Kubernetes pod inventory @@ -13,7 +18,7 @@ type kubeevents tag oms.containerinsights.KubeEvents run_interval 60s 
- log_level debug + log_level debug #Kubernetes logs @@ -47,6 +52,14 @@ log_level debug +#Kubernetes health + + type kubehealth + tag oms.api.KubeHealth.ReplicaSet + run_interval 60s + log_level debug + + #cadvisor perf- Windows nodes type wincadvisorperf @@ -69,6 +82,9 @@ log_level info + + type filter_health_model_builder + type out_mdm log_level debug @@ -118,7 +134,7 @@ type out_oms_api log_level debug - buffer_chunk_limit 10m + buffer_chunk_limit 10m buffer_type file buffer_path %STATE_DIR_WS%/out_oms_api_kubernetes_logs*.buffer buffer_queue_limit 10 @@ -127,6 +143,8 @@ retry_wait 30s + + type out_oms log_level debug @@ -170,7 +188,7 @@ max_retry_wait 9m - + type out_oms log_level debug num_threads 5 @@ -214,4 +232,16 @@ retry_limit 10 retry_wait 30s max_retry_wait 9m + + + + type out_oms_api + log_level debug + buffer_chunk_limit 10m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_api_KubeHealth*.buffer + buffer_queue_limit 10 + flush_interval 20s + retry_limit 10 + retry_wait 30s \ No newline at end of file diff --git a/installer/datafiles/base_container.data b/installer/datafiles/base_container.data index 62a6f6885..3dc1a18cd 100644 --- a/installer/datafiles/base_container.data +++ b/installer/datafiles/base_container.data @@ -112,10 +112,45 @@ MAINTAINER: 'Microsoft Corporation' /etc/opt/microsoft/docker-cimprov/telegraf-rs.conf; installer/conf/telegraf-rs.conf; 644; root; root /opt/microsoft/docker-cimprov/bin/TelegrafTCPErrorTelemetry.sh; installer/scripts/TelegrafTCPErrorTelemetry.sh; 755; root; root /opt/livenessprobe.sh; installer/scripts/livenessprobe.sh; 755; root; root -/opt/tomlparser.rb; installer/scripts/tomlparser.rb; 755; root; root -/opt/tomlparser-prom-customconfig.rb; installer/scripts/tomlparser-prom-customconfig.rb; 755; root; root +/opt/tomlparser.rb; installer/scripts/tomlparser.rb; 755; root; root +/opt/tomlparser-prom-customconfig.rb; installer/scripts/tomlparser-prom-customconfig.rb; 755; root; root 
/opt/td-agent-bit-conf-customizer.rb; installer/scripts/td-agent-bit-conf-customizer.rb; 755; root; root + + +/opt/microsoft/omsagent/plugin/filter_cadvisor_health_node.rb; source/code/plugin/filter_cadvisor_health_node.rb; 644; root; root +/opt/microsoft/omsagent/plugin/filter_health_model_builder.rb; source/code/plugin/filter_health_model_builder.rb; 644; root; root +/opt/microsoft/omsagent/plugin/in_kube_health.rb; source/code/plugin/in_kube_health.rb; 644; root; root +/etc/opt/microsoft/docker-cimprov/health/healthmonitorconfig.json; installer/conf/healthmonitorconfig.json; 644; root; root +/etc/opt/microsoft/docker-cimprov/health/health_model_definition.json; installer/conf/health_model_definition.json; 644; root; root + + +/opt/microsoft/omsagent/plugin/health/aggregate_monitor.rb; source/code/plugin/health/aggregate_monitor.rb; 644; root; root +/opt/microsoft/omsagent/plugin/health/agg_monitor_id_labels.rb; source/code/plugin/health/agg_monitor_id_labels.rb; 644; root; root +/opt/microsoft/omsagent/plugin/health/aggregate_monitor_state_finalizer.rb; source/code/plugin/health/aggregate_monitor_state_finalizer.rb; 644; root; root +/opt/microsoft/omsagent/plugin/health/cluster_health_state.rb; source/code/plugin/health/cluster_health_state.rb; 644; root; root +/opt/microsoft/omsagent/plugin/health/health_hierarchy_builder.rb; source/code/plugin/health/health_hierarchy_builder.rb; 644; root; root +/opt/microsoft/omsagent/plugin/health/health_kubernetes_resources.rb; source/code/plugin/health/health_kubernetes_resources.rb; 644; root; root +/opt/microsoft/omsagent/plugin/health/health_kube_api_down_handler.rb; source/code/plugin/health/health_kube_api_down_handler.rb; 644; root; root +/opt/microsoft/omsagent/plugin/health/health_missing_signal_generator.rb; source/code/plugin/health/health_missing_signal_generator.rb; 644; root; root +/opt/microsoft/omsagent/plugin/health/health_model_buffer.rb; source/code/plugin/health/health_model_buffer.rb; 644; root; root 
+/opt/microsoft/omsagent/plugin/health/health_model_builder.rb; source/code/plugin/health/health_model_builder.rb; 644; root; root +/opt/microsoft/omsagent/plugin/health/health_model_constants.rb; source/code/plugin/health/health_model_constants.rb; 644; root; root +/opt/microsoft/omsagent/plugin/health/parent_monitor_provider.rb; source/code/plugin/health/parent_monitor_provider.rb; 644; root; root +/opt/microsoft/omsagent/plugin/health/health_model_definition_parser.rb; source/code/plugin/health/health_model_definition_parser.rb; 644; root; root +/opt/microsoft/omsagent/plugin/health/health_monitor_helpers.rb; source/code/plugin/health/health_monitor_helpers.rb; 644; root; root +/opt/microsoft/omsagent/plugin/health/health_monitor_optimizer.rb; source/code/plugin/health/health_monitor_optimizer.rb; 644; root; root +/opt/microsoft/omsagent/plugin/health/health_monitor_helpers.rb; source/code/plugin/health/health_monitor_helpers.rb; 644; root; root +/opt/microsoft/omsagent/plugin/health/health_monitor_provider.rb; source/code/plugin/health/health_monitor_provider.rb; 644; root; root +/opt/microsoft/omsagent/plugin/health/health_monitor_record.rb; source/code/plugin/health/health_monitor_record.rb; 644; root; root +/opt/microsoft/omsagent/plugin/health/health_monitor_state.rb; source/code/plugin/health/health_monitor_state.rb; 644; root; root +/opt/microsoft/omsagent/plugin/health/health_monitor_helpers.rb; source/code/plugin/health/health_monitor_helpers.rb; 644; root; root +/opt/microsoft/omsagent/plugin/health/health_monitor_utils.rb; source/code/plugin/health/health_monitor_utils.rb; 644; root; root +/opt/microsoft/omsagent/plugin/health/health_signal_reducer.rb; source/code/plugin/health/health_signal_reducer.rb; 644; root; root +/opt/microsoft/omsagent/plugin/health/monitor_factory.rb; source/code/plugin/health/monitor_factory.rb; 644; root; root +/opt/microsoft/omsagent/plugin/health/monitor_set.rb; source/code/plugin/health/monitor_set.rb; 644; root; root 
+/opt/microsoft/omsagent/plugin/health/unit_monitor.rb; source/code/plugin/health/unit_monitor.rb; 644; root; root + %Links /opt/omi/lib/libcontainer.${{SHLIB_EXT}}; /opt/microsoft/docker-cimprov/lib/libcontainer.${{SHLIB_EXT}}; 644; root; root @@ -129,6 +164,7 @@ MAINTAINER: 'Microsoft Corporation' /etc/opt/microsoft; 755; root; root; sysdir /etc/opt/microsoft/docker-cimprov; 755; root; root /etc/opt/microsoft/docker-cimprov/conf; 755; root; root +/etc/opt/microsoft/docker-cimprov/health; 755; root; root /etc/opt/omi; 755; root; root; sysdir /etc/opt/omi/conf; 755; root; root; sysdir @@ -142,6 +178,7 @@ MAINTAINER: 'Microsoft Corporation' /opt/microsoft/omsagent; 755; root; root; sysdir /opt/microsoft/omsagent/plugin; 755; root; root; sysdir +/opt/microsoft/omsagent/plugin/health; 755; root; root; sysdir /opt/omi; 755; root; root; sysdir /opt/omi/lib; 755; root; root; sysdir @@ -205,12 +242,24 @@ touch /var/opt/microsoft/docker-cimprov/log/filter_inventory2mdm.log chmod 666 /var/opt/microsoft/docker-cimprov/log/filter_inventory2mdm.log chown omsagent:omiusers /var/opt/microsoft/docker-cimprov/log/filter_inventory2mdm.log +touch /var/opt/microsoft/docker-cimprov/log/health_monitors.log +chmod 666 /var/opt/microsoft/docker-cimprov/log/health_monitors.log +chown omsagent:omiusers /var/opt/microsoft/docker-cimprov/log/health_monitors.log + +touch /var/opt/microsoft/docker-cimprov/log/filter_health_model_builder.log +chmod 666 /var/opt/microsoft/docker-cimprov/log/filter_health_model_builder.log +chown omsagent:omiusers /var/opt/microsoft/docker-cimprov/log/filter_health_model_builder.log + +touch /var/opt/microsoft/docker-cimprov/log/fluent_forward_failed.log +chmod 666 /var/opt/microsoft/docker-cimprov/log/fluent_forward_failed.log +chown omsagent:omiusers /var/opt/microsoft/docker-cimprov/log/fluent_forward_failed.log + mv /etc/opt/microsoft/docker-cimprov/container.conf /etc/opt/microsoft/omsagent/sysconf/omsagent.d/container.conf chown omsagent:omsagent 
/etc/opt/microsoft/omsagent/sysconf/omsagent.d/container.conf %Postuninstall_10 # If we're an upgrade, skip all of this cleanup -if ${{PERFORMING_UPGRADE_NOT}}; then +if ${{PERFORMING_UPGRADE_NOT}}; then # Clean up installinfo.txt file (registered as "conf" file to pass rpmcheck) rm -f /etc/opt/microsoft/docker-cimprov/conf/installinfo.txt* rm -f /var/opt/microsoft/docker-cimprov/state/LastEventQueryTime.txt diff --git a/installer/scripts/tomlparser.rb b/installer/scripts/tomlparser.rb index c72e64127..067586629 100644 --- a/installer/scripts/tomlparser.rb +++ b/installer/scripts/tomlparser.rb @@ -1,8 +1,10 @@ #!/usr/local/bin/ruby require_relative "tomlrb" +require 'json' -@configMapMountPath = "/etc/config/settings/log-data-collection-settings" +@log_settings_config_map_mount_path = "/etc/config/settings/log-data-collection-settings" +@agent_settings_config_map_mount_path = "/etc/config/settings/agent-settings" @configVersion = "" @configSchemaVersion = "" # Setting default values which will be used in case they are not set in the configmap or if configmap doesnt exist @@ -16,16 +18,16 @@ @excludePath = "*.csv2" #some invalid path # Use parser to parse the configmap toml file to a ruby structure -def parseConfigMap +def parseConfigMap(path) begin # Check to see if config map is created - if (File.file?(@configMapMountPath)) - puts "config::configmap container-azm-ms-agentconfig for settings mounted, parsing values" - parsedConfig = Tomlrb.load_file(@configMapMountPath, symbolize_keys: true) - puts "config::Successfully parsed mounted config map" + if (File.file?(path)) + puts "config::configmap container-azm-ms-agentconfig for settings mounted, parsing values from #{path}" + parsedConfig = Tomlrb.load_file(path, symbolize_keys: true) + puts "config::Successfully parsed mounted config map from #{path}" return parsedConfig else - puts "config::configmap container-azm-ms-agentconfig for settings not mounted, using defaults" + puts "config::configmap 
container-azm-ms-agentconfig for settings not mounted, using defaults for #{path}" @excludePath = "*_kube-system_*.log" return nil end @@ -117,19 +119,35 @@ def populateSettingValuesFromConfigMap(parsedConfig) puts "config::error::Exception while reading config settings for cluster level environment variable collection - #{errorStr}, using defaults" end end + + begin + if !parsedConfig.nil? && !parsedConfig[:agent_settings][:health_model].nil? && !parsedConfig[:agent_settings][:health_model][:enabled].nil? + @enable_health_model = parsedConfig[:agent_settings][:health_model][:enabled] + puts "enable_health_model = #{@enable_health_model}" + end + rescue => errorStr + puts "config::error:Exception while reading config settings for health_model enabled setting - #{errorStr}, using defaults" + @enable_health_model = false + end end @configSchemaVersion = ENV["AZMON_AGENT_CFG_SCHEMA_VERSION"] puts "****************Start Config Processing********************" if !@configSchemaVersion.nil? && !@configSchemaVersion.empty? && @configSchemaVersion.strip.casecmp("v1") == 0 #note v1 is the only supported schema version , so hardcoding it - configMapSettings = parseConfigMap + configMapSettings = {} + + #iterate over every *settings file and build a hash of settings + Dir["/etc/config/settings/*settings"].each{|file| + puts "Parsing File #{file}" + settings = parseConfigMap(file) + configMapSettings = configMapSettings.merge(settings) + } + if !configMapSettings.nil? 
populateSettingValuesFromConfigMap(configMapSettings) end else - if (File.file?(@configMapMountPath)) puts "config::unsupported/missing config schema version - '#{@configSchemaVersion}' , using defaults" - end @excludePath = "*_kube-system_*.log" end @@ -155,6 +173,8 @@ def populateSettingValuesFromConfigMap(parsedConfig) file.write("export AZMON_STDERR_EXCLUDED_NAMESPACES=#{@stderrExcludeNamespaces}\n") file.write("export AZMON_CLUSTER_COLLECT_ENV_VAR=#{@collectClusterEnvVariables}\n") file.write("export AZMON_CLUSTER_LOG_TAIL_EXCLUDE_PATH=#{@excludePath}\n") + #health_model settings + file.write("export AZMON_CLUSTER_ENABLE_HEALTH_MODEL=#{@enable_health_model}\n") # Close file after writing all environment variables file.close puts "Both stdout & stderr log collection are turned off for namespaces: '#{@excludePath}' " diff --git a/source/code/plugin/KubernetesApiClient.rb b/source/code/plugin/KubernetesApiClient.rb index 61cbaea00..48b25bf14 100644 --- a/source/code/plugin/KubernetesApiClient.rb +++ b/source/code/plugin/KubernetesApiClient.rb @@ -30,13 +30,13 @@ def initialize end class << self - def getKubeResourceInfo(resource) + def getKubeResourceInfo(resource, api_version: nil) headers = {} response = nil - @Log.info "Getting Kube resource" + @Log.info "Getting Kube resource api_version #{api_version}" @Log.info resource begin - resourceUri = getResourceUri(resource) + resourceUri = getResourceUri(resource, api_version: api_version) if !resourceUri.nil? uri = URI.parse(resourceUri) http = Net::HTTP.new(uri.host, uri.port) @@ -76,10 +76,23 @@ def getTokenStr end end - def getResourceUri(resource) + def getClusterRegion + if ENV["AKS_REGION"] + return ENV["AKS_REGION"] + else + @Log.warn ("Kubernetes environment variable not set AKS_REGION. 
Unable to get cluster region.") + return nil + end + end + + def getResourceUri(resource, api_version: nil) begin if ENV["KUBERNETES_SERVICE_HOST"] && ENV["KUBERNETES_PORT_443_TCP_PORT"] - return "https://#{ENV["KUBERNETES_SERVICE_HOST"]}:#{ENV["KUBERNETES_PORT_443_TCP_PORT"]}/api/" + @@ApiVersion + "/" + resource + if !api_version.nil? + return "https://#{ENV["KUBERNETES_SERVICE_HOST"]}:#{ENV["KUBERNETES_PORT_443_TCP_PORT"]}/apis/" + api_version + "/" + resource + end + api_version = @@ApiVersion + return "https://#{ENV["KUBERNETES_SERVICE_HOST"]}:#{ENV["KUBERNETES_PORT_443_TCP_PORT"]}/api/" + api_version + "/" + resource else @Log.warn ("Kubernetes environment variable not set KUBERNETES_SERVICE_HOST: #{ENV["KUBERNETES_SERVICE_HOST"]} KUBERNETES_PORT_443_TCP_PORT: #{ENV["KUBERNETES_PORT_443_TCP_PORT"]}. Unable to form resourceUri") return nil @@ -125,6 +138,8 @@ def getClusterId return @@ClusterId if !@@ClusterId.nil? #By default initialize ClusterId to ClusterName. # In ACS/On-prem, we need to figure out how we can generate ClusterId + # Dilipr: Spoof the subid by generating md5 hash of cluster name, and taking some constant parts of it. + # e.g. md5 digest is 128 bits = 32 character in hex. 
Get first 16 and get a guid, and the next 16 to get resource id @@ClusterId = getClusterName begin cluster = ENV["AKS_RESOURCE_ID"] diff --git a/source/code/plugin/filter_cadvisor_health_container.rb b/source/code/plugin/filter_cadvisor_health_container.rb new file mode 100644 index 000000000..4090092a9 --- /dev/null +++ b/source/code/plugin/filter_cadvisor_health_container.rb @@ -0,0 +1,263 @@ +#!/usr/local/bin/ruby +# frozen_string_literal: true + +module Fluent + require 'logger' + require 'json' + require_relative 'oms_common' + require_relative 'HealthMonitorUtils' + require_relative 'HealthMonitorState' + require_relative "ApplicationInsightsUtility" + + + class CAdvisor2ContainerHealthFilter < Filter + Fluent::Plugin.register_filter('filter_cadvisor_health_container', self) + + config_param :log_path, :string, :default => '/var/opt/microsoft/docker-cimprov/log/health_monitors.log' + config_param :metrics_to_collect, :string, :default => 'cpuUsageNanoCores,memoryRssBytes' + config_param :container_resource_refresh_interval_minutes, :integer, :default => 5 + + @@object_name_k8s_node = 'K8SNode' + @@object_name_k8s_container = 'K8SContainer' + + @@counter_name_cpu = 'cpuusagenanocores' + @@counter_name_memory_rss = 'memoryrssbytes' + + @@health_monitor_config = {} + + @@hostName = (OMS::Common.get_hostname) + @@clusterName = KubernetesApiClient.getClusterName + @@clusterId = KubernetesApiClient.getClusterId + @@clusterRegion = KubernetesApiClient.getClusterRegion + @@cluster_health_model_enabled = HealthMonitorUtils.is_cluster_health_model_enabled + + def initialize + super + @cpu_capacity = 0.0 + @memory_capacity = 0.0 + @last_resource_refresh = DateTime.now.to_time.to_i + @metrics_to_collect_hash = {} + end + + def configure(conf) + super + @log = HealthMonitorUtils.getLogHandle + @log.debug {'Starting filter_cadvisor2health plugin'} + end + + def start + super + @metrics_to_collect_hash = HealthMonitorUtils.build_metrics_hash(@metrics_to_collect) + 
@log.debug "Calling ensure_cpu_memory_capacity_set cpu_capacity #{@cpu_capacity} memory_capacity #{@memory_capacity}" + node_capacity = HealthMonitorUtils.ensure_cpu_memory_capacity_set(@@hm_log, @cpu_capacity, @memory_capacity, @@hostName) + @cpu_capacity = node_capacity[0] + @memory_capacity = node_capacity[1] + @log.info "CPU Capacity #{@cpu_capacity} Memory Capacity #{@memory_capacity}" + #HealthMonitorUtils.refresh_kubernetes_api_data(@log, @@hostName) + @@health_monitor_config = HealthMonitorUtils.getHealthMonitorConfig + ApplicationInsightsUtility.sendCustomEvent("filter_cadvisor_health Plugin Start", {}) + end + + def filter_stream(tag, es) + if !@@cluster_health_model_enabled + @log.info "Cluster Health Model disabled in filter_cadvisor_health_container" + return [] + end + new_es = MultiEventStream.new + #HealthMonitorUtils.refresh_kubernetes_api_data(@log, @hostName) + records_count = 0 + es.each { |time, record| + begin + filtered_record = filter(tag, time, record) + if !filtered_record.nil? 
+ new_es.add(time, filtered_record) + records_count += 1 + end + rescue => e + router.emit_error_event(tag, time, record, e) + end + } + @log.debug "Filter Records Count #{records_count}" + new_es + end + + def filter(tag, time, record) + begin + if record.key?("MonitorLabels") + return record + end + object_name = record['DataItems'][0]['ObjectName'] + counter_name = record['DataItems'][0]['Collections'][0]['CounterName'].downcase + if @metrics_to_collect_hash.key?(counter_name.downcase) + metric_value = record['DataItems'][0]['Collections'][0]['Value'] + case object_name + when @@object_name_k8s_container + case counter_name.downcase + when @@counter_name_cpu + # @log.debug "Object Name #{object_name}" + # @log.debug "Counter Name #{counter_name}" + # @log.debug "Metric Value #{metric_value}" + #return process_container_cpu_record(record, metric_value) + when @@counter_name_memory_rss + #return process_container_memory_record(record, metric_value) + end + when @@object_name_k8s_node + case counter_name.downcase + when @@counter_name_cpu + #process_node_cpu_record(record, metric_value) + when @@counter_name_memory_rss + #process_node_memory_record(record, metric_value) + end + end + end + rescue => e + @log.debug "Error in filter #{e}" + @log.debug "record #{record}" + @log.debug "backtrace #{e.backtrace}" + ApplicationInsightsUtility.sendExceptionTelemetry(e) + return nil + end + end + + def process_container_cpu_record(record, metric_value) + monitor_id = HealthMonitorConstants::WORKLOAD_CONTAINER_CPU_PERCENTAGE_MONITOR_ID + @log.debug "processing container cpu record" + if record.nil? + return nil + else + instance_name = record['DataItems'][0]['InstanceName'] + key = HealthMonitorUtils.getContainerKeyFromInstanceName(instance_name) + container_metadata = HealthMonitorUtils.getContainerMetadata(key) + if !container_metadata.nil? + cpu_limit = container_metadata['cpuLimit'] + end + + if cpu_limit.to_s.empty? 
+ #@log.info "CPU Limit is nil" + cpu_limit = @cpu_capacity + end + + #@log.info "cpu limit #{cpu_limit}" + + percent = (metric_value.to_f/cpu_limit*100).round(2) + #@log.debug "Container #{key} | Percentage of CPU limit: #{percent}" + state = HealthMonitorState.computeHealthMonitorState(@log, monitor_id, percent, @@health_monitor_config[HealthMonitorConstants::WORKLOAD_CONTAINER_CPU_PERCENTAGE_MONITOR_ID]) + #@log.debug "Computed State : #{state}" + timestamp = record['DataItems'][0]['Timestamp'] + health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"cpuUsageMillicores" => metric_value/1000000.to_f, "cpuUtilizationPercentage" => percent}} + #@log.info health_monitor_record + + monitor_instance_id = HealthMonitorUtils.get_monitor_instance_id(@log, monitor_id, [@@clusterId, @@hostName, key]) + #@log.info "Monitor Instance Id: #{monitor_instance_id}" + temp = record.nil? ? "Nil" : record["MonitorInstanceId"] + @log.info "Processed Container CPU #{temp}" + return record + end + return nil + end + + def process_container_memory_record(record, metric_value) + monitor_id = HealthMonitorConstants::WORKLOAD_CONTAINER_MEMORY_PERCENTAGE_MONITOR_ID + #@log.debug "processing container memory record" + if record.nil? + return nil + else + instance_name = record['DataItems'][0]['InstanceName'] + key = HealthMonitorUtils.getContainerKeyFromInstanceName(instance_name) + container_metadata = HealthMonitorUtils.getContainerMetadata(key) + if !container_metadata.nil? + memory_limit = container_metadata['memoryLimit'] + end + + if memory_limit.to_s.empty? 
+ #@log.info "Memory Limit is nil" + memory_limit = @memory_capacity + end + + #@log.info "memory limit #{memory_limit}" + + percent = (metric_value.to_f/memory_limit*100).round(2) + #@log.debug "Container #{key} | Percentage of Memory limit: #{percent}" + state = HealthMonitorState.computeHealthMonitorState(@log, monitor_id, percent, @@health_monitor_config[HealthMonitorConstants::WORKLOAD_CONTAINER_MEMORY_PERCENTAGE_MONITOR_ID]) + #@log.debug "Computed State : #{state}" + timestamp = record['DataItems'][0]['Timestamp'] + health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"memoryRssBytes" => metric_value.to_f, "memoryUtilizationPercentage" => percent}} + #@log.info health_monitor_record + + monitor_instance_id = HealthMonitorUtils.get_monitor_instance_id(@log, monitor_id, [@@clusterId, @@hostName, key]) + #@log.info "Monitor Instance Id: #{monitor_instance_id}" + temp = record.nil? ? "Nil" : record["MonitorInstanceId"] + @log.info "Processed Container Memory #{temp}" + return record + end + return nil + end + + def process_node_cpu_record(record, metric_value) + monitor_id = HealthMonitorConstants::NODE_CPU_MONITOR_ID + #@log.debug "processing node cpu record" + if record.nil? 
+ return nil + else + instance_name = record['DataItems'][0]['InstanceName'] + #@log.info "CPU capacity #{@cpu_capacity}" + + percent = (metric_value.to_f/@cpu_capacity*100).round(2) + #@log.debug "Percentage of CPU limit: #{percent}" + state = HealthMonitorState.computeHealthMonitorState(@log, monitor_id, percent, @@health_monitor_config[HealthMonitorConstants::NODE_CPU_MONITOR_ID]) + #@log.debug "Computed State : #{state}" + timestamp = record['DataItems'][0]['Timestamp'] + health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"cpuUsageMillicores" => metric_value/1000000.to_f, "cpuUtilizationPercentage" => percent}} + + monitor_instance_id = HealthMonitorUtils.get_monitor_instance_id(@log, monitor_id, [@@clusterId, @@hostName]) + # record = HealthMonitorSignalReducer.reduceSignal(@log, monitor_id, monitor_instance_id, @@health_monitor_config[monitor_id], node_name: @@hostName) + # temp = record.nil? ? "Nil" : record["MonitorInstanceId"] + health_record = {} + time_now = Time.now.utc.iso8601 + health_record[HealthMonitorRecordFields::MONITOR_ID] = monitor_id + health_record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] = monitor_instance_id + health_record[HealthMonitorRecordFields::DETAILS] = health_monitor_record + health_record[HealthMonitorRecordFields::AGENT_COLLECTION_TIME] = time_now + health_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_now + health_record[HealthMonitorRecordFields::NODE_NAME] = @@hostName + @log.info "Processed Node CPU" + return health_record + end + return nil + end + + def process_node_memory_record(record, metric_value) + monitor_id = HealthMonitorConstants::NODE_MEMORY_MONITOR_ID + #@log.debug "processing node memory record" + if record.nil? 
+ return nil + else + instance_name = record['DataItems'][0]['InstanceName'] + #@log.info "Memory capacity #{@memory_capacity}" + + percent = (metric_value.to_f/@memory_capacity*100).round(2) + #@log.debug "Percentage of Memory limit: #{percent}" + state = HealthMonitorState.computeHealthMonitorState(@log, monitor_id, percent, @@health_monitor_config[HealthMonitorConstants::NODE_MEMORY_MONITOR_ID]) + #@log.debug "Computed State : #{state}" + timestamp = record['DataItems'][0]['Timestamp'] + health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"memoryRssBytes" => metric_value.to_f, "memoryUtilizationPercentage" => percent}} + #@log.info health_monitor_record + + monitor_instance_id = HealthMonitorUtils.get_monitor_instance_id(monitor_id, [@@clusterId, @@hostName]) + #@log.info "Monitor Instance Id: #{monitor_instance_id}" + # temp = record.nil? ? "Nil" : record["MonitorInstanceId"] + health_record = {} + time_now = Time.now.utc.iso8601 + health_record[HealthMonitorRecordFields::MONITOR_ID] = monitor_id + health_record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] = monitor_instance_id + health_record[HealthMonitorRecordFields::DETAILS] = health_monitor_record + health_record[HealthMonitorRecordFields::AGENT_COLLECTION_TIME] = time_now + health_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_now + health_record[HealthMonitorRecordFields::NODE_NAME] = @@hostName + @log.info "Processed Node Memory" + return health_record + end + return nil + end + end +end diff --git a/source/code/plugin/filter_cadvisor_health_node.rb b/source/code/plugin/filter_cadvisor_health_node.rb new file mode 100644 index 000000000..627a525e7 --- /dev/null +++ b/source/code/plugin/filter_cadvisor_health_node.rb @@ -0,0 +1,267 @@ +#!/usr/local/bin/ruby +# frozen_string_literal: true + +module Fluent + require 'logger' + require 'json' + require_relative 'oms_common' + require_relative "ApplicationInsightsUtility" + require_relative 
"KubernetesApiClient" + Dir[File.join(__dir__, './health', '*.rb')].each { |file| require file } + + class CAdvisor2NodeHealthFilter < Filter + include HealthModel + Fluent::Plugin.register_filter('filter_cadvisor_health_node', self) + + attr_accessor :provider, :resources + + config_param :metrics_to_collect, :string, :default => 'cpuUsageNanoCores,memoryRssBytes' + config_param :container_resource_refresh_interval_minutes, :integer, :default => 5 + config_param :health_monitor_config_path, :default => '/etc/opt/microsoft/docker-cimprov/health/healthmonitorconfig.json' + + @@object_name_k8s_node = 'K8SNode' + @@object_name_k8s_container = 'K8SContainer' + + @@counter_name_cpu = 'cpuusagenanocores' + @@counter_name_memory_rss = 'memoryrssbytes' + + @@hm_log = HealthMonitorUtils.get_log_handle + @@hostName = (OMS::Common.get_hostname) + @@clusterName = KubernetesApiClient.getClusterName + @@clusterId = KubernetesApiClient.getClusterId + @@clusterRegion = KubernetesApiClient.getClusterRegion + @@cluster_health_model_enabled = HealthMonitorUtils.is_cluster_health_model_enabled + + def initialize + begin + super + @cpu_capacity = 0.0 + @memory_capacity = 0.0 + @last_resource_refresh = DateTime.now.to_time.to_i + @metrics_to_collect_hash = {} + @resources = HealthKubernetesResources.instance # this doesnt require node and pod inventory. 
So no need to populate them + @provider = HealthMonitorProvider.new(@@clusterId, HealthMonitorUtils.get_cluster_labels, @resources, @health_monitor_config_path) + rescue => e + ApplicationInsightsUtility.sendExceptionTelemetry(e, {"FeatureArea" => "Health"}) + end + end + + def configure(conf) + super + @log = HealthMonitorUtils.get_log_handle + @log.debug {'Starting filter_cadvisor2health plugin'} + end + + def start + super + @metrics_to_collect_hash = HealthMonitorUtils.build_metrics_hash(@metrics_to_collect) + @log.debug "Calling ensure_cpu_memory_capacity_set cpu_capacity #{@cpu_capacity} memory_capacity #{@memory_capacity}" + node_capacity = HealthMonitorUtils.ensure_cpu_memory_capacity_set(@@hm_log, @cpu_capacity, @memory_capacity, @@hostName) + @cpu_capacity = node_capacity[0] + @memory_capacity = node_capacity[1] + @log.info "CPU Capacity #{@cpu_capacity} Memory Capacity #{@memory_capacity}" + #HealthMonitorUtils.refresh_kubernetes_api_data(@log, @@hostName) + ApplicationInsightsUtility.sendCustomEvent("filter_cadvisor_health Plugin Start", {}) + end + + def filter_stream(tag, es) + if !@@cluster_health_model_enabled + @log.info "Cluster Health Model disabled in filter_cadvisor_health_node" + return [] + end + new_es = MultiEventStream.new + #HealthMonitorUtils.refresh_kubernetes_api_data(@log, @hostName) + records_count = 0 + es.each { |time, record| + begin + filtered_record = filter(tag, time, record) + if !filtered_record.nil? 
+ new_es.add(time, filtered_record) + records_count += 1 + end + rescue => e + @log.info "Error in filter_stream for filter_cadvisor_health_node #{e.message}" + ApplicationInsightsUtility.sendExceptionTelemetry(e, {"FeatureArea" => "Health"}) + end + } + @log.debug "Filter Records Count #{records_count}" + new_es + end + + def filter(tag, time, record) + begin + if record.key?("MonitorLabels") + return record + end + object_name = record['DataItems'][0]['ObjectName'] + counter_name = record['DataItems'][0]['Collections'][0]['CounterName'].downcase + if @metrics_to_collect_hash.key?(counter_name.downcase) + metric_value = record['DataItems'][0]['Collections'][0]['Value'] + case object_name + when @@object_name_k8s_container + case counter_name.downcase + when @@counter_name_cpu + # @log.debug "Object Name #{object_name}" + # @log.debug "Counter Name #{counter_name}" + # @log.debug "Metric Value #{metric_value}" + #return process_container_cpu_record(record, metric_value) + when @@counter_name_memory_rss + #return process_container_memory_record(record, metric_value) + end + when @@object_name_k8s_node + case counter_name.downcase + when @@counter_name_cpu + process_node_cpu_record(record, metric_value) + when @@counter_name_memory_rss + process_node_memory_record(record, metric_value) + end + end + end + rescue => e + @log.debug "Error in filter #{e}" + @log.debug "record #{record}" + @log.debug "backtrace #{e.backtrace}" + ApplicationInsightsUtility.sendExceptionTelemetry(e) + return nil + end + end + + def process_container_cpu_record(record, metric_value) + monitor_id = HealthMonitorConstants::CONTAINER_CPU_MONITOR_ID + @log.debug "processing container cpu record" + if record.nil? + return nil + else + instance_name = record['DataItems'][0]['InstanceName'] + key = HealthMonitorUtils.getContainerKeyFromInstanceName(instance_name) + container_metadata = HealthMonitorUtils.getContainerMetadata(key) + if !container_metadata.nil? 
+ cpu_limit = container_metadata['cpuLimit'] + end + + if cpu_limit.to_s.empty? + #@log.info "CPU Limit is nil" + cpu_limit = @cpu_capacity + end + + #@log.info "cpu limit #{cpu_limit}" + + percent = (metric_value.to_f/cpu_limit*100).round(2) + #@log.debug "Container #{key} | Percentage of CPU limit: #{percent}" + state = HealthMonitorUtils.compute_percentage_state(percent, @provider.get_config(monitor_id)) + #@log.debug "Computed State : #{state}" + timestamp = record['DataItems'][0]['Timestamp'] + health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"cpuUsageMillicores" => metric_value/1000000.to_f, "cpuUtilizationPercentage" => percent}} + #@log.info health_monitor_record + + monitor_instance_id = HealthMonitorUtils.get_monitor_instance_id(monitor_id, [@@clusterId, @@hostName, key]) + #@log.info "Monitor Instance Id: #{monitor_instance_id}" + temp = record.nil? ? "Nil" : record["MonitorInstanceId"] + @log.info "Processed Container CPU #{temp}" + return record + end + return nil + end + + def process_container_memory_record(record, metric_value) + monitor_id = HealthMonitorConstants::CONTAINER_MEMORY_MONITOR_ID + #@log.debug "processing container memory record" + if record.nil? + return nil + else + instance_name = record['DataItems'][0]['InstanceName'] + key = HealthMonitorUtils.getContainerKeyFromInstanceName(instance_name) + container_metadata = HealthMonitorUtils.getContainerMetadata(key) + if !container_metadata.nil? + memory_limit = container_metadata['memoryLimit'] + end + + if memory_limit.to_s.empty? 
+ #@log.info "Memory Limit is nil" + memory_limit = @memory_capacity + end + + #@log.info "memory limit #{memory_limit}" + + percent = (metric_value.to_f/memory_limit*100).round(2) + #@log.debug "Container #{key} | Percentage of Memory limit: #{percent}" + state = HealthMonitorUtils.compute_percentage_state(percent, @provider.get_config(HealthMonitorConstants::CONTAINER_MEMORY_MONITOR_ID)) + #@log.debug "Computed State : #{state}" + timestamp = record['DataItems'][0]['Timestamp'] + health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"memoryRssBytes" => metric_value.to_f, "memoryUtilizationPercentage" => percent}} + #@log.info health_monitor_record + + monitor_instance_id = HealthMonitorUtils.get_monitor_instance_id(monitor_id, [@@clusterId, @@hostName, key]) + #@log.info "Monitor Instance Id: #{monitor_instance_id}" + temp = record.nil? ? "Nil" : record["MonitorInstanceId"] + @log.info "Processed Container Memory #{temp}" + return record + end + return nil + end + + def process_node_cpu_record(record, metric_value) + monitor_id = HealthMonitorConstants::NODE_CPU_MONITOR_ID + #@log.debug "processing node cpu record" + if record.nil? + return nil + else + instance_name = record['DataItems'][0]['InstanceName'] + #@log.info "CPU capacity #{@cpu_capacity}" + + percent = (metric_value.to_f/@cpu_capacity*100).round(2) + #@log.debug "Percentage of CPU limit: #{percent}" + state = HealthMonitorUtils.compute_percentage_state(percent, @provider.get_config(HealthMonitorConstants::NODE_CPU_MONITOR_ID)) + #@log.debug "Computed State : #{state}" + timestamp = record['DataItems'][0]['Timestamp'] + health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"cpuUsageMillicores" => metric_value/1000000.to_f, "cpuUtilizationPercentage" => percent}} + + monitor_instance_id = HealthMonitorUtils.get_monitor_instance_id(monitor_id, [@@clusterId, @@hostName]) + # temp = record.nil? ? 
"Nil" : record["MonitorInstanceId"] + health_record = {} + time_now = Time.now.utc.iso8601 + health_record[HealthMonitorRecordFields::MONITOR_ID] = monitor_id + health_record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] = monitor_instance_id + health_record[HealthMonitorRecordFields::DETAILS] = health_monitor_record + health_record[HealthMonitorRecordFields::AGENT_COLLECTION_TIME] = time_now + health_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_now + health_record[HealthMonitorRecordFields::NODE_NAME] = @@hostName + @log.info "Processed Node CPU" + return health_record + end + return nil + end + + def process_node_memory_record(record, metric_value) + monitor_id = HealthMonitorConstants::NODE_MEMORY_MONITOR_ID + #@log.debug "processing node memory record" + if record.nil? + return nil + else + instance_name = record['DataItems'][0]['InstanceName'] + #@log.info "Memory capacity #{@memory_capacity}" + + percent = (metric_value.to_f/@memory_capacity*100).round(2) + #@log.debug "Percentage of Memory limit: #{percent}" + state = HealthMonitorUtils.compute_percentage_state(percent, @provider.get_config(HealthMonitorConstants::NODE_MEMORY_MONITOR_ID)) + #@log.debug "Computed State : #{state}" + timestamp = record['DataItems'][0]['Timestamp'] + health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"memoryRssBytes" => metric_value.to_f, "memoryUtilizationPercentage" => percent}} + #@log.info health_monitor_record + + monitor_instance_id = HealthMonitorUtils.get_monitor_instance_id(monitor_id, [@@clusterId, @@hostName]) + health_record = {} + time_now = Time.now.utc.iso8601 + health_record[HealthMonitorRecordFields::MONITOR_ID] = monitor_id + health_record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] = monitor_instance_id + health_record[HealthMonitorRecordFields::DETAILS] = health_monitor_record + health_record[HealthMonitorRecordFields::AGENT_COLLECTION_TIME] = time_now + 
health_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_now + health_record[HealthMonitorRecordFields::NODE_NAME] = @@hostName + @log.info "Processed Node Memory" + return health_record + end + return nil + end + end +end diff --git a/source/code/plugin/filter_health_model_builder.rb b/source/code/plugin/filter_health_model_builder.rb new file mode 100644 index 000000000..0c1b378a0 --- /dev/null +++ b/source/code/plugin/filter_health_model_builder.rb @@ -0,0 +1,233 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. + +# frozen_string_literal: true + +module Fluent + require 'logger' + require 'json' + Dir[File.join(__dir__, './health', '*.rb')].each { |file| require file } + + + class FilterHealthModelBuilder < Filter + Fluent::Plugin.register_filter('filter_health_model_builder', self) + + config_param :enable_log, :integer, :default => 0 + config_param :log_path, :string, :default => '/var/opt/microsoft/docker-cimprov/log/filter_health_model_builder.log' + config_param :model_definition_path, :default => '/etc/opt/microsoft/docker-cimprov/health/health_model_definition.json' + config_param :health_monitor_config_path, :default => '/etc/opt/microsoft/docker-cimprov/health/healthmonitorconfig.json' + config_param :health_state_serialized_path, :default => '/mnt/azure/health_model_state.json' + attr_reader :buffer, :model_builder, :health_model_definition, :monitor_factory, :state_finalizers, :monitor_set, :model_builder, :hierarchy_builder, :resources, :kube_api_down_handler, :provider, :reducer, :state, :generator + include HealthModel + + @@rewrite_tag = 'oms.api.KubeHealth.AgentCollectionTime' + @@cluster_id = KubernetesApiClient.getClusterId + @@token_file_path = "/var/run/secrets/kubernetes.io/serviceaccount/token" + @@cert_file_path = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + @@cluster_health_model_enabled = HealthMonitorUtils.is_cluster_health_model_enabled + + def initialize + begin + super + @buffer = 
HealthModel::HealthModelBuffer.new + @cluster_health_state = ClusterHealthState.new(@@token_file_path, @@cert_file_path) + @health_model_definition = HealthModel::ParentMonitorProvider.new(HealthModel::HealthModelDefinitionParser.new(@model_definition_path).parse_file) + @monitor_factory = HealthModel::MonitorFactory.new + @hierarchy_builder = HealthHierarchyBuilder.new(@health_model_definition, @monitor_factory) + # TODO: Figure out if we need to add NodeMonitorHierarchyReducer to the list of finalizers. For now, dont compress/optimize, since it becomes impossible to construct the model on the UX side + @state_finalizers = [HealthModel::AggregateMonitorStateFinalizer.new] + @monitor_set = HealthModel::MonitorSet.new + @model_builder = HealthModel::HealthModelBuilder.new(@hierarchy_builder, @state_finalizers, @monitor_set) + @kube_api_down_handler = HealthKubeApiDownHandler.new + @resources = HealthKubernetesResources.instance + @reducer = HealthSignalReducer.new + @state = HealthMonitorState.new + @generator = HealthMissingSignalGenerator.new + #TODO: cluster_labels needs to be initialized + @provider = HealthMonitorProvider.new(@@cluster_id, HealthMonitorUtils.get_cluster_labels, @resources, @health_monitor_config_path) + deserialized_state_info = @cluster_health_state.get_state + @state = HealthMonitorState.new + @state.initialize_state(deserialized_state_info) + @cluster_old_state = 'none' + @cluster_new_state = 'none' + rescue => e + ApplicationInsightsUtility.sendExceptionTelemetry(e, {"FeatureArea" => "Health"}) + end + end + + def configure(conf) + begin + super + @log = nil + if @enable_log + @log = Logger.new(@log_path, 'weekly') + @log.info 'Starting filter_health_model_builder plugin' + end + rescue => e + ApplicationInsightsUtility.sendExceptionTelemetry(e, {"FeatureArea" => "Health"}) + end + end + + def start + super + end + + def shutdown + super + end + + def filter_stream(tag, es) + begin + if !@@cluster_health_model_enabled + @log.info "Cluster 
Health Model disabled in filter_health_model_builder" + return [] + end + new_es = MultiEventStream.new + time = Time.now + + if tag.start_with?("oms.api.KubeHealth.DaemonSet") + records = [] + if !es.nil? + es.each{|time, record| + records.push(record) + } + @buffer.add_to_buffer(records) + end + return [] + elsif tag.start_with?("oms.api.KubeHealth.ReplicaSet") + @log.info "TAG #{tag}" + records = [] + es.each{|time, record| + records.push(record) + } + @buffer.add_to_buffer(records) + records_to_process = @buffer.get_buffer + @buffer.reset_buffer + + health_monitor_records = [] + records_to_process.each do |record| + monitor_instance_id = record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] + monitor_id = record[HealthMonitorRecordFields::MONITOR_ID] + #HealthMonitorRecord + health_monitor_record = HealthMonitorRecord.new( + record[HealthMonitorRecordFields::MONITOR_ID], + record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID], + record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED], + record[HealthMonitorRecordFields::DETAILS]["state"], + @provider.get_labels(record), + @provider.get_config(monitor_id), + record[HealthMonitorRecordFields::DETAILS] + ) + + health_monitor_records.push(health_monitor_record) + #puts "#{monitor_instance_id} #{instance_state.new_state} #{instance_state.old_state} #{instance_state.should_send}" + end + + @log.info "health_monitor_records.size #{health_monitor_records.size}" + # Dedupe daemonset signals + # Remove unit monitor signals for “gone” objects + # update state for the reduced set of signals + reduced_records = @reducer.reduce_signals(health_monitor_records, @resources) + reduced_records.each{|record| + @state.update_state(record, + @provider.get_config(record.monitor_id) + ) + # get the health state based on the monitor's operational state + # update state calls updates the state of the monitor based on configuration and history of the the monitor records + record.state = 
@state.get_state(record.monitor_instance_id).new_state + } + @log.info "after deduping and removing gone objects reduced_records.size #{reduced_records.size}" + + reduced_records = @kube_api_down_handler.handle_kube_api_down(reduced_records) + @log.info "after kube api down handler health_monitor_records.size #{health_monitor_records.size}" + + #get the list of 'none' and 'unknown' signals + missing_signals = @generator.get_missing_signals(@@cluster_id, reduced_records, @resources, @provider) + + @log.info "after getting missing signals missing_signals.size #{missing_signals.size}" + #update state for missing signals + missing_signals.each{|signal| + + @state.update_state(signal, @provider.get_config(signal.monitor_id)) + @log.info "After Updating #{@state.get_state(signal.monitor_instance_id)} #{@state.get_state(signal.monitor_instance_id).new_state}" + # for unknown/none records, update the "monitor state" to be the latest state (new_state) of the monitor instance from the state + signal.state = @state.get_state(signal.monitor_instance_id).new_state + } + + @generator.update_last_received_records(reduced_records) + all_records = reduced_records.clone + all_records.push(*missing_signals) + + @log.info "after Adding missing signals all_records.size #{all_records.size}" + + # build the health model + @model_builder.process_records(all_records) + all_monitors = @model_builder.finalize_model + + @log.info "after building health_model #{all_monitors.size}" + + # update the state for aggregate monitors (unit monitors are updated above) + all_monitors.each{|monitor_instance_id, monitor| + if monitor.is_aggregate_monitor + @state.update_state(monitor, + @provider.get_config(monitor.monitor_id) + ) + end + + instance_state = @state.get_state(monitor_instance_id) + #puts "#{monitor_instance_id} #{instance_state.new_state} #{instance_state.old_state} #{instance_state.should_send}" + should_send = instance_state.should_send + + # always send cluster monitor as a heartbeat + 
if !should_send && monitor_instance_id != MonitorId::CLUSTER + all_monitors.delete(monitor_instance_id) + end + } + + @log.info "after optimizing health signals all_monitors.size #{all_monitors.size}" + + # for each key in monitor.keys, + # get the state from health_monitor_state + # generate the record to send + all_monitors.keys.each{|key| + record = @provider.get_record(all_monitors[key], state) + if record[HealthMonitorRecordFields::MONITOR_ID] == MonitorId::CLUSTER && all_monitors.size > 1 + old_state = record[HealthMonitorRecordFields::OLD_STATE] + new_state = record[HealthMonitorRecordFields::NEW_STATE] + if old_state != new_state && @cluster_old_state != old_state && @cluster_new_state != new_state + ApplicationInsightsUtility.sendCustomEvent("HealthModel_ClusterStateChanged",{"old_state" => old_state , "new_state" => new_state, "monitor_count" => all_monitors.size}) + @log.info "sent telemetry for cluster state change from #{record['OldState']} to #{record['NewState']}" + @cluster_old_state = old_state + @cluster_new_state = new_state + end + end + #@log.info "#{record["Details"]} #{record["MonitorInstanceId"]} #{record["OldState"]} #{record["NewState"]}" + new_es.add(time, record) + } + + #emit the stream + router.emit_stream(@@rewrite_tag, new_es) + + #initialize monitor_set and model_builder + @monitor_set = HealthModel::MonitorSet.new + @model_builder = HealthModel::HealthModelBuilder.new(@hierarchy_builder, @state_finalizers, @monitor_set) + + #update cluster state custom resource + @cluster_health_state.update_state(@state.to_h) + + # return an empty event stream, else the match will throw a NoMethodError + return [] + elsif tag.start_with?("oms.api.KubeHealth.AgentCollectionTime") + # this filter also acts as a pass through as we are rewriting the tag and emitting to the fluent stream + es + else + raise 'Invalid tag #{tag} received' + end + + rescue => e + ApplicationInsightsUtility.sendExceptionTelemetry(e, {"FeatureArea" => "Health"}) + @log.warn 
"Message: #{e.message} Backtrace: #{e.backtrace}" + return nil + end + end + end +end diff --git a/source/code/plugin/health/agg_monitor_id_labels.rb b/source/code/plugin/health/agg_monitor_id_labels.rb new file mode 100644 index 000000000..48ca46184 --- /dev/null +++ b/source/code/plugin/health/agg_monitor_id_labels.rb @@ -0,0 +1,26 @@ +module HealthModel + class AggregateMonitorInstanceIdLabels + @@id_labels_mapping = { + MonitorId::SYSTEM_WORKLOAD => [HealthMonitorLabels::NAMESPACE, HealthMonitorLabels::WORKLOAD_NAME], + MonitorId::USER_WORKLOAD => [HealthMonitorLabels::NAMESPACE, HealthMonitorLabels::WORKLOAD_NAME], + MonitorId::NODE => [HealthMonitorLabels::AGENTPOOL, HealthMonitorLabels::ROLE, HealthMonitorLabels::HOSTNAME], + MonitorId::NAMESPACE => [HealthMonitorLabels::NAMESPACE], + MonitorId::AGENT_NODE_POOL => [HealthMonitorLabels::AGENTPOOL], + # MonitorId::ALL_AGENT_NODE_POOLS => [], + # MonitorId::ALL_NODE_POOLS => [], + # MonitorId::ALL_NODES => [], + # MonitorId::K8S_INFRASTRUCTURE => [], + # MonitorId::CLUSTER => [], + # MonitorId::WORKLOAD => [] + } + + def self.get_labels_for(monitor_id) + if @@id_labels_mapping.key?(monitor_id) + return @@id_labels_mapping[monitor_id] + else + return [] + end + + end + end +end \ No newline at end of file diff --git a/source/code/plugin/health/aggregate_monitor.rb b/source/code/plugin/health/aggregate_monitor.rb new file mode 100644 index 000000000..794f716ce --- /dev/null +++ b/source/code/plugin/health/aggregate_monitor.rb @@ -0,0 +1,193 @@ +# frozen_string_literal: true + +require_relative 'health_model_constants' +require 'json' + +module HealthModel + class AggregateMonitor + attr_accessor :monitor_id, :monitor_instance_id, :state, :transition_date_time, :aggregation_algorithm, :aggregation_algorithm_params, :labels, :is_aggregate_monitor, :details + attr_reader :member_monitors, :member_state_counts + + @@sort_key_order = { + MonitorState::UNKNOWN => 1, + MonitorState::CRITICAL => 2, + 
MonitorState::WARNING => 3, + MonitorState::HEALTHY => 4, + MonitorState::NONE => 5 + } + + # constructor + def initialize( + monitor_id, + monitor_instance_id, + state, + transition_date_time, + aggregation_algorithm, + aggregation_algorithm_params, + labels + ) + @monitor_id = monitor_id + @monitor_instance_id = monitor_instance_id + @state = state + @transition_date_time = transition_date_time + @aggregation_algorithm = aggregation_algorithm || AggregationAlgorithm::WORSTOF + @aggregation_algorithm_params = aggregation_algorithm_params + @labels = labels + @member_monitors = {} + @member_state_counts = {} + @is_aggregate_monitor = true + end + + # adds a member monitor as a child + def add_member_monitor(member_monitor_instance_id) + unless @member_monitors.key?(member_monitor_instance_id) + @member_monitors[member_monitor_instance_id] = true + end + end + + #removes a member monitor + def remove_member_monitor(member_monitor_instance_id) + if @member_monitors.key?(member_monitor_instance_id) + @member_monitors.delete(member_monitor_instance_id) + end + end + + # return the member monitors as an array + def get_member_monitors + @member_monitors.map(&:first) + end + + # calculates the state of the aggregate monitor based on aggregation algorithm and child monitor states + def calculate_state(monitor_set) + case @aggregation_algorithm + when AggregationAlgorithm::WORSTOF + @state = calculate_worst_of_state(monitor_set) + when AggregationAlgorithm::PERCENTAGE + @state = calculate_percentage_state(monitor_set) + else + raise 'No aggregation algorithm specified' + end + end + + def calculate_details(monitor_set) + @details = {} + @details['details'] = {} + @details['state'] = state + @details['timestamp'] = transition_date_time + ids = [] + member_monitor_instance_ids = get_member_monitors + member_monitor_instance_ids.each{|member_monitor_id| + member_monitor = monitor_set.get_monitor(member_monitor_id) + member_state = member_monitor.state + if 
@details['details'].key?(member_state) + ids = @details['details'][member_state] + if !ids.include?(member_monitor.monitor_instance_id) + ids.push(member_monitor.monitor_instance_id) + end + @details['details'][member_state] = ids + else + @details['details'][member_state] = [member_monitor.monitor_instance_id] + end + } + end + + # calculates the worst of state, given the member monitors + def calculate_worst_of_state(monitor_set) + + @member_state_counts = map_member_monitor_states(monitor_set) + + if member_state_counts.length === 0 + return MonitorState::NONE + end + + if member_state_counts.key?(MonitorState::CRITICAL) && member_state_counts[MonitorState::CRITICAL] > 0 + return MonitorState::CRITICAL + end + if member_state_counts.key?(MonitorState::ERROR) && member_state_counts[MonitorState::ERROR] > 0 + return MonitorState::ERROR + end + if member_state_counts.key?(MonitorState::WARNING) && member_state_counts[MonitorState::WARNING] > 0 + return MonitorState::WARNING + end + + if member_state_counts.key?(MonitorState::UNKNOWN) && member_state_counts[MonitorState::UNKNOWN] > 0 + return MonitorState::UNKNOWN + end + + if member_state_counts.key?(MonitorState::HEALTHY) && member_state_counts[MonitorState::HEALTHY] > 0 + return MonitorState::HEALTHY #healthy should win over none in aggregation + end + + return MonitorState::NONE + + end + + # calculates a percentage state, given the aggregation algorithm parameters + def calculate_percentage_state(monitor_set) + + #sort + #TODO: What if sorted_filtered is empty? is that even possible? 
+ sorted_filtered = sort_filter_member_monitors(monitor_set) + + state_threshold = @aggregation_algorithm_params['state_threshold'].to_f + + size = sorted_filtered.size + if size == 1 + @state = sorted_filtered[0].state + else + count = ((state_threshold*size)/100).ceil + index = size - count + @state = sorted_filtered[index].state + end + end + + # maps states of member monitors to counts + def map_member_monitor_states(monitor_set) + member_monitor_instance_ids = get_member_monitors + if member_monitor_instance_ids.nil? || member_monitor_instance_ids.size == 0 + return {} + end + + state_counts = {} + + member_monitor_instance_ids.each {|monitor_instance_id| + + member_monitor = monitor_set.get_monitor(monitor_instance_id) + monitor_state = member_monitor.state + + if !state_counts.key?(monitor_state) + state_counts[monitor_state] = 1 + else + count = state_counts[monitor_state] + state_counts[monitor_state] = count+1 + end + } + + return state_counts; + end + + # Sort the member monitors in the following order +=begin + 1. Error + 2. Unknown + 3. Critical + 4. Warning + 5. 
Healthy + Remove 'none' state monitors +=end + def sort_filter_member_monitors(monitor_set) + member_monitor_instance_ids = get_member_monitors + member_monitors = [] + + member_monitor_instance_ids.each {|monitor_instance_id| + member_monitor = monitor_set.get_monitor(monitor_instance_id) + member_monitors.push(member_monitor) + } + + filtered = member_monitors.select{|monitor| monitor.state != MonitorState::NONE} + sorted = filtered.sort_by{ |monitor| [@@sort_key_order[monitor.state]] } + + return sorted + end + end +end diff --git a/source/code/plugin/health/aggregate_monitor_state_finalizer.rb b/source/code/plugin/health/aggregate_monitor_state_finalizer.rb new file mode 100644 index 000000000..74e780924 --- /dev/null +++ b/source/code/plugin/health/aggregate_monitor_state_finalizer.rb @@ -0,0 +1,33 @@ +module HealthModel + class AggregateMonitorStateFinalizer + + def finalize(monitor_set) + top_level_monitor = monitor_set.get_monitor(MonitorId::CLUSTER) + if !top_level_monitor.nil? + calculate_subtree_state(top_level_monitor, monitor_set) + end + monitor_set.get_map.each{|k,v| + if v.is_aggregate_monitor + v.calculate_details(monitor_set) + end + } + end + + private + def calculate_subtree_state(monitor, monitor_set) + if monitor.nil? || !monitor.is_aggregate_monitor + raise 'AggregateMonitorStateFinalizer:calculateSubtreeState Parameter monitor must be non-null AggregateMonitor' + end + + member_monitor_instance_ids = monitor.get_member_monitors # monitor_instance_ids + member_monitor_instance_ids.each{|member_monitor_instance_id| + member_monitor = monitor_set.get_monitor(member_monitor_instance_id) + + if !member_monitor.nil? 
&& member_monitor.is_aggregate_monitor + calculate_subtree_state(member_monitor, monitor_set) + end + } + monitor.calculate_state(monitor_set) + end + end +end \ No newline at end of file diff --git a/source/code/plugin/health/cluster_health_state.rb b/source/code/plugin/health/cluster_health_state.rb new file mode 100644 index 000000000..ac7e05675 --- /dev/null +++ b/source/code/plugin/health/cluster_health_state.rb @@ -0,0 +1,115 @@ +require "net/http" +require "net/https" +require "uri" + +module HealthModel + class ClusterHealthState + + attr_reader :token_file_path, :cert_file_path, :log, :http_client, :uri, :token + @@resource_uri_template = "%{kube_api_server_url}/apis/azmon.container.insights/v1/namespaces/kube-system/healthstates/cluster-health-state" + + def initialize(token_file_path, cert_file_path) + @token_file_path = token_file_path + @cert_file_path = cert_file_path + @log = HealthMonitorHelpers.get_log_handle + @http_client = get_http_client + @token = get_token + end + + def update_state(state) + get_request = Net::HTTP::Get.new(@uri.request_uri) + + get_request["Authorization"] = "Bearer #{@token}" + @log.info "Making GET request to #{@uri.request_uri} @ #{Time.now.utc.iso8601}" + get_response = @http_client.request(get_request) + @log.info "Got response of #{get_response.code} for #{@uri.request_uri} @ #{Time.now.utc.iso8601}" + + if get_response.code.to_i == 404 # NOT found + #POST + update_request = Net::HTTP::Post.new(@uri.request_uri) + update_request["Content-Type"] = "application/json" + + elsif get_response.code.to_i == 200 # Update == Patch + #PATCH + update_request = Net::HTTP::Patch.new(@uri.request_uri) + update_request["Content-Type"] = "application/merge-patch+json" + end + update_request["Authorization"] = "Bearer #{@token}" + + update_request_body = get_update_request_body + update_request_body["state"] = state.to_json + update_request.body = update_request_body.to_json + + update_response = @http_client.request(update_request) + 
@log.info "Got a response of #{update_response.code} for #{update_request.method}" + end + + def get_state + get_request = Net::HTTP::Get.new(@uri.request_uri) + get_request["Authorization"] = "Bearer #{@token}" + @log.info "Making GET request to #{@uri.request_uri} @ #{Time.now.utc.iso8601}" + get_response = @http_client.request(get_request) + @log.info "Got response of #{get_response.code} for #{@uri.request_uri} @ #{Time.now.utc.iso8601}" + + if get_response.code.to_i == 200 + return JSON.parse(JSON.parse(get_response.body)["state"]) + else + return {} + end + end + + private + def get_token() + begin + if File.exist?(@token_file_path) && File.readable?(@token_file_path) + token_str = File.read(@token_file_path).strip + return token_str + else + @log.info ("Unable to read token string from #{@token_file_path}") + return nil + end + end + end + + def get_http_client() + kube_api_server_url = get_kube_api_server_url + resource_uri = @@resource_uri_template % { + kube_api_server_url: kube_api_server_url + } + @uri = URI.parse(resource_uri) + http = Net::HTTP.new(@uri.host, @uri.port) + http.use_ssl = true + if !File.exist?(@cert_file_path) + raise "#{@cert_file_path} doesnt exist" + else + http.ca_file = @cert_file_path + end + http.verify_mode = OpenSSL::SSL::VERIFY_PEER + return http + end + + def get_kube_api_server_url + if ENV["KUBERNETES_SERVICE_HOST"] && ENV["KUBERNETES_PORT_443_TCP_PORT"] + return "https://#{ENV["KUBERNETES_SERVICE_HOST"]}:#{ENV["KUBERNETES_PORT_443_TCP_PORT"]}" + else + @log.warn ("Kubernetes environment variable not set KUBERNETES_SERVICE_HOST: #{ENV["KUBERNETES_SERVICE_HOST"]} KUBERNETES_PORT_443_TCP_PORT: #{ENV["KUBERNETES_PORT_443_TCP_PORT"]}. Unable to form resourceUri") + if Gem.win_platform? #unit testing on windows dev machine + value = %x( kubectl -n default get endpoints kubernetes --no-headers) + url = "https://#{value.split(' ')[1]}" + return "https://localhost:8080" # This is NEVER used. 
this is just to return SOME value + end + return nil + end + end + + def get_update_request_body + body = {} + body["apiVersion"] = "azmon.container.insights/v1" + body["kind"] = "HealthState" + body["metadata"] = {} + body["metadata"]["name"] = "cluster-health-state" + body["metadata"]["namespace"] = "kube-system" + return body + end + end +end diff --git a/source/code/plugin/health/health_hierarchy_builder.rb b/source/code/plugin/health/health_hierarchy_builder.rb new file mode 100644 index 000000000..2da0050db --- /dev/null +++ b/source/code/plugin/health/health_hierarchy_builder.rb @@ -0,0 +1,76 @@ +require 'json' +module HealthModel + class HealthHierarchyBuilder + + attr_accessor :health_model_definition, :monitor_factory + + def initialize(health_model_definition, monitor_factory) + + if !health_model_definition.is_a?(ParentMonitorProvider) + raise "Invalid Type Expected: ParentMonitorProvider Actual: #{@health_model_definition.class.name}" + end + @health_model_definition = health_model_definition + + if !monitor_factory.is_a?(MonitorFactory) + raise "Invalid Type Expected: MonitorFactory Actual: #{@monitor_factory.class.name}" + end + @monitor_factory = monitor_factory + end + + def process_record(health_monitor_record, monitor_set) + if !health_monitor_record.is_a?(HealthMonitorRecord) + raise "Unexpected Type #{health_monitor_record.class}" + end + + # monitor state transition will always be on a unit monitor + child_monitor = @monitor_factory.create_unit_monitor(health_monitor_record) + monitor_set.add_or_update(child_monitor) + parent_monitor_id = @health_model_definition.get_parent_monitor_id(child_monitor) + monitor_labels = child_monitor.labels + monitor_id = child_monitor.monitor_id + + # to construct the parent monitor, + # 1. Child's labels + # 2. Parent monitor's config to determine what labels to copy + # 3. Parent Monitor Id + # 4. 
Monitor Id --> Labels to hash Mapping to generate the monitor instance id for aggregate monitors + + while !parent_monitor_id.nil? + #puts "Parent Monitor Id #{parent_monitor_id}" + # get the set of labels to copy to parent monitor + parent_monitor_labels = @health_model_definition.get_parent_monitor_labels(monitor_id, monitor_labels, parent_monitor_id) + # get the parent monitor configuration + parent_monitor_configuration = @health_model_definition.get_parent_monitor_config(parent_monitor_id) + #get monitor instance id for parent monitor. Does this belong in ParentMonitorProvider? + parent_monitor_instance_id = @health_model_definition.get_parent_monitor_instance_id(child_monitor.monitor_instance_id, parent_monitor_id, parent_monitor_labels) + # check if monitor set has the parent monitor id + # if not present, add + # if present, update the state based on the aggregation algorithm + parent_monitor = nil + if !monitor_set.contains?(parent_monitor_instance_id) + parent_monitor = @monitor_factory.create_aggregate_monitor(parent_monitor_id, parent_monitor_instance_id, parent_monitor_labels, parent_monitor_configuration['aggregation_algorithm'], parent_monitor_configuration['aggregation_algorithm_params'], child_monitor) + parent_monitor.add_member_monitor(child_monitor.monitor_instance_id) + else + parent_monitor = monitor_set.get_monitor(parent_monitor_instance_id) + # required to calculate the rollup state + parent_monitor.add_member_monitor(child_monitor.monitor_instance_id) + # update to the earliest of the transition times of child monitors + if child_monitor.transition_date_time < parent_monitor.transition_date_time + parent_monitor.transition_date_time = child_monitor.transition_date_time + end + end + + if parent_monitor.nil? 
+ raise 'Parent_monitor should not be nil for #{monitor_id}' + end + + monitor_set.add_or_update(parent_monitor) + + child_monitor = parent_monitor + parent_monitor_id = @health_model_definition.get_parent_monitor_id(child_monitor) + monitor_labels = child_monitor.labels + monitor_id = child_monitor.monitor_id + end + end + end +end \ No newline at end of file diff --git a/source/code/plugin/health/health_kube_api_down_handler.rb b/source/code/plugin/health/health_kube_api_down_handler.rb new file mode 100644 index 000000000..7f7ba1bd3 --- /dev/null +++ b/source/code/plugin/health/health_kube_api_down_handler.rb @@ -0,0 +1,27 @@ +module HealthModel + class HealthKubeApiDownHandler + def initialize + @@monitors_to_change = [HealthMonitorConstants::WORKLOAD_CPU_OVERSUBSCRIBED_MONITOR_ID, + HealthMonitorConstants::WORKLOAD_MEMORY_OVERSUBSCRIBED_MONITOR_ID, + HealthMonitorConstants::NODE_CONDITION_MONITOR_ID, + HealthMonitorConstants::USER_WORKLOAD_PODS_READY_MONITOR_ID, + HealthMonitorConstants::SYSTEM_WORKLOAD_PODS_READY_MONITOR_ID] + end + + # update kube-api dependent monitors to be 'unknown' if kube-api is down or monitor is unavailable + def handle_kube_api_down(health_monitor_records) + health_monitor_records_map = {} + + health_monitor_records.map{|record| health_monitor_records_map[record.monitor_instance_id] = record} + if !health_monitor_records_map.key?(HealthMonitorConstants::KUBE_API_STATUS) || (health_monitor_records_map.key?(HealthMonitorConstants::KUBE_API_STATUS) && health_monitor_records_map[HealthMonitorConstants::KUBE_API_STATUS].state != 'pass') + #iterate over the map and set the state to unknown for related monitors + health_monitor_records.each{|health_monitor_record| + if @@monitors_to_change.include?(health_monitor_record.monitor_id) + health_monitor_record.state = HealthMonitorStates::UNKNOWN + end + } + end + return health_monitor_records + end + end +end \ No newline at end of file diff --git 
a/source/code/plugin/health/health_kubernetes_resources.rb b/source/code/plugin/health/health_kubernetes_resources.rb new file mode 100644 index 000000000..53f879bf5 --- /dev/null +++ b/source/code/plugin/health/health_kubernetes_resources.rb @@ -0,0 +1,102 @@ +require 'singleton' + +module HealthModel + class HealthKubernetesResources + + include Singleton + attr_accessor :node_inventory, :pod_inventory, :deployment_inventory + attr_reader :nodes, :pods, :workloads + + def initialize + @node_inventory = [] + @pod_inventory = [] + @deployment_inventory = [] + @nodes = [] + @pods = [] + @workloads = [] + @log = HealthMonitorHelpers.get_log_handle + end + + def get_node_inventory + return @node_inventory + end + + def get_nodes + @nodes = [] + @node_inventory['items'].each {|node| + if !@nodes.include?(node['metadata']['name']) + @nodes.push(node['metadata']['name']) + end + + } + return @nodes + end + + def get_pod_inventory + return @pod_inventory + end + + def get_pods + return @pods + end + + def get_workload_names + @pods = [] + workload_names = {} + deployment_lookup = {} + @deployment_inventory['items'].each do |deployment| + match_labels = deployment['spec']['selector']['matchLabels'].to_h + namespace = deployment['metadata']['namespace'] + match_labels.each{|k,v| + deployment_lookup["#{namespace}-#{k}=#{v}"] = "#{deployment['metadata']['namespace']}~~#{deployment['metadata']['name']}" + } + end + @pod_inventory['items'].each do |pod| + begin + has_owner = !pod['metadata']['ownerReferences'].nil? + owner_kind = '' + if has_owner + owner_kind = pod['metadata']['ownerReferences'][0]['kind'] + controller_name = pod['metadata']['ownerReferences'][0]['name'] + else + owner_kind = pod['kind'] + controller_name = pod['metadata']['name'] + end + + namespace = pod['metadata']['namespace'] + + workload_name = '' + if owner_kind.nil? 
+ owner_kind = 'Pod' + end + case owner_kind.downcase + when 'job' + # we are excluding jobs + next + when 'replicaset' + # get the labels, and see if there is a match. If there is, it is the deployment. If not, use replica set name/controller name + labels = pod['metadata']['labels'].to_h + labels.each {|k,v| + lookup_key = "#{namespace}-#{k}=#{v}" + if deployment_lookup.key?(lookup_key) + workload_name = deployment_lookup[lookup_key] + break + end + } + if workload_name.empty? + workload_name = "#{namespace}~~#{controller_name}" + end + when 'daemonset' + workload_name = "#{namespace}~~#{controller_name}" + else + workload_name = "#{namespace}~~#{pod['metadata']['name']}" + end + rescue => e + @log.info "Error when processing pod #{pod['metadata']['name']} #{e.message}" + end + workload_names[workload_name] = true + end + return workload_names.keys + end + end +end \ No newline at end of file diff --git a/source/code/plugin/health/health_missing_signal_generator.rb b/source/code/plugin/health/health_missing_signal_generator.rb new file mode 100644 index 000000000..ff7f6a390 --- /dev/null +++ b/source/code/plugin/health/health_missing_signal_generator.rb @@ -0,0 +1,142 @@ +module HealthModel + class HealthMissingSignalGenerator + attr_accessor :last_received_records, :current_received_records + attr_reader :missing_signals, :unknown_signals_hash + + def initialize() + @last_received_records = {} + @unknown_signals_hash = {} + end + + def get_missing_signals(cluster_id, health_monitor_records, health_k8s_inventory, provider) + missing_monitor_ids = [] + nodes = health_k8s_inventory.get_nodes + workload_names = health_k8s_inventory.get_workload_names + missing_signals_map = {} + missing_signals = [] + health_monitor_records_map = {} + health_monitor_records.map{ + |monitor| health_monitor_records_map[monitor.monitor_instance_id] = monitor + } + + node_signals_hash = {} + nodes.each{|node| + node_signals_hash[node] = [HealthMonitorConstants::NODE_CPU_MONITOR_ID, 
HealthMonitorConstants::NODE_MEMORY_MONITOR_ID, HealthMonitorConstants::NODE_CONDITION_MONITOR_ID] + } + log = HealthMonitorHelpers.get_log_handle + log.info "last_received_records #{@last_received_records.size} nodes #{nodes}" + @last_received_records.each{|monitor_instance_id, monitor| + if !health_monitor_records_map.key?(monitor_instance_id) + if HealthMonitorHelpers.is_node_monitor(monitor.monitor_id) + node_name = monitor.labels['kubernetes.io/hostname'] + new_monitor = HealthMonitorRecord.new( + monitor.monitor_id, + monitor.monitor_instance_id, + Time.now.utc.iso8601, + monitor.state, + monitor.labels, + monitor.config, + {"timestamp" => Time.now.utc.iso8601, "state" => HealthMonitorStates::UNKNOWN, "details" => ""} + ) + if !node_name.nil? && nodes.include?(node_name) + new_monitor.state = HealthMonitorStates::UNKNOWN + new_monitor.details["state"] = HealthMonitorStates::UNKNOWN + new_monitor.details["details"] = "Node present in inventory but no signal for #{monitor.monitor_id} from node #{node_name}" + @unknown_signals_hash[monitor_instance_id] = new_monitor + elsif !node_name.nil? && !nodes.include?(node_name) + new_monitor.state = HealthMonitorStates::NONE + new_monitor.details["state"] = HealthMonitorStates::NONE + new_monitor.details["details"] = "Node NOT present in inventory. node: #{node_name}" + end + missing_signals_map[monitor_instance_id] = new_monitor + log.info "Added missing signal #{new_monitor.monitor_instance_id} #{new_monitor.state}" + elsif HealthMonitorHelpers.is_pods_ready_monitor(monitor.monitor_id) + lookup = "#{monitor.labels[HealthMonitorLabels::NAMESPACE]}~~#{monitor.labels[HealthMonitorLabels::WORKLOAD_NAME]}" + new_monitor = HealthMonitorRecord.new( + monitor.monitor_id, + monitor.monitor_instance_id, + Time.now.utc.iso8601, + monitor.state, + monitor.labels, + monitor.config, + {"timestamp" => Time.now.utc.iso8601, "state" => HealthMonitorStates::UNKNOWN, "details" => ""} + ) + if !lookup.nil? 
&& workload_names.include?(lookup) + new_monitor.state = HealthMonitorStates::UNKNOWN + new_monitor.details["state"] = HealthMonitorStates::UNKNOWN + new_monitor.details["details"] = "Workload present in inventory. But no signal for #{lookup}" + @unknown_signals_hash[monitor_instance_id] = new_monitor + elsif !lookup.nil? && !workload_names.include?(lookup) + new_monitor.state = HealthMonitorStates::NONE + new_monitor.details["state"] = HealthMonitorStates::NONE + new_monitor.details["details"] = "Workload #{lookup} NOT present in inventory" + end + missing_signals_map[monitor_instance_id] = new_monitor + end + end + } + + + health_monitor_records.each{|health_monitor_record| + # remove signals from the list of expected signals if we see them in the list of current signals + if HealthMonitorHelpers.is_node_monitor(health_monitor_record.monitor_id) + node_name = health_monitor_record.labels['kubernetes.io/hostname'] + if node_signals_hash.key?(node_name) + signals = node_signals_hash[node_name] + signals.delete(health_monitor_record.monitor_id) + if signals.size == 0 + node_signals_hash.delete(node_name) + end + end + end + } + + # if the hash is not empty, means we have missing signals + if node_signals_hash.size > 0 + # these signals were not sent previously + # these signals need to be assigned an unknown state + node_signals_hash.each{|node, monitor_ids| + monitor_ids.each{|monitor_id| + monitor_instance_id = HealthMonitorHelpers.get_monitor_instance_id(monitor_id, [cluster_id, node]) + new_monitor = HealthMonitorRecord.new( + monitor_id, + monitor_instance_id, + Time.now.utc.iso8601, + HealthMonitorStates::UNKNOWN, + provider.get_node_labels(node), + {}, + {"timestamp" => Time.now.utc.iso8601, "state" => HealthMonitorStates::UNKNOWN, "details" => "no signal received from node #{node}"} + ) + missing_signals_map[monitor_instance_id] = new_monitor + log.info "Added missing signal when node_signals_hash was not empty #{new_monitor.monitor_instance_id} 
module HealthModel

  # Accumulates health records between processing cycles; the plugin drains
  # it via get_buffer and clears it with reset_buffer.
  class HealthModelBuffer

    attr_reader :records_buffer, :log

    def initialize
      @records_buffer = []
    end

    # Current contents of the buffer.
    def get_buffer
      @records_buffer
    end

    # Appends a single record or a collection of records (splatted in,
    # so both forms flatten into the buffer).
    def add_to_buffer(records)
      @records_buffer.push(*records)
    end

    # Empties the buffer for the next collection cycle.
    def reset_buffer
      @records_buffer = []
    end
  end
end
require 'time'
begin
  require_relative 'health_model_constants'
rescue LoadError
  # Allow standalone unit testing when the sibling file is not present
  # (same guard pattern as health_monitor_utils.rb).
end

module HealthModel

  # Drives the health model pipeline: feeds raw health records through the
  # hierarchy builder, then runs each state finalizer over the monitor set
  # and returns the finalized monitor map.
  class HealthModelBuilder
    attr_accessor :hierarchy_builder, :state_finalizers, :monitor_set

    # hierarchy_builder: object responding to process_record(record, monitor_set)
    # state_finalizers:  Array of objects responding to finalize(monitor_set)
    # monitor_set:       object responding to get_map
    def initialize(hierarchy_builder, state_finalizers, monitor_set)
      @hierarchy_builder = hierarchy_builder
      @state_finalizers = state_finalizers
      @monitor_set = monitor_set
    end

    # Routes every incoming record through the hierarchy builder.
    def process_records(health_records)
      health_records.each { |health_record|
        @hierarchy_builder.process_record(health_record, @monitor_set)
      }
    end

    # Runs all finalizers in order and returns the finalized monitor map.
    # Raises (RuntimeError) when the finalizer list is not a non-empty array.
    def finalize_model
      raise 'state finalizers should be an array' unless @state_finalizers.is_a?(Array)
      raise '@state_finalizers length should not be zero or empty' if @state_finalizers.empty?

      @state_finalizers.each { |finalizer|
        finalizer.finalize(@monitor_set)
      }

      @monitor_set.get_map
    end
  end
end
module HealthModel

  # Column/field names used when emitting health monitor records.
  # All constants are frozen so shared string literals cannot be mutated.
  class HealthMonitorRecordFields
    CLUSTER_ID = "ClusterId".freeze
    MONITOR_ID = "MonitorId".freeze
    MONITOR_INSTANCE_ID = "MonitorInstanceId".freeze
    MONITOR_LABELS = "MonitorLabels".freeze
    DETAILS = "Details".freeze
    MONITOR_CONFIG = "MonitorConfig".freeze
    OLD_STATE = "OldState".freeze
    NEW_STATE = "NewState".freeze
    AGENT_COLLECTION_TIME = "AgentCollectionTime".freeze
    TIME_FIRST_OBSERVED = "TimeFirstObserved".freeze
    NODE_NAME = "NodeName".freeze
    NAMESPACE = "Namespace".freeze
  end

  # Well-known monitor ids.
  # NOTE(review): the WORKLOAD_CONTAINER_* ids intentionally(?) share values
  # with the CONTAINER_* ids -- confirm before deduplicating.
  class HealthMonitorConstants
    NODE_CPU_MONITOR_ID = "node_cpu_utilization".freeze
    NODE_MEMORY_MONITOR_ID = "node_memory_utilization".freeze
    CONTAINER_CPU_MONITOR_ID = "container_cpu_utilization".freeze
    CONTAINER_MEMORY_MONITOR_ID = "container_memory_utilization".freeze
    NODE_CONDITION_MONITOR_ID = "node_condition".freeze
    WORKLOAD_CPU_OVERSUBSCRIBED_MONITOR_ID = "subscribed_capacity_cpu".freeze
    WORKLOAD_MEMORY_OVERSUBSCRIBED_MONITOR_ID = "subscribed_capacity_memory".freeze
    WORKLOAD_CONTAINER_CPU_PERCENTAGE_MONITOR_ID = "container_cpu_utilization".freeze
    WORKLOAD_CONTAINER_MEMORY_PERCENTAGE_MONITOR_ID = "container_memory_utilization".freeze
    KUBE_API_STATUS = "kube_api_status".freeze
    USER_WORKLOAD_PODS_READY_MONITOR_ID = "user_workload_pods_ready".freeze
    SYSTEM_WORKLOAD_PODS_READY_MONITOR_ID = "system_workload_pods_ready".freeze
  end

  # Possible states a health monitor signal can report.
  class HealthMonitorStates
    PASS = "pass".freeze
    FAIL = "fail".freeze
    WARNING = "warn".freeze
    NONE = "none".freeze
    UNKNOWN = "unknown".freeze
  end

  # Label keys attached to monitor records.
  class HealthMonitorLabels
    WORKLOAD_NAME = "container.azm.ms/workload-name".freeze
    WORKLOAD_KIND = "container.azm.ms/workload-kind".freeze
    NAMESPACE = "container.azm.ms/namespace".freeze
    AGENTPOOL = "agentpool".freeze
    ROLE = "kubernetes.io/role".freeze
    HOSTNAME = "kubernetes.io/hostname".freeze
  end
end
require 'json'
require 'time'   # BUG FIX: HealthMonitorOptimizer uses Time.parse, which needs the 'time' extension
require 'logger'
require 'digest'

module HealthModel

  # Parses the health model definition file, which expresses the parent/child
  # relationship between monitors, how child states roll up to an aggregate
  # monitor, and which labels are passed on to the parent monitor.
  class HealthModelDefinitionParser
    attr_accessor :health_model_definition_path, :health_model_definition

    def initialize(path)
      @health_model_definition = {}
      @health_model_definition_path = path
    end

    # Reads the definition file and builds a monitor_id => definition-hash map.
    # Raises (RuntimeError) if the file does not exist.
    def parse_file
      raise "File does not exist in the specified path" unless File.exist?(@health_model_definition_path)

      temp_model = JSON.parse(File.read(@health_model_definition_path))
      temp_model.each { |entry|
        monitor_id = entry['monitor_id']
        parent_monitor_id = entry['parent_monitor_id']
        # plain indexing already yields nil for absent keys; the original's
        # "x = h[k] if h[k]" modifiers were redundant
        labels = entry['labels']
        aggregation_algorithm = entry['aggregation_algorithm']
        aggregation_algorithm_params = entry['aggregation_algorithm_params']

        if parent_monitor_id.is_a?(Array)
          # conditional parent: pick the parent whose label condition matches
          conditions = parent_monitor_id.map { |condition|
            {
              "key" => condition['label'],
              "operator" => condition['operator'],
              "value" => condition['value'],
              "parent_id" => condition['id']
            }
          }
          @health_model_definition[monitor_id] = {"conditions" => conditions, "labels" => labels, "aggregation_algorithm" => aggregation_algorithm, "aggregation_algorithm_params" => aggregation_algorithm_params}
        elsif parent_monitor_id.is_a?(String) || parent_monitor_id.nil?
          # string parent id, or nil for the root monitor
          @health_model_definition[monitor_id] = {"parent_monitor_id" => parent_monitor_id, "labels" => labels, "aggregation_algorithm" => aggregation_algorithm, "aggregation_algorithm_params" => aggregation_algorithm_params}
        end
      }
      @health_model_definition
    end
  end

  # Static utility methods shared by the health model plugins.
  class HealthMonitorHelpers

    # BUG FIX: the original Windows path was double-quoted
    # ("C:\Temp\health_monitors.log"), so the unknown escapes \T and \h
    # silently dropped the backslashes, yielding "C:Temphealth_monitors.log".
    # Single quotes preserve them.
    LOG_PATH = if Gem.win_platform? # unit testing on windows dev machine
                 'C:\Temp\health_monitors.log'
               else
                 "/var/opt/microsoft/docker-cimprov/log/health_monitors.log"
               end

    class << self
      def is_node_monitor(monitor_id)
        return (monitor_id == HealthMonitorConstants::NODE_CPU_MONITOR_ID || monitor_id == HealthMonitorConstants::NODE_MEMORY_MONITOR_ID || monitor_id == HealthMonitorConstants::NODE_CONDITION_MONITOR_ID)
      end

      def is_pods_ready_monitor(monitor_id)
        return (monitor_id == HealthMonitorConstants::USER_WORKLOAD_PODS_READY_MONITOR_ID || monitor_id == HealthMonitorConstants::SYSTEM_WORKLOAD_PODS_READY_MONITOR_ID)
      end

      # BUG FIX: the logger is now created lazily, so merely loading this file
      # no longer raises Errno::ENOENT on machines where the log directory is
      # absent. Keeps last 2 files, max log file size = 10M (as before).
      def get_log_handle
        @log ||= Logger.new(LOG_PATH, 2, 10 * 1048576)
      end

      # Stable instance id: monitor id plus an MD5 of the '/'-joined args.
      def get_monitor_instance_id(monitor_id, args = [])
        string_to_hash = args.join("/")
        return "#{monitor_id}-#{Digest::MD5.hexdigest(string_to_hash)}"
      end
    end
  end

  # Decides whether a monitor signal should be emitted, throttling unchanged
  # states and re-sending stale ones.
  class HealthMonitorOptimizer
    def initialize
      @@health_signal_timeout = 240 # minutes before an unchanged signal is re-sent
      @@first_record_sent = {}
    end

    # True when the signal for monitor_instance_id should be sent now.
    def should_send(monitor_instance_id, health_monitor_state, health_monitor_config)
      health_monitor_instance_state = health_monitor_state.get_state(monitor_instance_id)
      health_monitor_records = health_monitor_instance_state.prev_records
      samples_to_check = health_monitor_config['ConsecutiveSamplesForStateTransition'].nil? ? 1 : health_monitor_config['ConsecutiveSamplesForStateTransition'].to_i

      # newest record is at the end; oldest records are dropped from the front
      latest_record = health_monitor_records[health_monitor_records.size - 1]
      latest_record_state = latest_record["state"]
      latest_record_time = latest_record["timestamp"] # string representation of time

      new_state = health_monitor_instance_state.new_state
      prev_sent_time = health_monitor_instance_state.prev_sent_record_time

      if latest_record_state.downcase == new_state.downcase
        # state unchanged: send only if stale or never sent before
        time_elapsed = (Time.parse(latest_record_time) - Time.parse(prev_sent_time)) / 60
        if time_elapsed > @@health_signal_timeout # minutes
          true
        elsif !@@first_record_sent.key?(monitor_instance_id)
          @@first_record_sent[monitor_instance_id] = true
          true
        else
          false
        end
      else
        # state changed: send immediately when one sample decides the state,
        # otherwise wait for a consistent run of samples_to_check samples
        if samples_to_check == 1
          true
        elsif health_monitor_instance_state.prev_records.size == 1 && samples_to_check > 1
          true
        elsif health_monitor_instance_state.prev_records.size < samples_to_check
          false
        else
          health_monitor_instance_state.is_state_change_consistent ? true : false
        end
      end
    end
  end
end
require 'json'
require 'time'

module HealthModel

  # Supplies monitor configuration and label enrichment for health monitor
  # records, backed by cluster-level labels and the kubernetes node inventory.
  class HealthMonitorProvider

    attr_accessor :cluster_labels, :health_kubernetes_resources, :monitor_configuration_path, :cluster_id
    attr_reader :monitor_configuration

    # cluster_id                  -- cluster resource id stamped on every record
    # cluster_labels              -- hash of labels copied onto every record
    # health_kubernetes_resources -- object exposing get_node_inventory
    # monitor_configuration_path  -- path to the JSON monitor configuration file
    def initialize(cluster_id, cluster_labels, health_kubernetes_resources, monitor_configuration_path)
      @cluster_labels = Hash.new
      cluster_labels.each { |k, v| @cluster_labels[k] = v }
      @cluster_id = cluster_id
      @health_kubernetes_resources = health_kubernetes_resources
      @monitor_configuration_path = monitor_configuration_path
      @monitor_configuration = {}
      begin
        # BUG FIX: File.read avoids leaking the file handle that the original
        # File.open left unclosed when JSON.parse raised before file.close.
        @monitor_configuration = JSON.parse(File.read(@monitor_configuration_path))
      rescue => e
        # BUG FIX: the original logged through @log, which was never assigned
        # in this class, so any config error raised NoMethodError on nil and
        # masked the real failure.
        HealthMonitorHelpers.get_log_handle.info "Error when opening health config file #{e}"
      end
    end

    # Builds the flat record hash that is emitted for a monitor, merging the
    # cluster labels with the monitor's own labels.
    def get_record(health_monitor_record, health_monitor_state)
      labels = Hash.new
      @cluster_labels.each { |k, v| labels[k] = v }
      monitor_id = health_monitor_record.monitor_id
      monitor_instance_id = health_monitor_record.monitor_instance_id
      health_monitor_instance_state = health_monitor_state.get_state(monitor_instance_id)

      monitor_labels = health_monitor_record.labels
      if !monitor_labels.empty?
        monitor_labels.keys.each do |key|
          labels[key] = monitor_labels[key]
        end
      end

      prev_records = health_monitor_instance_state.prev_records
      time_first_observed = health_monitor_instance_state.state_change_time # the oldest collection time
      new_state = health_monitor_instance_state.new_state # updated before this method is called
      old_state = health_monitor_instance_state.old_state

      config = get_config(monitor_id)

      # a single sample is emitted as a bare hash, multiple samples as an array
      if prev_records.size == 1
        details = prev_records[0]
      else
        details = prev_records
      end

      monitor_record = {}
      monitor_record[HealthMonitorRecordFields::CLUSTER_ID] = @cluster_id
      monitor_record[HealthMonitorRecordFields::MONITOR_LABELS] = labels.to_json
      monitor_record[HealthMonitorRecordFields::MONITOR_ID] = monitor_id
      monitor_record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] = monitor_instance_id
      monitor_record[HealthMonitorRecordFields::NEW_STATE] = new_state
      monitor_record[HealthMonitorRecordFields::OLD_STATE] = old_state
      monitor_record[HealthMonitorRecordFields::DETAILS] = details.to_json
      monitor_record[HealthMonitorRecordFields::MONITOR_CONFIG] = config.to_json
      monitor_record[HealthMonitorRecordFields::AGENT_COLLECTION_TIME] = Time.now.utc.iso8601
      monitor_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_first_observed

      monitor_record
    end

    # Configuration hash for a monitor id; {} when none is defined.
    def get_config(monitor_id)
      @monitor_configuration.key?(monitor_id) ? @monitor_configuration[monitor_id] : {}
    end

    # Labels for an incoming raw record (hash keyed by HealthMonitorRecordFields):
    # workload monitors get workload name/kind/namespace labels; node monitors
    # get the node's kubernetes labels merged in.
    def get_labels(health_monitor_record)
      monitor_labels = Hash.new
      @cluster_labels.keys.each { |key|
        monitor_labels[key] = @cluster_labels[key]
      }
      monitor_id = health_monitor_record[HealthMonitorRecordFields::MONITOR_ID]
      case monitor_id
      when HealthMonitorConstants::CONTAINER_CPU_MONITOR_ID, HealthMonitorConstants::CONTAINER_MEMORY_MONITOR_ID, HealthMonitorConstants::USER_WORKLOAD_PODS_READY_MONITOR_ID, HealthMonitorConstants::SYSTEM_WORKLOAD_PODS_READY_MONITOR_ID
        namespace = health_monitor_record[HealthMonitorRecordFields::DETAILS]['details']['namespace']
        workload_name = health_monitor_record[HealthMonitorRecordFields::DETAILS]['details']['workloadName']
        workload_kind = health_monitor_record[HealthMonitorRecordFields::DETAILS]['details']['workloadKind']

        # workload_name arrives as "namespace~~name"; keep only the name part
        monitor_labels[HealthMonitorLabels::WORKLOAD_NAME] = workload_name.split('~~')[1]
        monitor_labels[HealthMonitorLabels::WORKLOAD_KIND] = workload_kind
        monitor_labels[HealthMonitorLabels::NAMESPACE] = namespace
      when HealthMonitorConstants::NODE_CPU_MONITOR_ID, HealthMonitorConstants::NODE_MEMORY_MONITOR_ID, HealthMonitorConstants::NODE_CONDITION_MONITOR_ID
        node_name = health_monitor_record[HealthMonitorRecordFields::NODE_NAME]
        @health_kubernetes_resources.get_node_inventory['items'].each do |node|
          if !node_name.nil? && !node['metadata']['name'].nil? && node_name == node['metadata']['name']
            if !node["metadata"].nil? && !node["metadata"]["labels"].nil?
              monitor_labels = monitor_labels.merge(node["metadata"]["labels"])
            end
          end
        end
      end
      monitor_labels
    end

    # Kubernetes labels of a single node from the cached node inventory.
    def get_node_labels(node_name)
      monitor_labels = {}
      @health_kubernetes_resources.get_node_inventory['items'].each do |node|
        if !node_name.nil? && !node['metadata']['name'].nil? && node_name == node['metadata']['name']
          if !node["metadata"].nil? && !node["metadata"]["labels"].nil?
            monitor_labels = node["metadata"]["labels"]
          end
        end
      end
      monitor_labels
    end
  end
end

# Plain value object describing a single health monitor observation.
HealthMonitorRecord = Struct.new(
  :monitor_id,
  :monitor_instance_id,
  :transition_date_time,
  :state,
  :labels,
  :config,
  :details
)

module HealthModel

  # Mutable per-monitor-instance bookkeeping used by HealthMonitorState.
  HealthMonitorInstanceState = Struct.new(:prev_sent_record_time, :old_state, :new_state, :state_change_time, :prev_records, :is_state_change_consistent, :should_send)

  # Tracks the last sent state and recent history for every monitor instance
  # and decides when a state transition should be emitted.
  # NOTE(review): state lives in class variables, so it is shared across all
  # instances of this class -- presumably intentional (singleton-style use);
  # confirm before converting to instance state.
  class HealthMonitorState

    def initialize
      @@monitor_states = {}
      @@first_record_sent = {}
      @@health_signal_timeout = 240 # minutes before an unchanged signal is re-sent
    end

    # Bookkeeping struct for an instance id, or nil when never seen.
    def get_state(monitor_instance_id)
      if @@monitor_states.key?(monitor_instance_id)
        return @@monitor_states[monitor_instance_id]
      end
    end

    def set_state(monitor_instance_id, health_monitor_instance_state)
      @@monitor_states[monitor_instance_id] = health_monitor_instance_state
    end

    def to_h
      return @@monitor_states
    end

    # Rehydrates state from a hash of instance id => serialized JSON struct
    # (used to survive plugin restarts). Rehydrated signals count as "sent".
    def initialize_state(deserialized_state)
      @@monitor_states = {}
      deserialized_state.each { |k, v|
        health_monitor_instance_state_hash = JSON.parse(v)
        state = HealthMonitorInstanceState.new(*health_monitor_instance_state_hash.values_at(*HealthMonitorInstanceState.members))
        state.prev_sent_record_time = health_monitor_instance_state_hash["prev_sent_record_time"]
        state.old_state = health_monitor_instance_state_hash["old_state"]
        state.new_state = health_monitor_instance_state_hash["new_state"]
        state.state_change_time = health_monitor_instance_state_hash["state_change_time"]
        state.prev_records = health_monitor_instance_state_hash["prev_records"]
        state.is_state_change_consistent = health_monitor_instance_state_hash["is_state_change_consistent"] || false
        state.should_send = health_monitor_instance_state_hash["should_send"]
        @@monitor_states[k] = state
        @@first_record_sent[k] = true
      }
    end

    # When do we send?
    # 1. the signal has never been sent before
    # 2. there is a "consistent" state change across the configured sample window
    # 3. the signal is stale (> @@health_signal_timeout minutes)
    # 4. the latest state is none
    def update_state(monitor,        # UnitMonitor/AggregateMonitor
                     monitor_config  # Hash (may be nil)
                    )
      monitor_instance_id = monitor.monitor_instance_id
      log = HealthMonitorHelpers.get_log_handle
      current_time = Time.now.utc.iso8601
      health_monitor_instance_state = get_state(monitor_instance_id)
      if !health_monitor_instance_state.nil?
        # reset per-cycle flags before recomputing them below
        health_monitor_instance_state.is_state_change_consistent = false
        health_monitor_instance_state.should_send = false
        set_state(monitor_instance_id, health_monitor_instance_state)
      end

      # BUG FIX: the original re-read monitor_config['ConsecutiveSamples...']
      # further down WITHOUT the nil guard it applied here, so a nil
      # monitor_config raised NoMethodError. Compute the window once, guarded.
      samples_to_keep = 1
      if !monitor_config.nil? && !monitor_config['ConsecutiveSamplesForStateTransition'].nil?
        samples_to_keep = monitor_config['ConsecutiveSamplesForStateTransition'].to_i
      end
      samples_to_check = samples_to_keep

      if @@monitor_states.key?(monitor_instance_id)
        health_monitor_instance_state = @@monitor_states[monitor_instance_id]
        health_monitor_records = health_monitor_instance_state.prev_records # sliding window, oldest first
        if health_monitor_records.size == samples_to_keep
          health_monitor_records.delete_at(0)
        end
        health_monitor_records.push(monitor.details)
        health_monitor_instance_state.prev_records = health_monitor_records
        @@monitor_states[monitor_instance_id] = health_monitor_instance_state
      else
        # first observation: if a single sample decides the state, adopt it
        # immediately, otherwise start from "none"
        old_state = HealthMonitorStates::NONE
        new_state = HealthMonitorStates::NONE
        if samples_to_keep == 1
          new_state = monitor.state
        end

        health_monitor_instance_state = HealthMonitorInstanceState.new(
          monitor.transition_date_time,
          old_state,
          new_state,
          monitor.transition_date_time,
          [monitor.details])

        health_monitor_instance_state.should_send = true
        @@monitor_states[monitor_instance_id] = health_monitor_instance_state
      end

      # Decide old/new state based on the history and the latest record.
      health_monitor_records = health_monitor_instance_state.prev_records

      latest_record = health_monitor_records[health_monitor_records.size - 1] # newest is at the end
      latest_record_state = latest_record["state"]
      latest_record_time = latest_record["timestamp"] # string representation of time

      new_state = health_monitor_instance_state.new_state
      prev_sent_time = health_monitor_instance_state.prev_sent_record_time

      # is the last sent state the same as the latest monitor state?
      if latest_record_state.downcase == new_state.downcase
        time_elapsed = (Time.parse(latest_record_time) - Time.parse(prev_sent_time)) / 60
        # check if the health signal has "timed out"
        if time_elapsed > @@health_signal_timeout # minutes
          health_monitor_instance_state.old_state = health_monitor_instance_state.new_state
          health_monitor_instance_state.new_state = latest_record_state
          health_monitor_instance_state.prev_sent_record_time = current_time
          health_monitor_instance_state.should_send = true
          set_state(monitor_instance_id, health_monitor_instance_state)
          log.debug "#{monitor_instance_id} condition: signal timeout should_send #{health_monitor_instance_state.should_send} #{health_monitor_instance_state.old_state} --> #{health_monitor_instance_state.new_state}"
        # check if the first record has been sent
        elsif !@@first_record_sent.key?(monitor_instance_id)
          @@first_record_sent[monitor_instance_id] = true
          health_monitor_instance_state.should_send = true
          set_state(monitor_instance_id, health_monitor_instance_state)
        end
      # latest state differs from the last sent state
      else
        if latest_record_state.downcase == HealthMonitorStates::NONE
          # "none" always notifies immediately
          health_monitor_instance_state.old_state = health_monitor_instance_state.new_state
          health_monitor_instance_state.new_state = latest_record_state
          health_monitor_instance_state.state_change_time = current_time
          health_monitor_instance_state.prev_sent_record_time = current_time
          health_monitor_instance_state.should_send = true
          if !@@first_record_sent.key?(monitor_instance_id)
            @@first_record_sent[monitor_instance_id] = true
          end
          set_state(monitor_instance_id, health_monitor_instance_state)
          log.debug "#{monitor_instance_id} condition: NONE state should_send #{health_monitor_instance_state.should_send} #{health_monitor_instance_state.old_state} --> #{health_monitor_instance_state.new_state}"
        elsif samples_to_check == 1
          # single-sample monitors notify on every state change
          health_monitor_instance_state.old_state = health_monitor_instance_state.new_state
          health_monitor_instance_state.new_state = latest_record_state
          health_monitor_instance_state.state_change_time = current_time
          health_monitor_instance_state.prev_sent_record_time = current_time
          health_monitor_instance_state.should_send = true
          if !@@first_record_sent.key?(monitor_instance_id)
            @@first_record_sent[monitor_instance_id] = true
          end
          set_state(monitor_instance_id, health_monitor_instance_state)
          log.debug "#{monitor_instance_id} condition: state change, samples_to_check = #{samples_to_check} should_send #{health_monitor_instance_state.should_send} #{health_monitor_instance_state.old_state} --> #{health_monitor_instance_state.new_state}"
        else
          # multi-sample monitors notify only when the last N records agree
          if is_state_change_consistent(health_monitor_records, samples_to_keep)
            health_monitor_instance_state.old_state = health_monitor_instance_state.new_state
            health_monitor_instance_state.is_state_change_consistent = true # so it won't be recomputed in the optimizer
            health_monitor_instance_state.should_send = true
            health_monitor_instance_state.new_state = latest_record_state
            health_monitor_instance_state.prev_sent_record_time = current_time
            health_monitor_instance_state.state_change_time = current_time

            set_state(monitor_instance_id, health_monitor_instance_state)

            if !@@first_record_sent.key?(monitor_instance_id)
              @@first_record_sent[monitor_instance_id] = true
            end
            log.debug "#{monitor_instance_id} condition: consistent state change, samples_to_check = #{samples_to_check} should_send #{health_monitor_instance_state.should_send} #{health_monitor_instance_state.old_state} --> #{health_monitor_instance_state.new_state}"
          end
        end
      end
    end

    private

    # True when all of the (at least samples_to_check) records carry the same state.
    def is_state_change_consistent(health_monitor_records, samples_to_check)
      if health_monitor_records.nil? || health_monitor_records.size == 0 || health_monitor_records.size < samples_to_check
        return false
      end
      i = 0
      while i < health_monitor_records.size - 1
        if health_monitor_records[i]["state"] != health_monitor_records[i + 1]["state"]
          return false
        end
        i += 1
      end
      return true
    end
  end
end
&& value > warn_percentage + return HealthMonitorStates::WARNING + else + return HealthMonitorStates::PASS + end + end + + def is_node_monitor(monitor_id) + return (monitor_id == HealthMonitorConstants::NODE_CPU_MONITOR_ID || monitor_id == HealthMonitorConstants::NODE_MEMORY_MONITOR_ID || monitor_id == HealthMonitorConstants::NODE_CONDITION_MONITOR_ID) + end + + def is_pods_ready_monitor(monitor_id) + return (monitor_id == HealthMonitorConstants::USER_WORKLOAD_PODS_READY_MONITOR_ID || monitor_id == HealthMonitorConstants::SYSTEM_WORKLOAD_PODS_READY_MONITOR_ID) + end + + def is_cluster_health_model_enabled + enabled = ENV["AZMON_CLUSTER_ENABLE_HEALTH_MODEL"] + if !enabled.nil? && enabled.casecmp("true") == 0 + return true + else + return false + end + end + + def get_pods_ready_hash(pod_inventory, deployment_inventory) + pods_ready_percentage_hash = {} + deployment_lookup = {} + deployment_inventory['items'].each do |deployment| + match_labels = deployment['spec']['selector']['matchLabels'].to_h + namespace = deployment['metadata']['namespace'] + match_labels.each{|k,v| + deployment_lookup["#{namespace}-#{k}=#{v}"] = "#{deployment['metadata']['namespace']}~~#{deployment['metadata']['name']}" + } + end + pod_inventory['items'].each do |pod| + begin + has_owner = !pod['metadata']['ownerReferences'].nil? + owner_kind = '' + if has_owner + owner_kind = pod['metadata']['ownerReferences'][0]['kind'] + controller_name = pod['metadata']['ownerReferences'][0]['name'] + else + owner_kind = pod['kind'] + controller_name = pod['metadata']['name'] + #log.info "#{JSON.pretty_generate(pod)}" + end + + namespace = pod['metadata']['namespace'] + status = pod['status']['phase'] + + workload_name = '' + if owner_kind.nil? + owner_kind = 'Pod' + end + case owner_kind.downcase + when 'job' + # we are excluding jobs + next + when 'replicaset' + # get the labels, and see if there is a match. If there is, it is the deployment. 
If not, use replica set name/controller name + labels = pod['metadata']['labels'].to_h + labels.each {|k,v| + lookup_key = "#{namespace}-#{k}=#{v}" + if deployment_lookup.key?(lookup_key) + workload_name = deployment_lookup[lookup_key] + break + end + } + if workload_name.empty? + workload_name = "#{namespace}~~#{controller_name}" + end + when 'daemonset' + workload_name = "#{namespace}~~#{controller_name}" + else + workload_name = "#{namespace}~~#{pod['metadata']['name']}" + end + + if pods_ready_percentage_hash.key?(workload_name) + total_pods = pods_ready_percentage_hash[workload_name]['totalPods'] + pods_ready = pods_ready_percentage_hash[workload_name]['podsReady'] + else + total_pods = 0 + pods_ready = 0 + end + + total_pods += 1 + if status == 'Running' + pods_ready += 1 + end + + pods_ready_percentage_hash[workload_name] = {'totalPods' => total_pods, 'podsReady' => pods_ready, 'namespace' => namespace, 'workload_name' => workload_name, 'kind' => owner_kind} + rescue => e + log.info "Error when processing pod #{pod['metadata']['name']} #{e.message}" + end + end + return pods_ready_percentage_hash + end + + def get_node_state_from_node_conditions(node_conditions) + pass = false + node_conditions.each do |condition| + type = condition['type'] + status = condition['status'] + + if ((type == "NetworkUnavailable" || type == "OutOfDisk") && (status == 'True' || status == 'Unknown')) + return "fail" + elsif ((type == "DiskPressure" || type == "MemoryPressure" || type == "PIDPressure") && (status == 'True' || status == 'Unknown')) + return "warn" + elsif type == "Ready" && status == 'True' + pass = true + end + end + + if pass + return "pass" + else + return "fail" + end + end + + def get_resource_subscription(pod_inventory, metric_name, metric_capacity) + subscription = 0.0 + if !pod_inventory.empty? + pod_inventory['items'].each do |pod| + pod['spec']['containers'].each do |container| + if !container['resources']['requests'].nil? 
&& !container['resources']['requests'][metric_name].nil? + subscription += KubernetesApiClient.getMetricNumericValue(metric_name, container['resources']['requests'][metric_name]) + end + end + end + end + #log.debug "#{metric_name} Subscription #{subscription}" + return subscription + end + + def get_cluster_cpu_memory_capacity(log) + begin + node_inventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo("nodes").body) + cluster_cpu_capacity = 0.0 + cluster_memory_capacity = 0.0 + if !node_inventory.empty? + node_inventory['items'].each do |node| + cpu_capacity_json = KubernetesApiClient.parseNodeLimits(node_inventory, "capacity", "cpu", "cpuCapacityNanoCores") + if !cpu_capacity_json.nil? + cpu_capacity_json.each do |cpu_capacity_node| + if !cpu_capacity_node['DataItems'][0]['Collections'][0]['Value'].to_s.nil? + cluster_cpu_capacity += cpu_capacity_node['DataItems'][0]['Collections'][0]['Value'] + end + end + log.info "Cluster CPU Limit #{cluster_cpu_capacity}" + else + log.info "Error getting cpu_capacity" + end + memory_capacity_json = KubernetesApiClient.parseNodeLimits(node_inventory, "capacity", "memory", "memoryCapacityBytes") + if !memory_capacity_json.nil? + memory_capacity_json.each do |memory_capacity_node| + if !memory_capacity_node['DataItems'][0]['Collections'][0]['Value'].to_s.nil? 
+ cluster_memory_capacity += memory_capacity_node['DataItems'][0]['Collections'][0]['Value'] + end + end + log.info "Cluster Memory Limit #{cluster_memory_capacity}" + else + log.info "Error getting memory_capacity" + end + end + else + log.info "Unable to get cpu and memory capacity" + return [0.0, 0.0] + end + return [cluster_cpu_capacity, cluster_memory_capacity] + rescue => e + log.info e + end + end + + def refresh_kubernetes_api_data(log, hostName, force: false) + #log.debug "refresh_kubernetes_api_data" + if ( ((Time.now.utc - Time.parse(@@last_refresh_time)) / 60 ) < 5.0 && !force) + log.debug "Less than 5 minutes since last refresh at #{@@last_refresh_time}" + return + end + if force + log.debug "Force Refresh" + end + + begin + @@nodeInventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo("nodes").body) + if !hostName.nil? + podInventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo("pods?fieldSelector=spec.nodeName%3D#{hostName}").body) + else + podInventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo("pods").body) + end + podInventory['items'].each do |pod| + has_owner = !pod['metadata']['ownerReferences'].nil? + if !has_owner + workload_name = pod['metadata']['name'] + else + workload_name = pod['metadata']['ownerReferences'][0]['name'] + end + namespace = pod['metadata']['namespace'] + #TODO: Figure this out for container cpu/memory + #@@controllerMapping[workload_name] = namespace + #log.debug "workload_name #{workload_name} namespace #{namespace}" + pod['spec']['containers'].each do |container| + key = [pod['metadata']['uid'], container['name']].join('/') + + if !container['resources'].empty? && !container['resources']['limits'].nil? && !container['resources']['limits']['cpu'].nil? + cpu_limit_value = KubernetesApiClient.getMetricNumericValue('cpu', container['resources']['limits']['cpu']) + else + log.info "CPU limit not set for container : #{container['name']}. 
Using Node Capacity" + #TODO: Send warning health event #bestpractices + cpu_limit_value = @cpu_capacity + end + + if !container['resources'].empty? && !container['resources']['limits'].nil? && !container['resources']['limits']['memory'].nil? + #log.info "Raw Memory Value #{container['resources']['limits']['memory']}" + memory_limit_value = KubernetesApiClient.getMetricNumericValue('memory', container['resources']['limits']['memory']) + else + log.info "Memory limit not set for container : #{container['name']}. Using Node Capacity" + memory_limit_value = @memory_capacity + end + + #TODO: Figure this out for container cpu/memory + #@@containerMetadata[key] = {"cpuLimit" => cpu_limit_value, "memoryLimit" => memory_limit_value, "controllerName" => workload_name, "namespace" => namespace} + end + end + rescue => e + log.info "Error Refreshing Container Resource Limits #{e.backtrace}" + end + # log.info "Controller Mapping #{@@controllerMapping}" + # log.info "Node Inventory #{@@nodeInventory}" + # log.info "Container Metadata #{@@containerMetadata}" + # log.info "------------------------------------" + @@last_refresh_time = Time.now.utc.iso8601 + end + + def get_monitor_instance_id(monitor_id, args = []) + string_to_hash = args.join("/") + return "#{monitor_id}-#{Digest::MD5.hexdigest(string_to_hash)}" + end + + def ensure_cpu_memory_capacity_set(log, cpu_capacity, memory_capacity, hostname) + + log.info "ensure_cpu_memory_capacity_set cpu_capacity #{cpu_capacity} memory_capacity #{memory_capacity}" + if cpu_capacity != 0.0 && memory_capacity != 0.0 + log.info "CPU And Memory Capacity are already set" + return [cpu_capacity, memory_capacity] + end + + begin + @@nodeInventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo("nodes").body) + rescue Exception => e + log.info "Error when getting nodeInventory from kube API. Exception: #{e.class} Message: #{e.message} " + ApplicationInsightsUtility.sendExceptionTelemetry(e.backtrace) + end + if !@@nodeInventory.nil? 
+ cpu_capacity_json = KubernetesApiClient.parseNodeLimits(@@nodeInventory, "capacity", "cpu", "cpuCapacityNanoCores") + if !cpu_capacity_json.nil? + cpu_capacity_json.each do |cpu_info_node| + if !cpu_info_node['DataItems'][0]['Host'].nil? && cpu_info_node['DataItems'][0]['Host'] == hostname + if !cpu_info_node['DataItems'][0]['Collections'][0]['Value'].nil? + cpu_capacity = cpu_info_node['DataItems'][0]['Collections'][0]['Value'] + end + end + end + log.info "CPU Limit #{cpu_capacity}" + else + log.info "Error getting cpu_capacity" + end + memory_capacity_json = KubernetesApiClient.parseNodeLimits(@@nodeInventory, "capacity", "memory", "memoryCapacityBytes") + if !memory_capacity_json.nil? + memory_capacity_json.each do |memory_info_node| + if !memory_info_node['DataItems'][0]['Host'].nil? && memory_info_node['DataItems'][0]['Host'] == hostname + if !memory_info_node['DataItems'][0]['Collections'][0]['Value'].nil? + memory_capacity = memory_info_node['DataItems'][0]['Collections'][0]['Value'] + end + end + end + log.info "memory Limit #{memory_capacity}" + else + log.info "Error getting memory_capacity" + end + return [cpu_capacity, memory_capacity] + end + end + + def build_metrics_hash(metrics_to_collect) + metrics_to_collect_arr = metrics_to_collect.split(',').map(&:strip) + metrics_hash = metrics_to_collect_arr.map {|x| [x.downcase,true]}.to_h + return metrics_hash + end + + def get_health_monitor_config + health_monitor_config = {} + begin + file = File.open('/opt/microsoft/omsagent/plugin/healthmonitorconfig.json', "r") + if !file.nil? + fileContents = file.read + health_monitor_config = JSON.parse(fileContents) + file.close + end + rescue => e + log.info "Error when opening health config file #{e}" + end + return health_monitor_config + end + + def get_cluster_labels + labels = {} + cluster_id = KubernetesApiClient.getClusterId + region = KubernetesApiClient.getClusterRegion + labels['container.azm.ms/cluster-region'] = region + if !cluster_id.nil? 
+ cluster_id_elements = cluster_id.split('/') + azure_sub_id = cluster_id_elements[2] + resource_group = cluster_id_elements[4] + cluster_name = cluster_id_elements[8] + labels['container.azm.ms/cluster-subscription-id'] = azure_sub_id + labels['container.azm.ms/cluster-resource-group'] = resource_group + labels['container.azm.ms/cluster-name'] = cluster_name + end + return labels + end + + def get_log_handle + return @log + end + end + end +end \ No newline at end of file diff --git a/source/code/plugin/health/health_signal_reducer.rb b/source/code/plugin/health/health_signal_reducer.rb new file mode 100644 index 000000000..4cf53e82c --- /dev/null +++ b/source/code/plugin/health/health_signal_reducer.rb @@ -0,0 +1,51 @@ +module HealthModel + # this class + # 1. dedupes daemonset signals and takes only the latest + # 2. removes signals for objects that are no longer in the inventory e.g. node might have sent signal before being scaled down + class HealthSignalReducer + def initialize + + end + + def reduce_signals(health_monitor_records, health_k8s_inventory) + nodes = health_k8s_inventory.get_nodes + workload_names = health_k8s_inventory.get_workload_names + reduced_signals_map = {} + reduced_signals = [] + health_monitor_records.each{|health_monitor_record| + monitor_instance_id = health_monitor_record.monitor_instance_id + monitor_id = health_monitor_record.monitor_id + if reduced_signals_map.key?(monitor_instance_id) + record = reduced_signals_map[monitor_instance_id] + if health_monitor_record.transition_date_time > record.transition_date_time # always take the latest record for a monitor instance id + puts 'Duplicate Daemon Set signal' + reduced_signals_map[monitor_instance_id] = health_monitor_record + end + elsif HealthMonitorHelpers.is_node_monitor(monitor_id) + node_name = health_monitor_record.labels['kubernetes.io/hostname'] + if (node_name.nil? 
|| !nodes.include?(node_name)) # only add daemon set records if node is present in the inventory + next + end + reduced_signals_map[monitor_instance_id] = health_monitor_record + elsif HealthMonitorHelpers.is_pods_ready_monitor(monitor_id) + workload_name = health_monitor_record.labels[HealthMonitorLabels::WORKLOAD_NAME] + namespace = health_monitor_record.labels[HealthMonitorLabels::NAMESPACE] + lookup = "#{namespace}~~#{workload_name}" + if (workload_name.nil? || !workload_names.include?(lookup)) #only add pod record if present in the inventory + next + end + reduced_signals_map[monitor_instance_id] = health_monitor_record + else + reduced_signals_map[monitor_instance_id] = health_monitor_record + end + } + + reduced_signals_map.each{|k,v| + reduced_signals.push(v) + } + + return reduced_signals + end + + end +end \ No newline at end of file diff --git a/source/code/plugin/health/monitor_factory.rb b/source/code/plugin/health/monitor_factory.rb new file mode 100644 index 000000000..e6ec9d2c3 --- /dev/null +++ b/source/code/plugin/health/monitor_factory.rb @@ -0,0 +1,28 @@ +module HealthModel + class MonitorFactory + + def initialize + + end + + def create_unit_monitor(monitor_record) + return UnitMonitor.new(monitor_record.monitor_id, + monitor_record.monitor_instance_id, + monitor_record.state, + monitor_record.transition_date_time, + monitor_record.labels, + monitor_record.config, + monitor_record.details) + end + + def create_aggregate_monitor(monitor_id, monitor_instance_id, labels, aggregation_algorithm, aggregation_algorithm_params, child_monitor) + return AggregateMonitor.new(monitor_id, + monitor_instance_id, + child_monitor.state, + child_monitor.transition_date_time, + aggregation_algorithm, + aggregation_algorithm_params, + labels) + end + end +end \ No newline at end of file diff --git a/source/code/plugin/health/monitor_set.rb b/source/code/plugin/health/monitor_set.rb new file mode 100644 index 000000000..8d5994419 --- /dev/null +++ 
b/source/code/plugin/health/monitor_set.rb @@ -0,0 +1,44 @@ +# frozen_string_literal: true + +module HealthModel + class MonitorSet + attr_accessor :monitors + + #constructor + def initialize + @monitors = {} + end + + # checks if the monitor is present in the set + def contains?(monitor_instance_id) + @monitors.key?(monitor_instance_id) + end + + # adds or updates the monitor + def add_or_update(monitor) + @monitors[monitor.monitor_instance_id] = monitor + end + + # gets the monitor given the monitor instance id + def get_monitor(monitor_instance_id) + @monitors[monitor_instance_id] if @monitors.key?(monitor_instance_id) + end + + # deletes a monitor from the set + def delete(monitor_instance_id) + if @monitors.key?(monitor_instance_id) + @monitors.delete(monitor_instance_id) + end + end + + # gets the size of the monitor set + def get_size + @monitors.length + end + + # gets the map of monitor instance id to monitors + def get_map + @monitors + end + end +end diff --git a/source/code/plugin/health/node_monitor_hierarchy_reducer.rb b/source/code/plugin/health/node_monitor_hierarchy_reducer.rb new file mode 100644 index 000000000..aafbd07a8 --- /dev/null +++ b/source/code/plugin/health/node_monitor_hierarchy_reducer.rb @@ -0,0 +1,33 @@ +# frozen_string_literal: true + +module HealthModel + class NodeMonitorHierarchyReducer + def initialize + end + + # Finalizes the Node Hierarchy. This removes node pools and node pool set from the hierarchy if they are not present. + def finalize(monitor_set) + monitors_to_reduce = [MonitorId::ALL_AGENT_NODE_POOLS, MonitorId::ALL_NODES] + # for the above monitors, which are constant per cluster, the monitor_id and monitor_instance_id are the same + monitors_to_reduce.each do |monitor_to_reduce| + monitor = monitor_set.get_monitor(monitor_to_reduce) + if !monitor.nil? 
+ if monitor.is_aggregate_monitor && monitor.get_member_monitors.size == 1 + #copy the children of member monitor as children of parent + member_monitor_instance_id = monitor.get_member_monitors[0] #gets the only member monitor instance id + member_monitor = monitor_set.get_monitor(member_monitor_instance_id) + #reduce only if the aggregation algorithms are the same + if !member_monitor.aggregation_algorithm.nil? && member_monitor.aggregation_algorithm == AggregationAlgorithm::WORSTOF && monitor.aggregation_algorithm == member_monitor.aggregation_algorithm + member_monitor.get_member_monitors.each{|grandchild_monitor| + monitor.add_member_monitor(grandchild_monitor) + } + monitor.remove_member_monitor(member_monitor_instance_id) + # delete the member monitor from the monitor_set + monitor_set.delete(member_monitor_instance_id) + end + end + end + end + end + end +end diff --git a/source/code/plugin/health/parent_monitor_provider.rb b/source/code/plugin/health/parent_monitor_provider.rb new file mode 100644 index 000000000..6a27f11d8 --- /dev/null +++ b/source/code/plugin/health/parent_monitor_provider.rb @@ -0,0 +1,86 @@ +module HealthModel + class ParentMonitorProvider + + attr_reader :health_model_definition, :parent_monitor_mapping, :parent_monitor_instance_mapping + + def initialize(definition) + @health_model_definition = definition + @parent_monitor_mapping = {} #monitorId --> parent_monitor_id mapping + @parent_monitor_instance_mapping = {} #child monitor id -- > parent monitor instance mapping. Used in instances when the node no longer exists and impossible to compute from kube api results + end + + # gets the parent monitor id given the state transition. 
It requires the monitor id and labels to determine the parent id + def get_parent_monitor_id(monitor) + monitor_id = monitor.monitor_id + + # cache the parent monitor id so it is not recomputed every time + if @parent_monitor_mapping.key?(monitor.monitor_instance_id) + return @parent_monitor_mapping[monitor.monitor_instance_id] + end + + if @health_model_definition.key?(monitor_id) + parent_monitor_id = @health_model_definition[monitor_id]['parent_monitor_id'] + # check parent_monitor_id is an array, then evaluate the conditions, else return the parent_monitor_id + if parent_monitor_id.is_a?(String) + @parent_monitor_mapping[monitor.monitor_instance_id] = parent_monitor_id + return parent_monitor_id + end + if parent_monitor_id.nil? + conditions = @health_model_definition[monitor_id]['conditions'] + if !conditions.nil? && conditions.is_a?(Array) + labels = monitor.labels + conditions.each{|condition| + left = "#{labels[condition['key']]}" + op = "#{condition['operator']}" + right = "#{condition['value']}" + cond = left.send(op.to_sym, right) + + if cond + @parent_monitor_mapping[monitor.monitor_instance_id] = condition['parent_id'] + return condition['parent_id'] + end + } + end + raise "Conditions were not met to determine the parent monitor id" if monitor_id != MonitorId::CLUSTER + end + else + raise "Invalid Monitor Id #{monitor_id} in get_parent_monitor_id" + end + end + + def get_parent_monitor_labels(monitor_id, monitor_labels, parent_monitor_id) + labels_to_copy = @health_model_definition[monitor_id]['labels'] + if labels_to_copy.nil? 
+ return {} + end + parent_monitor_labels = {} + labels_to_copy.each{|label| + parent_monitor_labels[label] = monitor_labels[label] + } + return parent_monitor_labels + end + + def get_parent_monitor_config(parent_monitor_id) + return @health_model_definition[parent_monitor_id] + end + + def get_parent_monitor_instance_id(monitor_instance_id, parent_monitor_id, parent_monitor_labels) + if @parent_monitor_instance_mapping.key?(monitor_instance_id) + return @parent_monitor_instance_mapping[monitor_instance_id] + end + + labels = AggregateMonitorInstanceIdLabels.get_labels_for(parent_monitor_id) + if !labels.is_a?(Array) + raise "Expected #{labels} to be an Array for #{parent_monitor_id}" + end + values = labels.map{|label| parent_monitor_labels[label]} + if values.nil? || values.empty? || values.size == 0 + @parent_monitor_instance_mapping[monitor_instance_id] = parent_monitor_id + return parent_monitor_id + end + parent_monitor_instance_id = "#{parent_monitor_id}-#{values.join('-')}" + @parent_monitor_instance_mapping[monitor_instance_id] = parent_monitor_instance_id + return parent_monitor_instance_id + end + end +end \ No newline at end of file diff --git a/source/code/plugin/health/unit_monitor.rb b/source/code/plugin/health/unit_monitor.rb new file mode 100644 index 000000000..9af599321 --- /dev/null +++ b/source/code/plugin/health/unit_monitor.rb @@ -0,0 +1,26 @@ +require_relative 'health_model_constants' +require 'json' + +module HealthModel + class UnitMonitor + + attr_accessor :monitor_id, :monitor_instance_id, :state, :transition_date_time, :labels, :config, :details, :is_aggregate_monitor + + # constructor + def initialize(monitor_id, monitor_instance_id, state, transition_date_time, labels, config, details) + @monitor_id = monitor_id + @monitor_instance_id = monitor_instance_id + @transition_date_time = transition_date_time + @state = state + @labels = labels + @config = config + @details = details + @is_aggregate_monitor = false + end + + def 
get_member_monitors + return nil + end + + end +end \ No newline at end of file diff --git a/source/code/plugin/in_cadvisor_perf.rb b/source/code/plugin/in_cadvisor_perf.rb index f5f65f01b..1702877a2 100644 --- a/source/code/plugin/in_cadvisor_perf.rb +++ b/source/code/plugin/in_cadvisor_perf.rb @@ -2,6 +2,7 @@ # frozen_string_literal: true module Fluent + class CAdvisor_Perf_Input < Input Plugin.register_input("cadvisorperf", self) @@ -18,6 +19,8 @@ def initialize config_param :run_interval, :time, :default => "1m" config_param :tag, :string, :default => "oms.api.cadvisorperf" config_param :mdmtag, :string, :default => "mdm.cadvisorperf" + config_param :nodehealthtag, :string, :default => "oms.api.KubeHealth.DaemonSet.Node" + #config_param :containerhealthtag, :string, :default => "oms.api.KubeHealth.DaemonSet.Container" def configure(conf) super @@ -51,11 +54,14 @@ def enumerate() record["DataType"] = "LINUX_PERF_BLOB" record["IPName"] = "LogManagement" eventStream.add(time, record) if record - #router.emit(@tag, time, record) if record - end + #router.emit(@tag, time, record) if record + end router.emit_stream(@tag, eventStream) if eventStream router.emit_stream(@mdmtag, eventStream) if eventStream + #router.emit_stream(@containerhealthtag, eventStream) if eventStream + router.emit_stream(@nodehealthtag, eventStream) if eventStream + @@istestvar = ENV["ISTEST"] if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && eventStream.count > 0) $log.info("cAdvisorPerfEmitStreamSuccess @ #{Time.now.utc.iso8601}") diff --git a/source/code/plugin/in_kube_events.rb b/source/code/plugin/in_kube_events.rb index 3a0e04c67..f177b62bf 100644 --- a/source/code/plugin/in_kube_events.rb +++ b/source/code/plugin/in_kube_events.rb @@ -67,7 +67,7 @@ def enumerate(eventList = nil) newEventQueryState.push(eventId) if !eventQueryState.empty? 
&& eventQueryState.include?(eventId) next - end + end record["ObjectKind"] = items["involvedObject"]["kind"] record["Namespace"] = items["involvedObject"]["namespace"] record["Name"] = items["involvedObject"]["name"] @@ -94,12 +94,12 @@ def enumerate(eventList = nil) eventStream.add(emitTime, wrapper) if wrapper end router.emit_stream(@tag, eventStream) if eventStream - end + end writeEventQueryState(newEventQueryState) rescue => errorStr $log.debug_backtrace(errorStr.backtrace) ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) - end + end end def run_periodic diff --git a/source/code/plugin/in_kube_health.rb b/source/code/plugin/in_kube_health.rb new file mode 100644 index 000000000..d9672da3b --- /dev/null +++ b/source/code/plugin/in_kube_health.rb @@ -0,0 +1,307 @@ +#!/usr/local/bin/ruby +# frozen_string_literal: true + +require_relative "KubernetesApiClient" +require_relative "oms_common" +require_relative "omslog" +require_relative "ApplicationInsightsUtility" + +module Fluent + + Dir[File.join(__dir__, './health', '*.rb')].each { |file| require file } + class KubeHealthInput < Input + Plugin.register_input("kubehealth", self) + + config_param :health_monitor_config_path, :default => '/etc/opt/microsoft/docker-cimprov/health/healthmonitorconfig.json' + + @@clusterCpuCapacity = 0.0 + @@clusterMemoryCapacity = 0.0 + + def initialize + super + require "yaml" + require "json" + + @@cluster_id = KubernetesApiClient.getClusterId + @resources = HealthKubernetesResources.instance + @provider = HealthMonitorProvider.new(@@cluster_id, HealthMonitorUtils.get_cluster_labels, @resources, @health_monitor_config_path) + @@cluster_health_model_enabled = HealthMonitorUtils.is_cluster_health_model_enabled + end + + include HealthModel + config_param :run_interval, :time, :default => "1m" + config_param :tag, :string, :default => "oms.api.KubeHealth.ReplicaSet" + + def configure(conf) + super + end + + def start + begin + if @run_interval + @finished = false + 
@condition = ConditionVariable.new + @mutex = Mutex.new + @thread = Thread.new(&method(:run_periodic)) + + @@hmlog = HealthMonitorUtils.get_log_handle + @@clusterName = KubernetesApiClient.getClusterName + @@clusterRegion = KubernetesApiClient.getClusterRegion + cluster_capacity = HealthMonitorUtils.get_cluster_cpu_memory_capacity(@@hmlog) + @@clusterCpuCapacity = cluster_capacity[0] + @@clusterMemoryCapacity = cluster_capacity[1] + @@hmlog.info "Cluster CPU Capacity: #{@@clusterCpuCapacity} Memory Capacity: #{@@clusterMemoryCapacity}" + if @@cluster_health_model_enabled + ApplicationInsightsUtility.sendCustomEvent("in_kube_health Plugin Start", {}) + end + end + rescue => e + ApplicationInsightsUtility.sendExceptionTelemetry(e, {"FeatureArea" => "Health"}) + end + end + + def shutdown + if @run_interval + @mutex.synchronize { + @finished = true + @condition.signal + } + @thread.join + end + end + + def enumerate + begin + if !@@cluster_health_model_enabled + @@hmlog.info "Cluster Health Model disabled in in_kube_health" + return + end + + currentTime = Time.now + emitTime = currentTime.to_f + batchTime = currentTime.utc.iso8601 + health_monitor_records = [] + eventStream = MultiEventStream.new + + #HealthMonitorUtils.refresh_kubernetes_api_data(@@hmlog, nil) + # we do this so that if the call fails, we get a response code/header etc. 
+ node_inventory_response = KubernetesApiClient.getKubeResourceInfo("nodes") + node_inventory = JSON.parse(node_inventory_response.body) + pod_inventory_response = KubernetesApiClient.getKubeResourceInfo("pods") + pod_inventory = JSON.parse(pod_inventory_response.body) + deployment_inventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo("deployments", api_version: "extensions/v1beta1").body) + + @resources.node_inventory = node_inventory + @resources.pod_inventory = pod_inventory + @resources.deployment_inventory = deployment_inventory + + if node_inventory_response.code.to_i != 200 + record = process_kube_api_up_monitor("fail", node_inventory_response) + health_monitor_records.push(record) if record + else + record = process_kube_api_up_monitor("pass", node_inventory_response) + health_monitor_records.push(record) if record + end + + if !pod_inventory.nil? + record = process_cpu_oversubscribed_monitor(pod_inventory) + health_monitor_records.push(record) if record + record = process_memory_oversubscribed_monitor(pod_inventory) + health_monitor_records.push(record) if record + pods_ready_hash = HealthMonitorUtils.get_pods_ready_hash(pod_inventory, deployment_inventory) + + system_pods = pods_ready_hash.select{|k,v| v['namespace'] == 'kube-system'} + workload_pods = pods_ready_hash.select{|k,v| v['namespace'] != 'kube-system'} + + system_pods_ready_percentage_records = process_pods_ready_percentage(system_pods, HealthMonitorConstants::SYSTEM_WORKLOAD_PODS_READY_MONITOR_ID) + system_pods_ready_percentage_records.each do |record| + health_monitor_records.push(record) if record + end + + workload_pods_ready_percentage_records = process_pods_ready_percentage(workload_pods, HealthMonitorConstants::USER_WORKLOAD_PODS_READY_MONITOR_ID) + workload_pods_ready_percentage_records.each do |record| + health_monitor_records.push(record) if record + end + else + hmlog.info "POD INVENTORY IS NIL" + end + + if !node_inventory.nil? 
+ node_condition_records = process_node_condition_monitor(node_inventory) + node_condition_records.each do |record| + health_monitor_records.push(record) if record + end + else + hmlog.info "NODE INVENTORY IS NIL" + end + + health_monitor_records.each do |record| + eventStream.add(emitTime, record) + end + router.emit_stream(@tag, eventStream) if eventStream + rescue => errorStr + @@hmlog.warn("error in_kube_health: #{errorStr.to_s}") + @@hmlog.debug "backtrace Input #{errorStr.backtrace}" + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + end + + def process_cpu_oversubscribed_monitor(pod_inventory) + timestamp = Time.now.utc.iso8601 + subscription = HealthMonitorUtils.get_resource_subscription(pod_inventory,"cpu", @@clusterCpuCapacity) + state = subscription > @@clusterCpuCapacity ? "fail" : "pass" + #@@hmlog.debug "CPU Oversubscribed Monitor State : #{state}" + + #CPU + monitor_id = HealthMonitorConstants::WORKLOAD_CPU_OVERSUBSCRIBED_MONITOR_ID + health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"clusterCpuCapacity" => @@clusterCpuCapacity/1000000.to_f, "clusterCpuRequests" => subscription/1000000.to_f}} + # @@hmlog.info health_monitor_record + + monitor_instance_id = HealthMonitorUtils.get_monitor_instance_id(monitor_id, [@@cluster_id]) + #hmlog.info "Monitor Instance Id: #{monitor_instance_id}" + health_record = {} + time_now = Time.now.utc.iso8601 + health_record[HealthMonitorRecordFields::MONITOR_ID] = monitor_id + health_record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] = monitor_instance_id + health_record[HealthMonitorRecordFields::DETAILS] = health_monitor_record + health_record[HealthMonitorRecordFields::AGENT_COLLECTION_TIME] = time_now + health_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_now + health_record[HealthMonitorRecordFields::CLUSTER_ID] = @@cluster_id + #@@hmlog.info "Successfully processed process_cpu_oversubscribed_monitor" + return health_record + end + + def 
process_memory_oversubscribed_monitor(pod_inventory) + timestamp = Time.now.utc.iso8601 + subscription = HealthMonitorUtils.get_resource_subscription(pod_inventory,"memory", @@clusterMemoryCapacity) + state = subscription > @@clusterMemoryCapacity ? "fail" : "pass" + #@@hmlog.debug "Memory Oversubscribed Monitor State : #{state}" + + #CPU + monitor_id = HealthMonitorConstants::WORKLOAD_MEMORY_OVERSUBSCRIBED_MONITOR_ID + health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"clusterMemoryCapacity" => @@clusterMemoryCapacity.to_f, "clusterMemoryRequests" => subscription.to_f}} + hmlog = HealthMonitorUtils.get_log_handle + + monitor_instance_id = HealthMonitorUtils.get_monitor_instance_id(monitor_id, [@@cluster_id]) + health_record = {} + time_now = Time.now.utc.iso8601 + health_record[HealthMonitorRecordFields::MONITOR_ID] = monitor_id + health_record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] = monitor_instance_id + health_record[HealthMonitorRecordFields::DETAILS] = health_monitor_record + health_record[HealthMonitorRecordFields::AGENT_COLLECTION_TIME] = time_now + health_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_now + health_record[HealthMonitorRecordFields::CLUSTER_ID] = @@cluster_id + #@@hmlog.info "Successfully processed process_memory_oversubscribed_monitor" + return health_record + end + + def process_kube_api_up_monitor(state, response) + timestamp = Time.now.utc.iso8601 + + monitor_id = HealthMonitorConstants::KUBE_API_STATUS + details = response.each_header.to_h + details['ResponseCode'] = response.code + health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => details} + hmlog = HealthMonitorUtils.get_log_handle + #hmlog.info health_monitor_record + + monitor_instance_id = HealthMonitorConstants::KUBE_API_STATUS + #hmlog.info "Monitor Instance Id: #{monitor_instance_id}" + health_record = {} + time_now = Time.now.utc.iso8601 + 
health_record[HealthMonitorRecordFields::MONITOR_ID] = monitor_id + health_record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] = monitor_instance_id + health_record[HealthMonitorRecordFields::DETAILS] = health_monitor_record + health_record[HealthMonitorRecordFields::AGENT_COLLECTION_TIME] = time_now + health_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_now + health_record[HealthMonitorRecordFields::CLUSTER_ID] = @@cluster_id + #@@hmlog.info "Successfully processed process_kube_api_up_monitor" + return health_record + end + + def process_pods_ready_percentage(pods_hash, config_monitor_id) + monitor_config = @provider.get_config(config_monitor_id) + hmlog = HealthMonitorUtils.get_log_handle + + records = [] + pods_hash.keys.each do |key| + workload_name = key + total_pods = pods_hash[workload_name]['totalPods'] + pods_ready = pods_hash[workload_name]['podsReady'] + namespace = pods_hash[workload_name]['namespace'] + workload_kind = pods_hash[workload_name]['kind'] + percent = pods_ready / total_pods * 100 + timestamp = Time.now.utc.iso8601 + + state = HealthMonitorUtils.compute_percentage_state((100-percent), monitor_config) + health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"totalPods" => total_pods, "podsReady" => pods_ready, "workloadName" => workload_name, "namespace" => namespace, "workloadKind" => workload_kind}} + monitor_instance_id = HealthMonitorUtils.get_monitor_instance_id(config_monitor_id, [@@cluster_id, namespace, workload_name]) + health_record = {} + time_now = Time.now.utc.iso8601 + health_record[HealthMonitorRecordFields::MONITOR_ID] = config_monitor_id + health_record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] = monitor_instance_id + health_record[HealthMonitorRecordFields::DETAILS] = health_monitor_record + health_record[HealthMonitorRecordFields::AGENT_COLLECTION_TIME] = time_now + health_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_now + 
health_record[HealthMonitorRecordFields::CLUSTER_ID] = @@cluster_id + records.push(health_record) + end + #@@hmlog.info "Successfully processed pods_ready_percentage for #{config_monitor_id} #{records.size}" + return records + end + + def process_node_condition_monitor(node_inventory) + monitor_id = HealthMonitorConstants::NODE_CONDITION_MONITOR_ID + timestamp = Time.now.utc.iso8601 + monitor_config = @provider.get_config(monitor_id) + node_condition_monitor_records = [] + if !node_inventory.nil? + node_inventory['items'].each do |node| + node_name = node['metadata']['name'] + conditions = node['status']['conditions'] + state = HealthMonitorUtils.get_node_state_from_node_conditions(conditions) + #hmlog.debug "Node Name = #{node_name} State = #{state}" + details = {} + conditions.each do |condition| + details[condition['type']] = {"Reason" => condition['reason'], "Message" => condition['message']} + end + health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => details} + monitor_instance_id = HealthMonitorUtils.get_monitor_instance_id(monitor_id, [@@cluster_id, node_name]) + health_record = {} + time_now = Time.now.utc.iso8601 + health_record[HealthMonitorRecordFields::MONITOR_ID] = monitor_id + health_record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] = monitor_instance_id + health_record[HealthMonitorRecordFields::DETAILS] = health_monitor_record + health_record[HealthMonitorRecordFields::AGENT_COLLECTION_TIME] = time_now + health_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_now + health_record[HealthMonitorRecordFields::CLUSTER_ID] = @@cluster_id + health_record[HealthMonitorRecordFields::NODE_NAME] = node_name + node_condition_monitor_records.push(health_record) + end + end + #@@hmlog.info "Successfully processed process_node_condition_monitor #{node_condition_monitor_records.size}" + return node_condition_monitor_records + end + + def run_periodic + @mutex.lock + done = @finished + until done + 
@condition.wait(@mutex, @run_interval) + done = @finished + @mutex.unlock + if !done + begin + @@hmlog.info("in_kube_health::run_periodic @ #{Time.now.utc.iso8601}") + enumerate + rescue => errorStr + @@hmlog.warn "in_kube_health::run_periodic: enumerate Failed for kubeapi sourced data health: #{errorStr}" + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + end + @mutex.lock + end + @mutex.unlock + end + end +end diff --git a/test/code/plugin/filter_health_model_builder_test.rb b/test/code/plugin/filter_health_model_builder_test.rb new file mode 100644 index 000000000..f4dba11ed --- /dev/null +++ b/test/code/plugin/filter_health_model_builder_test.rb @@ -0,0 +1,54 @@ +# frozen_string_literal: true + +require 'test/unit' +require 'json' +# require_relative '../../../source/code/plugin/health' + +Dir[File.join(__dir__, '../../../source/code/plugin/health', '*.rb')].each { |file| require file } + +class FilterHealthModelBuilderTest < Test::Unit::TestCase + include HealthModel + + def test_event_stream + health_definition_path = 'C:\AzureMonitor\ContainerInsights\Docker-Provider\installer\conf\health_model_definition.json' + health_model_definition = ParentMonitorProvider.new(HealthModelDefinitionParser.new(health_definition_path).parse_file) + monitor_factory = MonitorFactory.new + hierarchy_builder = HealthHierarchyBuilder.new(health_model_definition, monitor_factory) + # TODO: Figure out if we need to add NodeMonitorHierarchyReducer to the list of finalizers. 
For now, dont compress/optimize, since it becomes impossible to construct the model on the UX side + state_finalizers = [AggregateMonitorStateFinalizer.new] + monitor_set = MonitorSet.new + model_builder = HealthModelBuilder.new(hierarchy_builder, state_finalizers, monitor_set) + + i = 1 + loop do + mock_data_path = "C:/AzureMonitor/ContainerInsights/Docker-Provider/source/code/plugin/mock_data-#{i}.json" + file = File.read(mock_data_path) + data = JSON.parse(file) + + health_monitor_records = [] + data.each do |record| + health_monitor_record = HealthMonitorRecord.new( + record[HealthMonitorRecordFields::MONITOR_ID], + record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID], + record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED], + record[HealthMonitorRecordFields::DETAILS]["state"], + record[HealthMonitorRecordFields::MONITOR_LABELS], + record[HealthMonitorRecordFields::MONITOR_CONFIG], + record[HealthMonitorRecordFields::DETAILS] + ) + state_transitions.push(state_transition) + end + + model_builder.process_state_transitions(state_transitions) + changed_monitors = model_builder.finalize_model + changed_monitors.keys.each{|key| + puts key + } + i = i + 1 + if i == 6 + break + end + end + puts "Done" + end +end diff --git a/test/code/plugin/health/aggregate_monitor_spec.rb b/test/code/plugin/health/aggregate_monitor_spec.rb new file mode 100644 index 000000000..729965999 --- /dev/null +++ b/test/code/plugin/health/aggregate_monitor_spec.rb @@ -0,0 +1,256 @@ +require_relative '../test_helpers' + +Dir[File.join(File.expand_path(File.dirname(__FILE__)), "../../../../source/code/plugin/health/*.rb")].reject{|f| f.include?('health_monitor_utils')}.each { |file| require file } +include HealthModel + +describe "AggregateMonitor Spec" do + it "is_aggregate_monitor is true for AggregateMonitor" do + # Arrange/Act + monitor = AggregateMonitor.new(:monitor_id, :monitor_instance_id, :pass, :time, "worstOf", [], {}) + # Assert + assert_equal monitor.is_aggregate_monitor, true 
+ end + + it "add_member_monitor tests -- adds a member monitor as a child monitor" do + # Arrange + monitor = AggregateMonitor.new(:monitor_id, :monitor_instance_id, :pass, :time, "worstOf", [], {}) + #Act + monitor.add_member_monitor("child_monitor_1") + #Assert + assert_equal monitor.get_member_monitors.include?("child_monitor_1"), true + + #Act + monitor.add_member_monitor("child_monitor_1") + #Assert + assert_equal monitor.get_member_monitors.size, 1 + end + + it "remove_member_monitor tests -- removes a member monitor as a child monitor" do + # Arrange + monitor = AggregateMonitor.new(:monitor_id, :monitor_instance_id, :pass, :time, "worstOf", [], {}) + monitor.add_member_monitor("child_monitor_1") + monitor.add_member_monitor("child_monitor_2") + + #Act + monitor.remove_member_monitor("child_monitor_1") + #Assert + assert_equal monitor.get_member_monitors.size, 1 + + #Act + monitor.remove_member_monitor("unknown_child") + #Assert + assert_equal monitor.get_member_monitors.size, 1 + end + + it "calculate_details tests -- calculates rollup details based on member monitor states" do + # Arrange + monitor = AggregateMonitor.new(:monitor_id, :monitor_instance_id, :pass, :time, "worstOf", [], {}) + + child_monitor_1 = UnitMonitor.new("monitor_1", "child_monitor_1", "pass", "time", {}, {}, {}) + child_monitor_2 = UnitMonitor.new("monitor_2", "child_monitor_2", "fail", "time", {}, {}, {}) + + monitor_set = MonitorSet.new + monitor_set.add_or_update(child_monitor_1) + monitor_set.add_or_update(child_monitor_2) + + monitor.add_member_monitor("child_monitor_1") + monitor.add_member_monitor("child_monitor_2") + + #Act + monitor.calculate_details(monitor_set) + #Assert + assert_equal monitor.details["details"], {"pass"=>["child_monitor_1"], "fail"=>["child_monitor_2"]} + + #Arrange + child_monitor_3 = UnitMonitor.new("monitor_3", "child_monitor_3", "pass", "time", {}, {}, {}) + monitor_set.add_or_update(child_monitor_3) + monitor.add_member_monitor("child_monitor_3") + + 
#Act + monitor.calculate_details(monitor_set) + #Assert + assert_equal monitor.details["details"], {"pass"=>["child_monitor_1", "child_monitor_3"], "fail"=>["child_monitor_2"]} + end + + it "calculate_state tests -- raises when right aggregation_algorithm NOT specified" do + # Arrange + monitor = AggregateMonitor.new(:monitor_id, :monitor_instance_id, :pass, :time, "", [], {}) + #Assert + assert_raises do + monitor.calculate_state(monitor_set) + end + end + + it "calculate_state tests -- calculate_worst_of_state " do + # Arrange -- pass, fail = fail + monitor = AggregateMonitor.new(:monitor_id, :monitor_instance_id, :pass, :time, "worstOf", [], {}) + + child_monitor_1 = UnitMonitor.new("monitor_1", "child_monitor_1", "pass", "time", {}, {}, {}) + child_monitor_2 = UnitMonitor.new("monitor_2", "child_monitor_2", "fail", "time", {}, {}, {}) + + monitor_set = MonitorSet.new + monitor_set.add_or_update(child_monitor_1) + monitor_set.add_or_update(child_monitor_2) + + monitor.add_member_monitor("child_monitor_1") + monitor.add_member_monitor("child_monitor_2") + #Act + monitor.calculate_state(monitor_set) + #Assert + assert_equal monitor.state, "fail" + + #Arrange -- pass, pass = pass + child_monitor_2 = UnitMonitor.new("monitor_2", "child_monitor_2", "pass", "time", {}, {}, {}) + monitor_set.add_or_update(child_monitor_2) + #Act + monitor.calculate_state(monitor_set) + #Assert + assert_equal monitor.state, "pass" + + #Arrange -- pass, warn = warn + child_monitor_2 = UnitMonitor.new("monitor_2", "child_monitor_2", "warn", "time", {}, {}, {}) + monitor_set.add_or_update(child_monitor_2) + #Act + monitor.calculate_state(monitor_set) + #Assert + assert_equal monitor.state, "warn" + + #Arrange -- warn, fail = fail + child_monitor_1 = UnitMonitor.new("monitor_1", "child_monitor_1", "warn", "time", {}, {}, {}) + child_monitor_2 = UnitMonitor.new("monitor_2", "child_monitor_2", "fail", "time", {}, {}, {}) + monitor_set.add_or_update(child_monitor_1) + 
monitor_set.add_or_update(child_monitor_2) + + #Act + monitor.calculate_state(monitor_set) + #Assert + assert_equal monitor.state, "fail" + + #Arrange -- warn, unknown = unknown + child_monitor_1 = UnitMonitor.new("monitor_1", "child_monitor_1", "warn", "time", {}, {}, {}) + child_monitor_2 = UnitMonitor.new("monitor_2", "child_monitor_2", "unknown", "time", {}, {}, {}) + monitor_set.add_or_update(child_monitor_1) + monitor_set.add_or_update(child_monitor_2) + + #Act + monitor.calculate_state(monitor_set) + #Assert + assert_equal monitor.state, "warn" + + #Arrange -- pass, unknown = unknown + child_monitor_1 = UnitMonitor.new("monitor_1", "child_monitor_1", "pass", "time", {}, {}, {}) + child_monitor_2 = UnitMonitor.new("monitor_2", "child_monitor_2", "unknown", "time", {}, {}, {}) + monitor_set.add_or_update(child_monitor_1) + monitor_set.add_or_update(child_monitor_2) + + #Act + monitor.calculate_state(monitor_set) + #Assert + assert_equal monitor.state, "unknown" + end + + it "calculate_state tests -- calculate_percentage_state " do + # Arrange + monitor = AggregateMonitor.new(:monitor_id, :monitor_instance_id, :pass, :time, "percentage", {"state_threshold" => 90.0}, {}) + + child_monitor_1 = UnitMonitor.new("monitor_1", "child_monitor_1", "pass", "time", {}, {}, {}) + child_monitor_2 = UnitMonitor.new("monitor_2", "child_monitor_2", "fail", "time", {}, {}, {}) + + monitor_set = MonitorSet.new + monitor_set.add_or_update(child_monitor_1) + monitor_set.add_or_update(child_monitor_2) + + monitor.add_member_monitor("child_monitor_1") + monitor.add_member_monitor("child_monitor_2") + #Act + monitor.calculate_state(monitor_set) + #Assert + assert_equal monitor.state, "fail" + + #Arrange + monitor = AggregateMonitor.new(:monitor_id, :monitor_instance_id, :pass, :time, "percentage", {"state_threshold" => 50.0}, {}) + child_monitor_1 = UnitMonitor.new("monitor_1", "child_monitor_1", "pass", "time", {}, {}, {}) + child_monitor_2 = UnitMonitor.new("monitor_2", 
"child_monitor_2", "fail", "time", {}, {}, {}) + + monitor_set = MonitorSet.new + monitor_set.add_or_update(child_monitor_1) + monitor_set.add_or_update(child_monitor_2) + + monitor.add_member_monitor("child_monitor_1") + monitor.add_member_monitor("child_monitor_2") + #Act + monitor.calculate_state(monitor_set) + #Assert + assert_equal monitor.state, "pass" + + #Arrange -- single child monitor + monitor = AggregateMonitor.new(:monitor_id, :monitor_instance_id, :pass, :time, "percentage", {"state_threshold" => 33.3}, {}) + child_monitor_1 = UnitMonitor.new("monitor_1", "child_monitor_1", "pass", "time", {}, {}, {}) + monitor_set = MonitorSet.new + monitor_set.add_or_update(child_monitor_1) + monitor.add_member_monitor("child_monitor_1") + #Act + monitor.calculate_state(monitor_set) + #Assert + assert_equal monitor.state, "pass" + + + #Arrange -- remove none state + monitor = AggregateMonitor.new(:monitor_id, :monitor_instance_id, :none, :time, "percentage", {"state_threshold" => 100.0}, {}) + child_monitor_1 = UnitMonitor.new("monitor_1", "child_monitor_1", "pass", "time", {}, {}, {}) + child_monitor_2 = UnitMonitor.new("monitor_2", "child_monitor_2", "none", "time", {}, {}, {}) + + monitor_set = MonitorSet.new + monitor_set.add_or_update(child_monitor_1) + monitor_set.add_or_update(child_monitor_2) + + monitor.add_member_monitor("child_monitor_1") + monitor.add_member_monitor("child_monitor_2") + #Act + monitor.calculate_state(monitor_set) + #Assert + assert_equal monitor.state, "pass" + + + # Arrange + monitor = AggregateMonitor.new(:monitor_id, :monitor_instance_id, :pass, :time, "percentage", {"state_threshold" => 50.0}, {}) + + child_monitor_1 = UnitMonitor.new("monitor_1", "child_monitor_1", "pass", "time", {}, {}, {}) + child_monitor_2 = UnitMonitor.new("monitor_2", "child_monitor_2", "fail", "time", {}, {}, {}) + child_monitor_3 = UnitMonitor.new("monitor_3", "child_monitor_3", "fail", "time", {}, {}, {}) + + monitor_set = MonitorSet.new + 
monitor_set.add_or_update(child_monitor_1) + monitor_set.add_or_update(child_monitor_2) + monitor_set.add_or_update(child_monitor_3) + + monitor.add_member_monitor("child_monitor_1") + monitor.add_member_monitor("child_monitor_2") + monitor.add_member_monitor("child_monitor_3") + #Act + monitor.calculate_state(monitor_set) + #Assert + assert_equal monitor.state, "fail" + + + # Arrange + monitor = AggregateMonitor.new(:monitor_id, :monitor_instance_id, :pass, :time, "percentage", {"state_threshold" => 90.0}, {}) + + child_monitor_1 = UnitMonitor.new("monitor_1", "child_monitor_1", "pass", "time", {}, {}, {}) + child_monitor_2 = UnitMonitor.new("monitor_2", "child_monitor_2", "pass", "time", {}, {}, {}) + child_monitor_3 = UnitMonitor.new("monitor_3", "child_monitor_3", "pass", "time", {}, {}, {}) + + monitor_set = MonitorSet.new + monitor_set.add_or_update(child_monitor_1) + monitor_set.add_or_update(child_monitor_2) + monitor_set.add_or_update(child_monitor_3) + + monitor.add_member_monitor("child_monitor_1") + monitor.add_member_monitor("child_monitor_2") + monitor.add_member_monitor("child_monitor_3") + #Act + monitor.calculate_state(monitor_set) + #Assert + assert_equal monitor.state, "pass" + end +end \ No newline at end of file diff --git a/test/code/plugin/health/aggregate_monitor_state_finalizer_spec.rb b/test/code/plugin/health/aggregate_monitor_state_finalizer_spec.rb new file mode 100644 index 000000000..f1ae0564d --- /dev/null +++ b/test/code/plugin/health/aggregate_monitor_state_finalizer_spec.rb @@ -0,0 +1,59 @@ +require_relative '../test_helpers' +Dir[File.join(File.expand_path(File.dirname(__FILE__)), "../../../../source/code/plugin/health/*.rb")].reject{|f| f.include?('health_monitor_utils')}.each { |file| require file } +include HealthModel +include Minitest + +describe "AggregateMonitorStateFinalizer spec" do + it 'computes the right state and details' do + #arrange + monitor_set = Mock.new + + #mock unit monitors + child1 = Mock.new + def 
child1.state; "pass"; end + def child1.monitor_id; "child1";end + def child1.monitor_instance_id; "child1"; end + def child1.nil?; false; end + def child1.is_aggregate_monitor; false; end + + child2 = Mock.new + def child2.state; "fail"; end + def child2.monitor_id; "child2";end + def child2.monitor_instance_id; "child2"; end + def child2.nil?; false; end + def child2.is_aggregate_monitor; false; end + + parent_monitor = AggregateMonitor.new("parent_monitor", "parent_monitor", :none, :time, "worstOf", nil, {}) + parent_monitor.add_member_monitor("child1") + parent_monitor.add_member_monitor("child2") + + top_level_monitor = AggregateMonitor.new("cluster", "cluster", :none, :time, "worstOf", nil, {}) + top_level_monitor.add_member_monitor("parent_monitor") + + monitor_set.expect(:get_map, {"cluster" => top_level_monitor, "parent_monitor" => parent_monitor, "child1" => child1, "child2" => child2}) + monitor_set.expect(:get_monitor, top_level_monitor, ["cluster"]) + monitor_set.expect(:get_monitor, parent_monitor, ["parent_monitor"]) + monitor_set.expect(:get_monitor, child1, ["child1"]) + monitor_set.expect(:get_monitor, child2, ["child2"]) + monitor_set.expect(:get_monitor, child1, ["child1"]) + monitor_set.expect(:get_monitor, child2, ["child2"]) + monitor_set.expect(:get_monitor, parent_monitor, ["parent_monitor"]) + + + monitor_set.expect(:get_monitor, parent_monitor, ["parent_monitor"]) + monitor_set.expect(:get_monitor, child1, ["child1"]) + monitor_set.expect(:get_monitor, child2, ["child2"]) + + #act + finalizer = AggregateMonitorStateFinalizer.new + finalizer.finalize(monitor_set) + #assert + + assert_equal parent_monitor.state, "fail" + assert_equal parent_monitor.details, {"details"=>{"pass"=>["child1"], "fail"=>["child2"]}, "state"=>"fail", "timestamp"=>:time} + + assert_equal top_level_monitor.state, "fail" + assert_equal top_level_monitor.details, {"details"=>{"fail"=>["parent_monitor"]}, "state"=>"fail", "timestamp"=>:time} + + end +end \ No newline at 
end of file diff --git a/test/code/plugin/health/ca.crt b/test/code/plugin/health/ca.crt new file mode 100644 index 000000000..9daeafb98 --- /dev/null +++ b/test/code/plugin/health/ca.crt @@ -0,0 +1 @@ +test diff --git a/test/code/plugin/health/cluster_health_state_spec.rb b/test/code/plugin/health/cluster_health_state_spec.rb new file mode 100644 index 000000000..897291fe2 --- /dev/null +++ b/test/code/plugin/health/cluster_health_state_spec.rb @@ -0,0 +1,37 @@ +require_relative '../test_helpers' +Dir[File.join(File.expand_path(File.dirname(__FILE__)), "../../../../source/code/plugin/health/*.rb")].reject{|f| f.include?('health_monitor_utils')}.each { |file| require file } +require 'time' +include HealthModel +include Minitest + +describe "Cluster Health State Spec" do + + it "ClusterHealthState.new throws if cert file is NOT present" do + state = { + "m1" => { + "state" => "pass", + "time" => Time.now.utc.iso8601 + } + } + + token_file_path = 'token' + cert_file_path = '/var/ca.crt' + + proc {ClusterHealthState.new(token_file_path, cert_file_path)}.must_raise + + end + + it "ClusterHealthState.new returns nil if token is NOT present" do + state = { + "m1" => { + "state" => "pass", + "time" => Time.now.utc.iso8601 + } + } + token_file_path = 'token' + cert_file_path = File.join(File.expand_path(File.dirname(__FILE__)), "ca.crt") + + chs = ClusterHealthState.new(token_file_path, cert_file_path) + chs.token.must_be_nil + end +end diff --git a/test/code/plugin/health/health_hierarchy_builder_spec.rb b/test/code/plugin/health/health_hierarchy_builder_spec.rb new file mode 100644 index 000000000..daafe0312 --- /dev/null +++ b/test/code/plugin/health/health_hierarchy_builder_spec.rb @@ -0,0 +1,11 @@ +require_relative '../test_helpers' +Dir[File.join(File.expand_path(File.dirname(__FILE__)), "../../../../source/code/plugin/health/*.rb")].reject{|f| f.include?('health_monitor_utils')}.each { |file| require file } +include HealthModel +include Minitest + +describe 
"HealthHierarchyBuilder spec" do + it 'builds right hierarchy given a child monitor and a parent monitor provider' do + + end + +end \ No newline at end of file diff --git a/test/code/plugin/health/health_kubernetes_resource_spec.rb b/test/code/plugin/health/health_kubernetes_resource_spec.rb new file mode 100644 index 000000000..c27d969ec --- /dev/null +++ b/test/code/plugin/health/health_kubernetes_resource_spec.rb @@ -0,0 +1,222 @@ +require_relative '../test_helpers' +Dir[File.join(File.expand_path(File.dirname(__FILE__)), "../../../../source/code/plugin/health/*.rb")].reject{|f| f.include?('health_monitor_utils')}.each { |file| require file } +include HealthModel + +describe "HealthKubernetesResources spec" do + it "returns the right set of nodes and workloads given node and pod inventory" do + + #arrange + nodes_json = '{ + "items": [ + { + "metadata": { + "name": "aks-nodepool1-19574989-0" + } + }, + { + "metadata": { + "name": "aks-nodepool1-19574989-1" + } + } + ] + }' + + pods_json = '{ + "items": [ + { + "metadata": { + "name": "diliprdeploymentnodeapps-c4fdfb446-mzcsr", + "generateName": "diliprdeploymentnodeapps-c4fdfb446-", + "namespace": "default", + "selfLink": "/api/v1/namespaces/default/pods/diliprdeploymentnodeapps-c4fdfb446-mzcsr", + "uid": "ee31a9ce-526e-11e9-a899-6a5520730c61", + "resourceVersion": "4597573", + "creationTimestamp": "2019-03-29T22:06:40Z", + "labels": { + "app": "diliprsnodeapppod", + "diliprPodLabel1": "p1", + "diliprPodLabel2": "p2", + "pod-template-hash": "709896002" + }, + "ownerReferences": [ + { + "apiVersion": "apps/v1", + "kind": "ReplicaSet", + "name": "diliprdeploymentnodeapps-c4fdfb446", + "uid": "ee1e78e0-526e-11e9-a899-6a5520730c61", + "controller": true, + "blockOwnerDeletion": true + } + ] + }, + "apiVersion": "v1", + "kind": "Pod" + }, + { + "metadata": { + "name": "pi-m8ccw", + "generateName": "pi-", + "namespace": "default", + "selfLink": "/api/v1/namespaces/default/pods/pi-m8ccw", + "uid": 
"9fb16aaa-7ccc-11e9-8d23-32c49ee6f300", + "resourceVersion": "7940877", + "creationTimestamp": "2019-05-22T20:03:10Z", + "labels": { + "controller-uid": "9fad836f-7ccc-11e9-8d23-32c49ee6f300", + "job-name": "pi" + }, + "ownerReferences": [ + { + "apiVersion": "batch/v1", + "kind": "Job", + "name": "pi", + "uid": "9fad836f-7ccc-11e9-8d23-32c49ee6f300", + "controller": true, + "blockOwnerDeletion": true + } + ] + }, + "apiVersion": "v1", + "kind": "Pod" + }, + { + "metadata": { + "name": "rss-site", + "namespace": "default", + "selfLink": "/api/v1/namespaces/default/pods/rss-site", + "uid": "68a34ea4-7ce4-11e9-8d23-32c49ee6f300", + "resourceVersion": "7954135", + "creationTimestamp": "2019-05-22T22:53:26Z", + "labels": { + "app": "web" + }, + "annotations": { + "kubectl.kubernetes.io/last-applied-configuration": "{\"apiVersion\":\"v1\",\"kind\":\"Pod\",\"metadata\":{\"annotations\":{},\"labels\":{\"app\":\"web\"},\"name\":\"rss-site\",\"namespace\":\"default\"},\"spec\":{\"containers\":[{\"image\":\"nginx\",\"name\":\"front-end\",\"ports\":[{\"containerPort\":80}]},{\"image\":\"nickchase/rss-php-nginx:v1\",\"name\":\"rss-reader\",\"ports\":[{\"containerPort\":88}]}]}}\n" + } + }, + "apiVersion": "v1", + "kind": "Pod" + }, + { + "metadata": { + "name": "kube-proxy-4hjws", + "generateName": "kube-proxy-", + "namespace": "kube-system", + "selfLink": "/api/v1/namespaces/kube-system/pods/kube-proxy-4hjws", + "uid": "8cf7c410-88f4-11e9-b1b0-5eb4a3e9de7d", + "resourceVersion": "9661065", + "creationTimestamp": "2019-06-07T07:19:12Z", + "labels": { + "component": "kube-proxy", + "controller-revision-hash": "1271944371", + "pod-template-generation": "16", + "tier": "node" + }, + "annotations": { + "aks.microsoft.com/release-time": "seconds:1559735217 nanos:797729016 ", + "remediator.aks.microsoft.com/kube-proxy-restart": "7" + }, + "ownerReferences": [ + { + "apiVersion": "apps/v1", + "kind": "DaemonSet", + "name": "kube-proxy", + "uid": 
"45640bf6-44e5-11e9-9920-423525a6b683", + "controller": true, + "blockOwnerDeletion": true + } + ] + }, + "apiVersion": "v1", + "kind": "Pod" + } + ] + }' + deployments_json = '{ + "items": [ + { + "metadata": { + "name": "diliprdeploymentnodeapps", + "namespace": "default", + "selfLink": "/apis/extensions/v1beta1/namespaces/default/deployments/diliprdeploymentnodeapps", + "uid": "ee1b111d-526e-11e9-a899-6a5520730c61", + "resourceVersion": "4597575", + "generation": 1, + "creationTimestamp": "2019-03-29T22:06:40Z", + "labels": { + "diliprdeploymentLabel1": "d1", + "diliprdeploymentLabel2": "d2" + }, + "annotations": { + "deployment.kubernetes.io/revision": "1", + "kubectl.kubernetes.io/last-applied-configuration": "{\"apiVersion\":\"apps/v1beta1\",\"kind\":\"Deployment\",\"metadata\":{\"annotations\":{},\"labels\":{\"diliprdeploymentLabel1\":\"d1\",\"diliprdeploymentLabel2\":\"d2\"},\"name\":\"diliprdeploymentnodeapps\",\"namespace\":\"default\"},\"spec\":{\"replicas\":1,\"selector\":{\"matchLabels\":{\"app\":\"diliprsnodeapppod\"}},\"template\":{\"metadata\":{\"labels\":{\"app\":\"diliprsnodeapppod\",\"diliprPodLabel1\":\"p1\",\"diliprPodLabel2\":\"p2\"}},\"spec\":{\"containers\":[{\"image\":\"rdilip83/logeverysecond:v2\",\"name\":\"diliprcontainerhelloapp\"}]}}}}\n" + } + }, + "spec": { + "replicas": 1, + "selector": { + "matchLabels": { + "app": "diliprsnodeapppod" + } + }, + "template": { + "metadata": { + "creationTimestamp": null, + "labels": { + "app": "diliprsnodeapppod", + "diliprPodLabel1": "p1", + "diliprPodLabel2": "p2" + } + }, + "spec": { + "containers": [ + { + "name": "diliprcontainerhelloapp", + "image": "rdilip83/logeverysecond:v2", + "resources": {}, + "terminationMessagePath": "/dev/termination-log", + "terminationMessagePolicy": "File", + "imagePullPolicy": "IfNotPresent" + } + ], + "restartPolicy": "Always", + "terminationGracePeriodSeconds": 30, + "dnsPolicy": "ClusterFirst", + "securityContext": {}, + "schedulerName": "default-scheduler" + } 
+ }, + "strategy": { + "type": "RollingUpdate", + "rollingUpdate": { + "maxUnavailable": "25%", + "maxSurge": "25%" + } + }, + "revisionHistoryLimit": 2, + "progressDeadlineSeconds": 600 + }, + "apiVersion": "extensions/v1beta1", + "kind": "Deployment" + } + ] + }' + nodes = JSON.parse(nodes_json) + pods = JSON.parse(pods_json) + deployments = JSON.parse(deployments_json) + resources = HealthKubernetesResources.instance + resources.node_inventory = nodes + resources.pod_inventory = pods + resources.deployment_inventory = deployments + #act + parsed_nodes = resources.get_nodes + parsed_workloads = resources.get_workload_names + + #assert + assert_equal parsed_nodes.size, 2 + assert_equal parsed_workloads.size, 3 + + assert_equal parsed_nodes, ['aks-nodepool1-19574989-0', 'aks-nodepool1-19574989-1'] + assert_equal parsed_workloads, ['default~~diliprdeploymentnodeapps', 'default~~rss-site', 'kube-system~~kube-proxy'] + end +end \ No newline at end of file diff --git a/test/code/plugin/health/health_missing_signal_generator_spec.rb b/test/code/plugin/health/health_missing_signal_generator_spec.rb new file mode 100644 index 000000000..98d65416d --- /dev/null +++ b/test/code/plugin/health/health_missing_signal_generator_spec.rb @@ -0,0 +1,79 @@ +require_relative '../test_helpers' +Dir[File.join(File.expand_path(File.dirname(__FILE__)), "../../../../source/code/plugin/health/*.rb")].reject{|f| f.include?('health_monitor_utils')}.each {|file| require file} +include HealthModel +include Minitest + +describe "HealthMissingSignalGenerator spec" do + it 'generates missing node signals' do + #arrange + resources = Mock.new + resources.expect(:get_nodes, ["node1"]) + resources.expect(:get_workload_names, ["default~~workload1"]) + + provider = Mock.new + provider.expect(:get_node_labels, {HealthMonitorLabels::HOSTNAME => "node1"}, ["node1"]) + + node1_cpu_record = Mock.new + def node1_cpu_record.monitor_id; "node_cpu_utilization"; end + def node1_cpu_record.monitor_instance_id; 
"node_cpu_utilization"; end + def node1_cpu_record.labels; {HealthMonitorLabels::HOSTNAME => "node1"}; end + def node1_cpu_record.config; {}; end + def node1_cpu_record.state; "pass"; end + + node1_memory_record = Mock.new + def node1_memory_record.monitor_id; "node_memory_utilization"; end + def node1_memory_record.monitor_instance_id; "node_memory_utilization"; end + def node1_memory_record.labels; {HealthMonitorLabels::HOSTNAME => "node1"}; end + def node1_memory_record.config; {}; end + def node1_memory_record.state; "pass"; end + + node1_condition_record = Mock.new + def node1_condition_record.monitor_id; "node_condition"; end + def node1_condition_record.monitor_instance_id; "node_condition-0c593682737a955dc8e0947ad12754fe"; end + def node1_condition_record.labels; {HealthMonitorLabels::HOSTNAME => "node1"}; end + def node1_condition_record.config; {}; end + def node1_condition_record.state; "pass"; end + + + workload1_pods_ready_record = Mock.new + def workload1_pods_ready_record.monitor_id; "user_workload_pods_ready"; end + def workload1_pods_ready_record.monitor_instance_id; "user_workload_pods_ready-workload1"; end + def workload1_pods_ready_record.labels; {HealthMonitorLabels::NAMESPACE => "default", HealthMonitorLabels::WORKLOAD_NAME => "workload1"}; end + def workload1_pods_ready_record.config; {}; end + def workload1_pods_ready_record.state; "pass"; end + + generator = HealthMissingSignalGenerator.new + generator.update_last_received_records([node1_cpu_record, node1_memory_record, node1_condition_record, workload1_pods_ready_record]) + + #act + missing = generator.get_missing_signals('fake_cluster_id', [node1_cpu_record, node1_memory_record], resources, provider) + + #assert + assert_equal missing.size, 2 + + assert_equal missing[0].monitor_id, "node_condition" + assert_equal missing[0].state, "unknown" + assert_equal missing[0].monitor_instance_id, "node_condition-0c593682737a955dc8e0947ad12754fe" + + assert_equal missing[1].monitor_id, 
"user_workload_pods_ready" + assert_equal missing[1].state, "unknown" + assert_equal missing[1].monitor_instance_id, "user_workload_pods_ready-workload1" + + #arrange + resources.expect(:get_nodes, ["node1"]) + resources.expect(:get_workload_names, ["default~~workload1"]) + provider.expect(:get_node_labels, {HealthMonitorLabels::HOSTNAME => "node1"}, ["node1"]) + generator.update_last_received_records([node1_cpu_record, node1_memory_record]) + #act + missing = generator.get_missing_signals('fake_cluster_id', [node1_cpu_record, node1_memory_record], resources, provider) + #assert + assert_equal missing.size, 2 + assert_equal missing[0].monitor_id, "node_condition" + assert_equal missing[0].state, "unknown" + assert_equal missing[0].monitor_instance_id, "node_condition-0c593682737a955dc8e0947ad12754fe" + + assert_equal missing[1].monitor_id, "user_workload_pods_ready" + assert_equal missing[1].state, "none" + assert_equal missing[1].monitor_instance_id, "user_workload_pods_ready-workload1" + end +end \ No newline at end of file diff --git a/test/code/plugin/health/health_model_buffer_spec.rb b/test/code/plugin/health/health_model_buffer_spec.rb new file mode 100644 index 000000000..259513c08 --- /dev/null +++ b/test/code/plugin/health/health_model_buffer_spec.rb @@ -0,0 +1,25 @@ +require_relative '../../../../source/code/plugin/health/health_model_buffer' +require_relative '../test_helpers' + +include HealthModel + +describe "HealthModelBuffer Spec" do + it "get_buffer returns the correct buffer data" do + # Arrange + buffer = HealthModelBuffer.new + # Act + buffer.add_to_buffer(['mockRecord']) + # Assert + assert_equal buffer.get_buffer.length, 1 + + #Act + buffer.add_to_buffer(['mockRecord1', 'mockRecord2']) + #Assert + assert_equal buffer.get_buffer.length, 3 + + #Act + buffer.reset_buffer + #Assert + assert_equal buffer.get_buffer.length, 0 + end +end \ No newline at end of file diff --git a/test/code/plugin/health/health_model_builder_spec.rb 
b/test/code/plugin/health/health_model_builder_spec.rb new file mode 100644 index 000000000..c49e6c92a --- /dev/null +++ b/test/code/plugin/health/health_model_builder_spec.rb @@ -0,0 +1,37 @@ +require_relative '../test_helpers' +# consider doing this in test_helpers.rb so that this code is common +Dir[File.join(File.expand_path(File.dirname(__FILE__)), "../../../../source/code/plugin/health/*.rb")].reject{|f| f.include?('health_monitor_utils')}.each { |file| require file } +include HealthModel +include Minitest + +describe "HealthModelBuilder spec" do + it "Verify hierarchy builder and finalizer public methods are called" do + #arrange + mock_hierarchy_builder = Mock::new + health_record = Mock::new + mock_monitor_set = Mock::new + mock_state_finalizer = Mock::new + mock_hierarchy_builder.expect(:process_record, nil, [health_record, mock_monitor_set]) + mock_state_finalizer.expect(:finalize, {}, [mock_monitor_set]) + def mock_monitor_set.get_map; {}; end + + #act + builder = HealthModelBuilder.new(mock_hierarchy_builder, [mock_state_finalizer], mock_monitor_set) + builder.process_records([health_record]) + builder.finalize_model + #assert + assert mock_hierarchy_builder.verify + assert mock_state_finalizer.verify + end + + it "Verify finalize_model raises if state_finalizers is empty" do + #arrange + mock_hierarchy_builder = Mock.new + mock_monitor_set = Mock.new + builder = HealthModelBuilder.new(mock_hierarchy_builder, [], mock_monitor_set) + #act and assert + assert_raises do + builder.finalize_model + end + end +end \ No newline at end of file diff --git a/test/code/plugin/health/health_model_builder_test.rb b/test/code/plugin/health/health_model_builder_test.rb new file mode 100644 index 000000000..df921049c --- /dev/null +++ b/test/code/plugin/health/health_model_builder_test.rb @@ -0,0 +1,337 @@ +require 'test/unit' +require 'json' +# require_relative '../../../source/code/plugin/health' + +Dir[File.join(__dir__, '../../../../source/code/plugin/health', 
'*.rb')].each { |file| require file } + +class FilterHealthModelBuilderTest < Test::Unit::TestCase + include HealthModel + + def test_event_stream + #setup + health_definition_path = File.join(__dir__, '../../../../installer/conf/health_model_definition.json') + health_model_definition = ParentMonitorProvider.new(HealthModelDefinitionParser.new(health_definition_path).parse_file) + monitor_factory = MonitorFactory.new + hierarchy_builder = HealthHierarchyBuilder.new(health_model_definition, monitor_factory) + # TODO: Figure out if we need to add NodeMonitorHierarchyReducer to the list of finalizers. For now, dont compress/optimize, since it becomes impossible to construct the model on the UX side + state_finalizers = [AggregateMonitorStateFinalizer.new] + monitor_set = MonitorSet.new + model_builder = HealthModelBuilder.new(hierarchy_builder, state_finalizers, monitor_set) + + nodes_file_map = { + #"extra" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/extra_nodes.json", + "first" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + #"first-nosecondnode" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + "second" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + "third" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + #"fourth" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + #"missing" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + #"kube_api_down" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + } + + pods_file_map = { + #"extra" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/extra_pods.json", + "first" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + #"first-nosecondnode" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + "second" => 
"C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + "third" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + #"fourth" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + #"missing" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + #"kube_api_down" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + } + + cluster_labels = { + 'container.azm.ms/cluster-region' => 'eastus', + 'container.azm.ms/cluster-subscription-id' => '72c8e8ca-dc16-47dc-b65c-6b5875eb600a', + 'container.azm.ms/cluster-resource-group' => 'dilipr-health-test', + 'container.azm.ms/cluster-name' => 'dilipr-health-test' + } + + cluster_id = 'fake_cluster_id' + + #test + state = HealthMonitorState.new() + generator = HealthMissingSignalGenerator.new + + for scenario in ["first", "second", "third"] + mock_data_path = File.join(__dir__, "../../../../health_records/#{scenario}_daemon_set_signals.json") + file = File.read(mock_data_path) + records = JSON.parse(file) + + node_inventory = JSON.parse(File.read(nodes_file_map[scenario])) + pod_inventory = JSON.parse(File.read(pods_file_map[scenario])) + deployment_inventory = JSON.parse(File.read(File.join(__dir__, "../../../../inventory/deployments.json"))) + resources = HealthKubernetesResources.instance + resources.node_inventory = node_inventory + resources.pod_inventory = pod_inventory + resources.deployment_inventory = deployment_inventory + + workload_names = resources.get_workload_names + provider = HealthMonitorProvider.new(cluster_id, cluster_labels, resources, File.join(__dir__, "../../../..//installer/conf/healthmonitorconfig.json")) + + health_monitor_records = [] + records.each do |record| + monitor_instance_id = record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] + monitor_id = record[HealthMonitorRecordFields::MONITOR_ID] + health_monitor_record = HealthMonitorRecord.new( + 
record[HealthMonitorRecordFields::MONITOR_ID], + record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID], + record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED], + record[HealthMonitorRecordFields::DETAILS]["state"], + provider.get_labels(record), + provider.get_config(monitor_id), + record[HealthMonitorRecordFields::DETAILS] + ) + + state.update_state(health_monitor_record, + provider.get_config(health_monitor_record.monitor_id) + ) + + # get the health state based on the monitor's operational state + # update state calls updates the state of the monitor based on configuration and history of the the monitor records + health_monitor_record.state = state.get_state(monitor_instance_id).new_state + health_monitor_records.push(health_monitor_record) + instance_state = state.get_state(monitor_instance_id) + #puts "#{monitor_instance_id} #{instance_state.new_state} #{instance_state.old_state} #{instance_state.should_send}" + end + + + #handle kube api down + kube_api_down_handler = HealthKubeApiDownHandler.new + health_monitor_records = kube_api_down_handler.handle_kube_api_down(health_monitor_records) + + # Dedupe daemonset signals + # Remove unit monitor signals for “gone” objects + reducer = HealthSignalReducer.new() + reduced_records = reducer.reduce_signals(health_monitor_records, resources) + + cluster_id = 'fake_cluster_id' + + #get the list of 'none' and 'unknown' signals + missing_signals = generator.get_missing_signals(cluster_id, reduced_records, resources, provider) + #update state for missing signals + missing_signals.each{|signal| + state.update_state(signal, + provider.get_config(signal.monitor_id) + ) + } + generator.update_last_received_records(reduced_records) + reduced_records.push(*missing_signals) + + # build the health model + all_records = reduced_records + model_builder.process_records(all_records) + all_monitors = model_builder.finalize_model + + # update the state for aggregate monitors (unit monitors are updated above) + 
all_monitors.each{|monitor_instance_id, monitor|
+ if monitor.is_aggregate_monitor
+ state.update_state(monitor,
+ provider.get_config(monitor.monitor_id)
+ )
+ end
+
+ instance_state = state.get_state(monitor_instance_id)
+ #puts "#{monitor_instance_id} #{instance_state.new_state} #{instance_state.old_state} #{instance_state.should_send}"
+ should_send = instance_state.should_send
+
+ # always send cluster monitor as a heartbeat
+ if !should_send && monitor_instance_id != MonitorId::CLUSTER
+ all_monitors.delete(monitor_instance_id)
+ end
+ }
+
+ records_to_send = []
+ all_monitors.keys.each{|key|
+ record = provider.get_record(all_monitors[key], state)
+ #puts "#{record["MonitorInstanceId"]} #{record["OldState"]} #{record["NewState"]}"
+ }
+
+ if scenario == "first"
+ assert_equal 50, all_monitors.size
+ elsif scenario == "second"
+ assert_equal 34, all_monitors.size
+ elsif scenario == "third"
+ assert_equal 5, all_monitors.size
+ end
+ # for each key in monitor.keys,
+ # get the state from health_monitor_state
+ # generate the record to send
+ serializer = HealthStateSerializer.new(File.join(__dir__, '../../../../health_records/health_model_state.json'))
+ serializer.serialize(state)
+
+ deserializer = HealthStateDeserializer.new(File.join(__dir__, '../../../../health_records/health_model_state.json'))
+ deserialized_state = deserializer.deserialize
+
+ after_state = HealthMonitorState.new
+ after_state.initialize_state(deserialized_state)
+ end
+ end
+
+ def test_event_stream_aks_engine
+
+ #setup
+ health_definition_path = File.join(__dir__, '../../../../installer/conf/health_model_definition.json')
+ health_model_definition = ParentMonitorProvider.new(HealthModelDefinitionParser.new(health_definition_path).parse_file)
+ monitor_factory = MonitorFactory.new
+ hierarchy_builder = HealthHierarchyBuilder.new(health_model_definition, monitor_factory)
+ state_finalizers = [AggregateMonitorStateFinalizer.new]
+ monitor_set = MonitorSet.new
+ model_builder = 
HealthModelBuilder.new(hierarchy_builder, state_finalizers, monitor_set) + + nodes_file_map = { + #"extra" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/extra_nodes.json", + #"first" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + #"first-nosecondnode" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + #"second" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + #"third" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + #"fourth" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + #"missing" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + #"kube_api_down" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + "aks-engine-1" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/aks-engine/nodes.json", + "aks-engine-2" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/aks-engine/nodes.json", + "aks-engine-3" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/aks-engine/nodes.json", + } + + pods_file_map = { + #"extra" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/extra_pods.json", + #"first" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + #"first-nosecondnode" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + #"second" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + #"third" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + #"fourth" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + #"missing" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + #"kube_api_down" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + "aks-engine-1" => 
"C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/aks-engine/pods.json", + "aks-engine-2" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/aks-engine/pods.json", + "aks-engine-3" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/aks-engine/pods.json", + } + + cluster_labels = { + 'container.azm.ms/cluster-region' => 'eastus', + 'container.azm.ms/cluster-subscription-id' => '72c8e8ca-dc16-47dc-b65c-6b5875eb600a', + 'container.azm.ms/cluster-resource-group' => 'aks-engine-health', + 'container.azm.ms/cluster-name' => 'aks-engine-health' + } + + cluster_id = 'fake_cluster_id' + + #test + state = HealthMonitorState.new() + generator = HealthMissingSignalGenerator.new + + for scenario in 1..3 + mock_data_path = File.join(__dir__, "../../../../health_records/aks-engine/aks-engine-#{scenario}.json") + file = File.read(mock_data_path) + records = JSON.parse(file) + + node_inventory = JSON.parse(File.read(nodes_file_map["aks-engine-#{scenario}"])) + pod_inventory = JSON.parse(File.read(pods_file_map["aks-engine-#{scenario}"])) + deployment_inventory = JSON.parse(File.read(File.join(__dir__, "../../../../inventory/aks-engine/deployments.json"))) + resources = HealthKubernetesResources.instance + resources.node_inventory = node_inventory + resources.pod_inventory = pod_inventory + resources.deployment_inventory = deployment_inventory + + workload_names = resources.get_workload_names + provider = HealthMonitorProvider.new(cluster_id, cluster_labels, resources, File.join(__dir__, "../../../../installer/conf/healthmonitorconfig.json")) + + health_monitor_records = [] + records.each do |record| + monitor_instance_id = record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] + monitor_id = record[HealthMonitorRecordFields::MONITOR_ID] + health_monitor_record = HealthMonitorRecord.new( + record[HealthMonitorRecordFields::MONITOR_ID], + record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID], + 
record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED], + record[HealthMonitorRecordFields::DETAILS]["state"], + provider.get_labels(record), + provider.get_config(monitor_id), + record[HealthMonitorRecordFields::DETAILS] + ) + + state.update_state(health_monitor_record, + provider.get_config(health_monitor_record.monitor_id) + ) + + # get the health state based on the monitor's operational state + # update state calls updates the state of the monitor based on configuration and history of the the monitor records + health_monitor_record.state = state.get_state(monitor_instance_id).new_state + health_monitor_records.push(health_monitor_record) + instance_state = state.get_state(monitor_instance_id) + #puts "#{monitor_instance_id} #{instance_state.new_state} #{instance_state.old_state} #{instance_state.should_send}" + end + + + #handle kube api down + kube_api_down_handler = HealthKubeApiDownHandler.new + health_monitor_records = kube_api_down_handler.handle_kube_api_down(health_monitor_records) + + # Dedupe daemonset signals + # Remove unit monitor signals for “gone” objects + reducer = HealthSignalReducer.new() + reduced_records = reducer.reduce_signals(health_monitor_records, resources) + + cluster_id = 'fake_cluster_id' + + #get the list of 'none' and 'unknown' signals + missing_signals = generator.get_missing_signals(cluster_id, reduced_records, resources, provider) + #update state for missing signals + missing_signals.each{|signal| + state.update_state(signal, + provider.get_config(signal.monitor_id) + ) + } + generator.update_last_received_records(reduced_records) + reduced_records.push(*missing_signals) + + # build the health model + all_records = reduced_records + model_builder.process_records(all_records) + all_monitors = model_builder.finalize_model + + # update the state for aggregate monitors (unit monitors are updated above) + all_monitors.each{|monitor_instance_id, monitor| + if monitor.is_aggregate_monitor + state.update_state(monitor, + 
provider.get_config(monitor.monitor_id) + ) + end + + instance_state = state.get_state(monitor_instance_id) + #puts "#{monitor_instance_id} #{instance_state.new_state} #{instance_state.old_state} #{instance_state.should_send}" + should_send = instance_state.should_send + + # always send cluster monitor as a heartbeat + if !should_send && monitor_instance_id != MonitorId::CLUSTER + all_monitors.delete(monitor_instance_id) + end + } + + records_to_send = [] + all_monitors.keys.each{|key| + record = provider.get_record(all_monitors[key], state) + #puts "#{record["MonitorInstanceId"]} #{record["OldState"]} #{record["NewState"]}" + } + + if scenario == 1 + assert_equal 58, all_monitors.size + elsif scenario == 2 + assert_equal 37, all_monitors.size + elsif scenario == 3 + assert_equal 6, all_monitors.size + end + # for each key in monitor.keys, + # get the state from health_monitor_state + # generate the record to send + serializer = HealthStateSerializer.new(File.join(__dir__, '../../../../health_records\health_model_state_aks-engine.json')) + serializer.serialize(state) + + deserializer = HealthStateDeserializer.new(File.join(__dir__, '../../../../health_records\health_model_state_aks-engine.json')) + deserialized_state = deserializer.deserialize + + after_state = HealthMonitorState.new + after_state.initialize_state(deserialized_state) + end + end +end \ No newline at end of file diff --git a/test/code/plugin/health/health_model_definition_parser_spec.rb b/test/code/plugin/health/health_model_definition_parser_spec.rb new file mode 100644 index 000000000..56551510b --- /dev/null +++ b/test/code/plugin/health/health_model_definition_parser_spec.rb @@ -0,0 +1,24 @@ +require_relative '../test_helpers' +# consider doing this in test_helpers.rb so that this code is common +Dir[File.join(File.expand_path(File.dirname(__FILE__)), "../../../../source/code/plugin/health/*.rb")].reject{|f| f.include?('health_monitor_utils')}.each { |file| require file } +include HealthModel + 
+describe "HealthModelDefinitionParser spec " do + it "parses the definition file correctly with the right conditions" do + #arrange + + parser = HealthModelDefinitionParser.new(File.join(File.expand_path(File.dirname(__FILE__)), 'test_health_model_definition.json')) + #act + model_definition = parser.parse_file + + #assert + assert_equal model_definition['conditional_monitor_id'].key?("conditions"), true + assert_equal model_definition['conditional_monitor_id']["conditions"].size, 2 + assert_equal model_definition['conditional_monitor_id'].key?("parent_monitor_id"), false + + #assert + assert_equal model_definition['monitor_id'].key?("conditions"), false + assert_equal model_definition['monitor_id'].key?("parent_monitor_id"), true + end + +end \ No newline at end of file diff --git a/test/code/plugin/health/health_monitor_state_spec.rb b/test/code/plugin/health/health_monitor_state_spec.rb new file mode 100644 index 000000000..5fa8a6c6e --- /dev/null +++ b/test/code/plugin/health/health_monitor_state_spec.rb @@ -0,0 +1,176 @@ +require_relative '../test_helpers' +# consider doing this in test_helpers.rb so that this code is common +Dir[File.join(File.expand_path(File.dirname(__FILE__)), "../../../../source/code/plugin/health/*.rb")].reject{|f| f.include?('health_monitor_utils')}.each { |file| require file } +include HealthModel +include Minitest + +describe "HealthMonitorState spec" do + it 'updates should_send to true for monitors which hasnt been sent before' do + #arrange + state = HealthMonitorState.new + mock_monitor = Mock.new + def mock_monitor.state; "pass"; end + def mock_monitor.monitor_id; "monitor_id"; end + def mock_monitor.monitor_instance_id; "monitor_instance_id"; end + def mock_monitor.transition_date_time; Time.now.utc.iso8601; end + def mock_monitor.details; {"state" => "pass", "timestamp" => Time.now.utc.iso8601, "details" => {}}; end + + #act + state.update_state(mock_monitor, {}) + monitor_state = state.get_state("monitor_instance_id") + 
#assert + monitor_state.should_send.must_equal true + monitor_state.old_state.must_equal "none" + monitor_state.new_state.must_equal "pass" + end + + it 'updates should_send to true for monitors which need no consistent state change' do + #arrange + state = HealthMonitorState.new + mock_monitor = Mock.new + def mock_monitor.state; "pass"; end + def mock_monitor.monitor_id; "monitor_id"; end + def mock_monitor.monitor_instance_id; "monitor_instance_id"; end + def mock_monitor.transition_date_time; Time.now.utc.iso8601; end + def mock_monitor.details; {"state" => "pass", "timestamp" => Time.now.utc.iso8601, "details" => {}}; end + + #act + state.update_state(mock_monitor, {}) + monitor_state = state.get_state("monitor_instance_id") + #assert + monitor_state.should_send.must_equal true + monitor_state.old_state.must_equal "none" + monitor_state.new_state.must_equal "pass" + + #arrange + def mock_monitor.state; "fail"; end + def mock_monitor.details; {"state" => "fail", "timestamp" => Time.now.utc.iso8601, "details" => {}}; end + #act + state.update_state(mock_monitor, {}) + monitor_state = state.get_state("monitor_instance_id") + #assert + monitor_state.should_send.must_equal true + monitor_state.old_state.must_equal "pass" + monitor_state.new_state.must_equal "fail" + end + + it 'updates should_send to false for monitors which need consistent state change and has no consistent state change' do + #arrange + state = HealthMonitorState.new + mock_monitor = Mock.new + def mock_monitor.state; "pass"; end + def mock_monitor.monitor_id; "monitor_id"; end + def mock_monitor.monitor_instance_id; "monitor_instance_id"; end + def mock_monitor.transition_date_time; Time.now.utc.iso8601; end + def mock_monitor.details; {"state" => "pass", "timestamp" => Time.now.utc.iso8601, "details" => {}}; end + + config = JSON.parse('{ + "WarnThresholdPercentage": 80.0, + "FailThresholdPercentage": 90.0, + "ConsecutiveSamplesForStateTransition": 3 + }') + #act + 
state.update_state(mock_monitor, config) + monitor_state = state.get_state("monitor_instance_id") + #assert + monitor_state.should_send.must_equal true + + #arrange + def mock_monitor.state; "fail"; end + def mock_monitor.details; {"state" => "fail", "timestamp" => Time.now.utc.iso8601, "details" => {}}; end + #act + state.update_state(mock_monitor, config) + monitor_state = state.get_state("monitor_instance_id") + #assert + monitor_state.should_send.must_equal false + end + + it 'updates should_send to true for monitors which need consistent state change and has a consistent state change' do + #arrange + state = HealthMonitorState.new + mock_monitor = Mock.new + def mock_monitor.state; "pass"; end + def mock_monitor.monitor_id; "monitor_id"; end + def mock_monitor.monitor_instance_id; "monitor_instance_id"; end + def mock_monitor.transition_date_time; Time.now.utc.iso8601; end + def mock_monitor.details; {"state" => "pass", "timestamp" => Time.now.utc.iso8601, "details" => {}}; end + + config = JSON.parse('{ + "WarnThresholdPercentage": 80.0, + "FailThresholdPercentage": 90.0, + "ConsecutiveSamplesForStateTransition": 3 + }') + #act + state.update_state(mock_monitor, config) + monitor_state = state.get_state("monitor_instance_id") + #assert + monitor_state.should_send.must_equal true + + #arrange + def mock_monitor.state; "fail"; end + def mock_monitor.details; {"state" => "fail", "timestamp" => Time.now.utc.iso8601, "details" => {}}; end + #act + state.update_state(mock_monitor, config) + monitor_state = state.get_state("monitor_instance_id") + #assert + monitor_state.should_send.must_equal false + + #act + state.update_state(mock_monitor, config) + state.update_state(mock_monitor, config) + monitor_state = state.get_state("monitor_instance_id") + #assert + monitor_state.should_send.must_equal true + monitor_state.old_state.must_equal "none" + monitor_state.new_state.must_equal "fail" + end + + it 'updates should_send to false for monitors which need consistent 
state change and has NO state change' do + #arrange + state = HealthMonitorState.new + mock_monitor = Mock.new + def mock_monitor.state; "pass"; end + def mock_monitor.monitor_id; "monitor_id"; end + def mock_monitor.monitor_instance_id; "monitor_instance_id"; end + def mock_monitor.transition_date_time; Time.now.utc.iso8601; end + def mock_monitor.details; {"state" => "pass", "timestamp" => Time.now.utc.iso8601, "details" => {}}; end + + config = JSON.parse('{ + "WarnThresholdPercentage": 80.0, + "FailThresholdPercentage": 90.0, + "ConsecutiveSamplesForStateTransition": 3 + }') + #act + state.update_state(mock_monitor, config) + monitor_state = state.get_state("monitor_instance_id") + #assert + monitor_state.should_send.must_equal true + monitor_state.old_state.must_equal "none" + monitor_state.new_state.must_equal "none" + + + #arrange + def mock_monitor.state; "pass"; end + def mock_monitor.details; {"state" => "pass", "timestamp" => Time.now.utc.iso8601, "details" => {}}; end + #act + state.update_state(mock_monitor, config) + monitor_state = state.get_state("monitor_instance_id") + #assert + monitor_state.should_send.must_equal false + + #act + state.update_state(mock_monitor, config) + monitor_state.should_send.must_equal true + monitor_state.old_state.must_equal "none" + monitor_state.new_state.must_equal "pass" + + #act + state.update_state(mock_monitor, config) + monitor_state = state.get_state("monitor_instance_id") + #assert + monitor_state.should_send.must_equal false + monitor_state.old_state.must_equal "none" + monitor_state.new_state.must_equal "pass" + end + +end \ No newline at end of file diff --git a/test/code/plugin/health/health_signal_reducer_spec.rb b/test/code/plugin/health/health_signal_reducer_spec.rb new file mode 100644 index 000000000..f71a5c509 --- /dev/null +++ b/test/code/plugin/health/health_signal_reducer_spec.rb @@ -0,0 +1,96 @@ +require_relative '../test_helpers' +# consider doing this in test_helpers.rb so that this code is 
common +Dir[File.join(File.expand_path(File.dirname(__FILE__)), "../../../../source/code/plugin/health/*.rb")].reject{|f| f.include?('health_monitor_utils')}.each { |file| require file } +include HealthModel +include Minitest + +describe "HealthSignalReducer spec" do + it "returns the right set of records -- no reduction" do + #arrange + record1 = Mock.new + def record1.monitor_id; "node_cpu_utilization"; end + def record1.monitor_instance_id; "node_cpu_utilization-node1"; end + def record1.labels; {HealthMonitorLabels::HOSTNAME => "node1"}; end + inventory = Mock.new + def inventory.get_nodes; ["node1"]; end + def inventory.get_workload_names; []; end + reducer = HealthSignalReducer.new + #act + reduced = reducer.reduce_signals([record1], inventory) + #Assert + assert_equal reduced.size, 1 + end + + it "returns only the latest record if multiple records are present for the same monitor" do + #arrange + record1 = Mock.new + def record1.monitor_id; "node_cpu_utilization"; end + def record1.monitor_instance_id; "node_cpu_utilization-node1"; end + def record1.labels; {HealthMonitorLabels::HOSTNAME => "node1"}; end + def record1.transition_date_time; Time.now.utc.iso8601 ; end + + + record2 = Mock.new + def record2.monitor_id; "node_cpu_utilization"; end + def record2.monitor_instance_id; "node_cpu_utilization-node1"; end + def record2.labels; {HealthMonitorLabels::HOSTNAME => "node1"}; end + def record2.transition_date_time; "#{Time.now.utc.iso8601}" ; end + + inventory = Mock.new + def inventory.get_nodes; ["node1"]; end + def inventory.get_workload_names; []; end + reducer = HealthSignalReducer.new + #act + reduced = reducer.reduce_signals([record1, record2], inventory) + #Assert + assert_equal reduced.size, 1 + end + + it "returns only those records if the node is present in the inventory" do + #arrange + record1 = Mock.new + def record1.monitor_id; "node_cpu_utilization"; end + def record1.monitor_instance_id; "node_cpu_utilization-node1"; end + def 
record1.labels; {HealthMonitorLabels::HOSTNAME => "node1"}; end
+ inventory = Mock.new
+ def inventory.get_nodes; ["node2"]; end
+ def inventory.get_workload_names; []; end
+
+ #act
+ reducer = HealthSignalReducer.new
+ #assert
+ assert_equal reducer.reduce_signals([record1], inventory).size, 0
+ end
+
+ it "returns only those records if the workload name is present in the inventory" do
+ #arrange
+ record1 = Mock.new
+ def record1.monitor_id; "user_workload_pods_ready"; end
+ def record1.monitor_instance_id; "user_workload_pods_ready-workload1"; end
+ def record1.labels; {HealthMonitorLabels::NAMESPACE => "default", HealthMonitorLabels::WORKLOAD_NAME => "workload1"}; end
+ def record1.transition_date_time; Time.now.utc.iso8601 ; end
+
+ inventory = Mock.new
+ def inventory.get_nodes; ["node2"]; end
+ def inventory.get_workload_names; ["default~~workload1"]; end
+ reducer = HealthSignalReducer.new
+
+ #act
+ reduced = reducer.reduce_signals([record1], inventory)
+
+ #assert
+ assert_equal reduced.size, 1
+
+ #arrange
+ record2 = Mock.new
+ def record2.monitor_id; "user_workload_pods_ready"; end
+ def record2.monitor_instance_id; "user_workload_pods_ready-workload2"; end
+ def record2.labels; {HealthMonitorLabels::NAMESPACE => "default1", HealthMonitorLabels::WORKLOAD_NAME => "workload2"}; end
+ def record2.transition_date_time; Time.now.utc.iso8601 ; end
+ #act
+ reduced = reducer.reduce_signals([record1, record2], inventory)
+ #assert
+ assert_equal reduced.size, 1
+ end
+
+end
diff --git a/test/code/plugin/health/kube_api_down_handler_spec.rb b/test/code/plugin/health/kube_api_down_handler_spec.rb
new file mode 100644
index 000000000..3f3f9b37f
--- /dev/null
+++ b/test/code/plugin/health/kube_api_down_handler_spec.rb
@@ -0,0 +1,26 @@
+require_relative '../test_helpers'
+Dir[File.join(File.expand_path(File.dirname(__FILE__)), "../../../../source/code/plugin/health/*.rb")].reject{|f| f.include?('health_monitor_utils')}.each { |file| require file }
+include 
HealthModel + +describe "KubeApiDownHandler spec" do + it "updates states for monitors in monitors_to_change" do + #arrange + record1 = HealthMonitorRecord.new("node_condition", "node_condition-node1", Time.now.utc.iso8601, "pass", {}, {}, {}) + record2 = HealthMonitorRecord.new("kube_api_status", "kube_api_status", Time.now.utc.iso8601, "fail", {}, {}, {}) + record3 = HealthMonitorRecord.new("user_workload_pods_ready", "user_workload_pods_ready-workload1", Time.now.utc.iso8601, "pass", {}, {}, {}) + record4 = HealthMonitorRecord.new("system_workload_pods_ready", "system_workload_pods_ready-workload2", Time.now.utc.iso8601, "pass", {}, {}, {}) + record5 = HealthMonitorRecord.new("subscribed_capacity_cpu", "subscribed_capacity_cpu", Time.now.utc.iso8601, "pass", {}, {}, {}) + record6 = HealthMonitorRecord.new("subscribed_capacity_memory", "subscribed_capacity_memory", Time.now.utc.iso8601, "pass", {}, {}, {}) + handler = HealthKubeApiDownHandler.new + + #act + handler.handle_kube_api_down([record1, record2, record3, record4, record5, record6]) + #assert + assert_equal record1.state, HealthMonitorStates::UNKNOWN + assert_equal record3.state, HealthMonitorStates::UNKNOWN + assert_equal record4.state, HealthMonitorStates::UNKNOWN + assert_equal record5.state, HealthMonitorStates::UNKNOWN + assert_equal record6.state, HealthMonitorStates::UNKNOWN + + end +end diff --git a/test/code/plugin/health/monitor_factory_spec.rb b/test/code/plugin/health/monitor_factory_spec.rb new file mode 100644 index 000000000..2135808bd --- /dev/null +++ b/test/code/plugin/health/monitor_factory_spec.rb @@ -0,0 +1,28 @@ +require_relative '../test_helpers' +# consider doing this in test_helpers.rb so that this code is common +Dir[File.join(File.expand_path(File.dirname(__FILE__)), "../../../../source/code/plugin/health/*.rb")].reject{|f| f.include?('health_monitor_utils')}.each { |file| require file } +include HealthModel + +describe "MonitorFactory Spec" do + it "returns UnitMonitor for 
create_unit_monitor" do + #Arrange + factory = MonitorFactory.new() + monitor_record = HealthMonitorRecord.new(:monitor_id, :monitor_instance_id, :time, :pass, {}, {}, {}) + #act + monitor = factory.create_unit_monitor(monitor_record) + # assert + monitor.must_be_kind_of(UnitMonitor) + end + + it "returns AggregateMonitor for create_aggregate_monitor" do + #arrange + factory = MonitorFactory.new() + mock = Minitest::Mock.new + def mock.state; :pass; end + def mock.transition_date_time; :time; end + #act + monitor = factory.create_aggregate_monitor(:monitor_id, :monitor_instance_id, :pass, {}, {}, mock) + #assert + monitor.must_be_kind_of(AggregateMonitor) + end +end \ No newline at end of file diff --git a/test/code/plugin/health/monitor_set_spec.rb b/test/code/plugin/health/monitor_set_spec.rb new file mode 100644 index 000000000..1f4e970be --- /dev/null +++ b/test/code/plugin/health/monitor_set_spec.rb @@ -0,0 +1,58 @@ +require_relative '../test_helpers' +# consider doing this in test_helpers.rb so that this code is common +Dir[File.join(File.expand_path(File.dirname(__FILE__)), "../../../../source/code/plugin/health/*.rb")].reject{|f| f.include?('health_monitor_utils')}.each { |file| require file } +include HealthModel + +describe "MonitorSet Spec" do + it "add_or_update -- adds a monitor" do + #arrange + set = MonitorSet.new + mock_monitor = MiniTest::Mock.new + def mock_monitor.monitor_instance_id; "monitor_instance_id_1"; end + def mock_monitor.state; :pass;end + #act + set.add_or_update(mock_monitor) + #assert + assert_equal set.get_map.size, 1 + assert_equal set.get_map.key?("monitor_instance_id_1"), true + end + + it "add_or_update -- updates a monitor" do + #arrange + set = MonitorSet.new + mock_monitor = MiniTest::Mock.new + def mock_monitor.monitor_instance_id; "monitor_instance_id_1"; end + def mock_monitor.state; :pass;end + #act + set.add_or_update(mock_monitor) + #assert + assert_equal set.get_map["monitor_instance_id_1"].state, :pass + + #act + def 
mock_monitor.state; :fail;end + set.add_or_update(mock_monitor) + #assert + assert_equal set.get_map["monitor_instance_id_1"].state, :fail + end + + it "delete -- delete a monitor" do + #arrange + set = MonitorSet.new + mock_monitor = MiniTest::Mock.new + def mock_monitor.monitor_instance_id; "monitor_instance_id_1"; end + def mock_monitor.state; :pass;end + set.add_or_update(mock_monitor) + + #act + set.delete("monitor_instance_id_1") + #assert + assert_equal set.get_map.size, 0 + end + + it "get_map -- returns a hash" do + #arrange + set = MonitorSet.new + #act and assert + set.get_map.must_be_kind_of(Hash) + end +end diff --git a/test/code/plugin/health/parent_monitor_provider_spec.rb b/test/code/plugin/health/parent_monitor_provider_spec.rb new file mode 100644 index 000000000..a83db50fc --- /dev/null +++ b/test/code/plugin/health/parent_monitor_provider_spec.rb @@ -0,0 +1,144 @@ +require_relative '../test_helpers' +Dir[File.join(File.expand_path(File.dirname(__FILE__)), "../../../../source/code/plugin/health/*.rb")].reject{|f| f.include?('health_monitor_utils')}.each { |file| require file } +include HealthModel +include Minitest + +describe "ParentMonitorProvider spec" do + it 'returns correct parent_monitor_id for a non-condition case' do + #arrange + definition = JSON.parse('{ + "monitor_id" : { + "parent_monitor_id": "parent_monitor_id", + "labels": [ + "label_1", + "label_2" + ] + } + }' + ) + health_model_definition = ParentMonitorProvider.new(definition) + + monitor = Mock.new + def monitor.monitor_id; "monitor_id"; end + def monitor.monitor_instance_id; "monitor_instance_id"; end + + #act + parent_id = health_model_definition.get_parent_monitor_id(monitor) + #assert + assert_equal parent_id, "parent_monitor_id" + end + + it 'returns raises for an incorrect monitor id' do + #arrange + definition = JSON.parse('{ + "monitor_id" : { + "parent_monitor_id": "parent_monitor_id", + "labels": [ + "label_1", + "label_2" + ] + } + }' + ) + health_model_definition 
= ParentMonitorProvider.new(definition) + + monitor = Mock.new + def monitor.monitor_id; "monitor_id_!"; end + def monitor.monitor_instance_id; "monitor_instance_id"; end + + #act and assert + assert_raises do + parent_id = health_model_definition.get_parent_monitor_id(monitor) + end + end + + it 'returns correct parent_monitor_id for a conditional case' do + #arrange + definition = JSON.parse('{"conditional_monitor_id": { + "conditions": [ + { + "key": "kubernetes.io/role", + "operator": "==", + "value": "master", + "parent_id": "master_node_pool" + }, + { + "key": "kubernetes.io/role", + "operator": "==", + "value": "agent", + "parent_id": "agent_node_pool" + } + ], + "labels": [ + "kubernetes.io/hostname", + "agentpool", + "kubernetes.io/role", + "container.azm.ms/cluster-region", + "container.azm.ms/cluster-subscription-id", + "container.azm.ms/cluster-resource-group", + "container.azm.ms/cluster-name" + ], + "aggregation_algorithm": "worstOf", + "aggregation_algorithm_params": null + } + + }' + ) + health_model_definition = ParentMonitorProvider.new(definition) + + monitor = Mock.new + def monitor.monitor_id; "conditional_monitor_id"; end + def monitor.monitor_instance_id; "conditional_monitor_instance_id"; end + def monitor.labels; {HealthMonitorLabels::ROLE => "master"}; end + + #act + parent_id = health_model_definition.get_parent_monitor_id(monitor) + #assert + assert_equal parent_id, "master_node_pool" + end + + it 'raises if conditions are not met' do + #arrange + definition = JSON.parse('{"conditional_monitor_id": { + "conditions": [ + { + "key": "kubernetes.io/role", + "operator": "==", + "value": "master", + "parent_id": "master_node_pool" + }, + { + "key": "kubernetes.io/role", + "operator": "==", + "value": "agent", + "parent_id": "agent_node_pool" + } + ], + "labels": [ + "kubernetes.io/hostname", + "agentpool", + "kubernetes.io/role", + "container.azm.ms/cluster-region", + "container.azm.ms/cluster-subscription-id", + 
"container.azm.ms/cluster-resource-group", + "container.azm.ms/cluster-name" + ], + "aggregation_algorithm": "worstOf", + "aggregation_algorithm_params": null + } + + }' + ) + health_model_definition = ParentMonitorProvider.new(definition) + + monitor = Mock.new + def monitor.monitor_id; "conditional_monitor_id"; end + def monitor.monitor_instance_id; "conditional_monitor_instance_id"; end + def monitor.labels; {HealthMonitorLabels::ROLE => "master1"}; end + + #act and assert + assert_raises do + parent_id = health_model_definition.get_parent_monitor_id(monitor) + end + end +end diff --git a/test/code/plugin/health/test_health_model_definition.json b/test/code/plugin/health/test_health_model_definition.json new file mode 100644 index 000000000..31d219705 --- /dev/null +++ b/test/code/plugin/health/test_health_model_definition.json @@ -0,0 +1,42 @@ +[ + { + "monitor_id": "monitor_id", + "parent_monitor_id": "parent_monitor_id", + "labels": [ + "container.azm.ms/namespace", + "container.azm.ms/workload-name", + "container.azm.ms/workload-kind", + "container.azm.ms/cluster-region", + "container.azm.ms/cluster-subscription-id", + "container.azm.ms/cluster-resource-group", + "container.azm.ms/cluster-name" + ] + }, + { + "monitor_id": "conditional_monitor_id", + "aggregation_algorithm": "worstOf", + "labels": [ + "kubernetes.io/hostname", + "agentpool", + "kubernetes.io/role", + "container.azm.ms/cluster-region", + "container.azm.ms/cluster-subscription-id", + "container.azm.ms/cluster-resource-group", + "container.azm.ms/cluster-name" + ], + "parent_monitor_id": [ + { + "label": "kubernetes.io/role", + "operator": "==", + "value": "master", + "id": "master_node_pool" + }, + { + "label": "kubernetes.io/role", + "operator": "==", + "value": "agent", + "id": "agent_node_pool" + } + ] + } +] \ No newline at end of file diff --git a/test/code/plugin/health/unit_monitor_spec.rb b/test/code/plugin/health/unit_monitor_spec.rb new file mode 100644 index 000000000..4cbf794db --- 
/dev/null +++ b/test/code/plugin/health/unit_monitor_spec.rb @@ -0,0 +1,20 @@ +require_relative '../../../../source/code/plugin/health/unit_monitor' +require_relative '../test_helpers' + +include HealthModel + +describe "UnitMonitor Spec" do + it "is_aggregate_monitor is false for UnitMonitor" do + # Arrange/Act + monitor = UnitMonitor.new(:monitor_id, :monitor_instance_id, :pass, :time, {}, {}, {}) + # Assert + assert_equal monitor.is_aggregate_monitor, false + end + + it "get_member_monitors is nil for UnitMonitor" do + # Arrange/Act + monitor = UnitMonitor.new(:monitor_id, :monitor_instance_id, :pass, :time, {}, {}, {}) + #Assert + assert_nil monitor.get_member_monitors + end +end \ No newline at end of file diff --git a/test/code/plugin/health/unit_monitor_test.rb b/test/code/plugin/health/unit_monitor_test.rb new file mode 100644 index 000000000..e53617c99 --- /dev/null +++ b/test/code/plugin/health/unit_monitor_test.rb @@ -0,0 +1,16 @@ +require_relative '../../../../source/code/plugin/health/unit_monitor' +require_relative '../test_helpers' + +class UnitMonitorTest < Minitest::Test + include HealthModel + + def test_is_aggregate_monitor_false + monitor = UnitMonitor.new(:monitor_id, :monitor_instance_id, :pass, :time, {}, {}, {}) + assert_equal monitor.is_aggregate_monitor, false + end + + def test_get_member_monitors_nil + monitor = UnitMonitor.new(:monitor_id, :monitor_instance_id, :pass, :time, {}, {}, {}) + assert_nil monitor.get_member_monitors + end +end diff --git a/test/code/plugin/test_helpers.rb b/test/code/plugin/test_helpers.rb new file mode 100644 index 000000000..543f00ac9 --- /dev/null +++ b/test/code/plugin/test_helpers.rb @@ -0,0 +1,3 @@ +gem "minitest" +require "minitest/spec" +require 'minitest/autorun' \ No newline at end of file From 4adcd8bd70f98260e3b6d2b3e5780cbb1d5c71ec Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Thu, 15 Aug 2019 19:03:16 -0700 Subject: [PATCH 111/160] Fix Deserialization Bug (#249) --- 
source/code/plugin/health/cluster_health_state.rb | 8 ++++++-- source/code/plugin/health/health_monitor_state.rb | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/source/code/plugin/health/cluster_health_state.rb b/source/code/plugin/health/cluster_health_state.rb index ac7e05675..3b56dd243 100644 --- a/source/code/plugin/health/cluster_health_state.rb +++ b/source/code/plugin/health/cluster_health_state.rb @@ -16,8 +16,12 @@ def initialize(token_file_path, cert_file_path) @token = get_token end - def update_state(state) + def update_state(state) #state = hash of monitor_instance_id to HealthMonitorInstanceState struct get_request = Net::HTTP::Get.new(@uri.request_uri) + monitor_states_hash = {} + state.each {|monitor_instance_id, health_monitor_instance_state| + monitor_states_hash[monitor_instance_id] = health_monitor_instance_state.to_h + } get_request["Authorization"] = "Bearer #{@token}" @log.info "Making GET request to #{@uri.request_uri} @ #{Time.now.utc.iso8601}" @@ -37,7 +41,7 @@ def update_state(state) update_request["Authorization"] = "Bearer #{@token}" update_request_body = get_update_request_body - update_request_body["state"] = state.to_json + update_request_body["state"] = monitor_states_hash.to_json update_request.body = update_request_body.to_json update_response = @http_client.request(update_request) diff --git a/source/code/plugin/health/health_monitor_state.rb b/source/code/plugin/health/health_monitor_state.rb index c3df5e3a9..e6205b481 100644 --- a/source/code/plugin/health/health_monitor_state.rb +++ b/source/code/plugin/health/health_monitor_state.rb @@ -33,7 +33,7 @@ def to_h def initialize_state(deserialized_state) @@monitor_states = {} deserialized_state.each{|k,v| - health_monitor_instance_state_hash = JSON.parse(v) + health_monitor_instance_state_hash = v state = HealthMonitorInstanceState.new(*health_monitor_instance_state_hash.values_at(*HealthMonitorInstanceState.members)) state.prev_sent_record_time = 
health_monitor_instance_state_hash["prev_sent_record_time"] state.old_state = health_monitor_instance_state_hash["old_state"] From 2ee43076e43e117c8376d576e6aa5ee783a57bcc Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Fri, 16 Aug 2019 09:47:02 -0700 Subject: [PATCH 112/160] Fix the bug where capacity is not updated and cached value was being used (#251) * Fix the Capacity computation * fix node cpu and memory limits calculation --- .../plugin/health/health_monitor_utils.rb | 40 +++++++++---------- source/code/plugin/in_kube_health.rb | 13 +++--- 2 files changed, 27 insertions(+), 26 deletions(-) diff --git a/source/code/plugin/health/health_monitor_utils.rb b/source/code/plugin/health/health_monitor_utils.rb index df47529e6..e9d59941e 100644 --- a/source/code/plugin/health/health_monitor_utils.rb +++ b/source/code/plugin/health/health_monitor_utils.rb @@ -172,35 +172,33 @@ def get_resource_subscription(pod_inventory, metric_name, metric_capacity) return subscription end - def get_cluster_cpu_memory_capacity(log) + def get_cluster_cpu_memory_capacity(log, node_inventory: nil) begin - node_inventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo("nodes").body) + if node_inventory.nil? + node_inventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo("nodes").body) + end cluster_cpu_capacity = 0.0 cluster_memory_capacity = 0.0 if !node_inventory.empty? - node_inventory['items'].each do |node| - cpu_capacity_json = KubernetesApiClient.parseNodeLimits(node_inventory, "capacity", "cpu", "cpuCapacityNanoCores") - if !cpu_capacity_json.nil? - cpu_capacity_json.each do |cpu_capacity_node| - if !cpu_capacity_node['DataItems'][0]['Collections'][0]['Value'].to_s.nil? - cluster_cpu_capacity += cpu_capacity_node['DataItems'][0]['Collections'][0]['Value'] - end + cpu_capacity_json = KubernetesApiClient.parseNodeLimits(node_inventory, "capacity", "cpu", "cpuCapacityNanoCores") + if !cpu_capacity_json.nil? 
+ cpu_capacity_json.each do |cpu_capacity_node| + if !cpu_capacity_node['DataItems'][0]['Collections'][0]['Value'].to_s.nil? + cluster_cpu_capacity += cpu_capacity_node['DataItems'][0]['Collections'][0]['Value'] end - log.info "Cluster CPU Limit #{cluster_cpu_capacity}" - else - log.info "Error getting cpu_capacity" end - memory_capacity_json = KubernetesApiClient.parseNodeLimits(node_inventory, "capacity", "memory", "memoryCapacityBytes") - if !memory_capacity_json.nil? - memory_capacity_json.each do |memory_capacity_node| - if !memory_capacity_node['DataItems'][0]['Collections'][0]['Value'].to_s.nil? - cluster_memory_capacity += memory_capacity_node['DataItems'][0]['Collections'][0]['Value'] - end + else + log.info "Error getting cpu_capacity" + end + memory_capacity_json = KubernetesApiClient.parseNodeLimits(node_inventory, "capacity", "memory", "memoryCapacityBytes") + if !memory_capacity_json.nil? + memory_capacity_json.each do |memory_capacity_node| + if !memory_capacity_node['DataItems'][0]['Collections'][0]['Value'].to_s.nil? + cluster_memory_capacity += memory_capacity_node['DataItems'][0]['Collections'][0]['Value'] end - log.info "Cluster Memory Limit #{cluster_memory_capacity}" - else - log.info "Error getting memory_capacity" end + else + log.info "Error getting memory_capacity" end else log.info "Unable to get cpu and memory capacity" diff --git a/source/code/plugin/in_kube_health.rb b/source/code/plugin/in_kube_health.rb index d9672da3b..045ddf7c7 100644 --- a/source/code/plugin/in_kube_health.rb +++ b/source/code/plugin/in_kube_health.rb @@ -104,9 +104,9 @@ def enumerate end if !pod_inventory.nil? 
- record = process_cpu_oversubscribed_monitor(pod_inventory) + record = process_cpu_oversubscribed_monitor(pod_inventory, node_inventory) health_monitor_records.push(record) if record - record = process_memory_oversubscribed_monitor(pod_inventory) + record = process_memory_oversubscribed_monitor(pod_inventory, node_inventory) health_monitor_records.push(record) if record pods_ready_hash = HealthMonitorUtils.get_pods_ready_hash(pod_inventory, deployment_inventory) @@ -146,11 +146,12 @@ def enumerate end end - def process_cpu_oversubscribed_monitor(pod_inventory) + def process_cpu_oversubscribed_monitor(pod_inventory, node_inventory) timestamp = Time.now.utc.iso8601 + @@clusterCpuCapacity = HealthMonitorUtils.get_cluster_cpu_memory_capacity(@@hmlog, node_inventory: node_inventory)[0] subscription = HealthMonitorUtils.get_resource_subscription(pod_inventory,"cpu", @@clusterCpuCapacity) + @@hmlog.info "Refreshed Cluster CPU Capacity #{@@clusterCpuCapacity}" state = subscription > @@clusterCpuCapacity ? "fail" : "pass" - #@@hmlog.debug "CPU Oversubscribed Monitor State : #{state}" #CPU monitor_id = HealthMonitorConstants::WORKLOAD_CPU_OVERSUBSCRIBED_MONITOR_ID @@ -171,8 +172,10 @@ def process_cpu_oversubscribed_monitor(pod_inventory) return health_record end - def process_memory_oversubscribed_monitor(pod_inventory) + def process_memory_oversubscribed_monitor(pod_inventory, node_inventory) timestamp = Time.now.utc.iso8601 + @@clusterMemoryCapacity = HealthMonitorUtils.get_cluster_cpu_memory_capacity(@@hmlog,node_inventory: node_inventory)[1] + @@hmlog.info "Refreshed Cluster Memory Capacity #{@@clusterMemoryCapacity}" subscription = HealthMonitorUtils.get_resource_subscription(pod_inventory,"memory", @@clusterMemoryCapacity) state = subscription > @@clusterMemoryCapacity ? 
"fail" : "pass" #@@hmlog.debug "Memory Oversubscribed Monitor State : #{state}" From e86f82f4aa0587532201d559d99bce537cb6e837 Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Fri, 16 Aug 2019 13:58:08 -0700 Subject: [PATCH 113/160] changes (#250) --- source/code/plugin/DockerApiClient.rb | 44 ++++++++++++--------- source/code/plugin/in_containerinventory.rb | 3 +- 2 files changed, 28 insertions(+), 19 deletions(-) diff --git a/source/code/plugin/DockerApiClient.rb b/source/code/plugin/DockerApiClient.rb index eb9d74531..ee2742dd4 100644 --- a/source/code/plugin/DockerApiClient.rb +++ b/source/code/plugin/DockerApiClient.rb @@ -44,7 +44,11 @@ def getResponse(request, isMultiJson, isVersion) return (isTimeOut) ? nil : parseResponse(dockerResponse, isMultiJson) rescue => errorStr $log.warn("Socket call failed for request: #{request} error: #{errorStr} , isMultiJson: #{isMultiJson} @ #{Time.now.utc.iso8601}") - ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + # Adding this check to avoid an infinite loop for the docker info call in exception telemetry + if !request.include? "GET /version " + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + return nil end end @@ -80,28 +84,32 @@ def getDockerHostName() def listContainers() ids = [] - request = DockerApiRestHelper.restDockerPs - containers = getResponse(request, true, false) - if !containers.nil? && !containers.empty? - containers.each do |container| - labels = (!container["Labels"].nil?) ? container["Labels"] : container["labels"] - if !labels.nil? - labelKeys = labels.keys - dockerTypeLabel = labelKeys.find { |k| "io.kubernetes.docker.type".downcase == k.downcase } - if !dockerTypeLabel.nil? 
- dockerTypeLabelValue = labels[dockerTypeLabel] - # Checking for 'io.kubernetes.docker.type' label for docker containers to exclude the pause-amd64 containers - if !(dockerTypeLabelValue.downcase == "podsandbox".downcase) - # Case insensitive lookup for pod uid label - This is to exclude containers created using docker run and only include containers that - # are created in the pods for ContainerInventory - keyValue = labelKeys.find { |k| "io.kubernetes.pod.uid".downcase == k.downcase } - if !labels[keyValue].nil? - ids.push(container["Id"]) + begin + request = DockerApiRestHelper.restDockerPs + containers = getResponse(request, true, false) + if !containers.nil? && !containers.empty? + containers.each do |container| + labels = (!container["Labels"].nil?) ? container["Labels"] : container["labels"] + if !labels.nil? + labelKeys = labels.keys + dockerTypeLabel = labelKeys.find { |k| "io.kubernetes.docker.type".downcase == k.downcase } + if !dockerTypeLabel.nil? + dockerTypeLabelValue = labels[dockerTypeLabel] + # Checking for 'io.kubernetes.docker.type' label for docker containers to exclude the pause-amd64 containers + if !(dockerTypeLabelValue.downcase == "podsandbox".downcase) + # Case insensitive lookup for pod uid label - This is to exclude containers created using docker run and only include containers that + # are created in the pods for ContainerInventory + keyValue = labelKeys.find { |k| "io.kubernetes.pod.uid".downcase == k.downcase } + if !labels[keyValue].nil? 
+ ids.push(container["Id"]) + end end end end end end + rescue => errorStr + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) end return ids end diff --git a/source/code/plugin/in_containerinventory.rb b/source/code/plugin/in_containerinventory.rb index 4392de280..ccf61ab2e 100644 --- a/source/code/plugin/in_containerinventory.rb +++ b/source/code/plugin/in_containerinventory.rb @@ -198,7 +198,7 @@ def enumerate hostname = DockerApiClient.getDockerHostName begin containerIds = DockerApiClient.listContainers - if !containerIds.empty? + if !containerIds.nil? && !containerIds.empty? eventStream = MultiEventStream.new nameMap = DockerApiClient.getImageIdMap clusterCollectEnvironmentVar = ENV["AZMON_CLUSTER_COLLECT_ENV_VAR"] @@ -252,6 +252,7 @@ def enumerate end rescue => errorStr $log.warn("Exception in enumerate container inventory: #{errorStr}") + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) end end From c76ce47887cd7ac155c1651ebf8db233805481cf Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Fri, 16 Aug 2019 14:32:36 -0700 Subject: [PATCH 114/160] Added new Custom Metrics Regions, fixed MDM plugin crash bug (#253) Added new regions, added handler for MDM plugin start --- installer/conf/container.conf | 2 +- installer/conf/kube.conf | 4 +-- source/code/plugin/out_mdm.rb | 53 ++++++++++++++++++----------------- 3 files changed, 30 insertions(+), 29 deletions(-) diff --git a/installer/conf/container.conf b/installer/conf/container.conf index 6d810a0e2..4cb9e6913 100755 --- a/installer/conf/container.conf +++ b/installer/conf/container.conf @@ -32,7 +32,7 @@ #custom_metrics_mdm filter plugin type filter_cadvisor2mdm - custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope + custom_metrics_azure_regions 
eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,memoryRssBytes log_level info diff --git a/installer/conf/kube.conf b/installer/conf/kube.conf index 4b4ec09ea..3cbc3ff17 100644 --- a/installer/conf/kube.conf +++ b/installer/conf/kube.conf @@ -70,14 +70,14 @@ type filter_inventory2mdm - custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westEurope + custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral log_level info #custom_metrics_mdm filter plugin for perf data from windows nodes type filter_cadvisor2mdm - custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westEurope + custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes log_level info diff --git a/source/code/plugin/out_mdm.rb b/source/code/plugin/out_mdm.rb index 69ef25580..4b9d50a29 100644 --- a/source/code/plugin/out_mdm.rb +++ b/source/code/plugin/out_mdm.rb @@ -44,37 +44,38 @@ def start super begin file = File.read(@@azure_json_path) + @data_hash = JSON.parse(file) + aks_resource_id = ENV["AKS_RESOURCE_ID"] + aks_region = ENV["AKS_REGION"] + + if aks_resource_id.to_s.empty? + @log.info "Environment Variable AKS_RESOURCE_ID is not set.. " + @can_send_data_to_mdm = false + end + if aks_region.to_s.empty? + @log.info "Environment Variable AKS_REGION is not set.. 
" + @can_send_data_to_mdm = false + end + aks_region = aks_region.gsub(" ","") + + if @can_send_data_to_mdm + @log.info "MDM Metrics supported in #{aks_region} region" + @token_url = @@token_url_template % {tenant_id: @data_hash["tenantId"]} + @cached_access_token = get_access_token + @@post_request_url = @@post_request_url_template % {aks_region: aks_region, aks_resource_id: aks_resource_id} + @post_request_uri = URI.parse(@@post_request_url) + @http_client = Net::HTTP.new(@post_request_uri.host, @post_request_uri.port) + @http_client.use_ssl = true + @log.info "POST Request url: #{@@post_request_url}" + ApplicationInsightsUtility.sendCustomEvent("AKSCustomMetricsMDMPluginStart", {}) + end rescue => e - @log.info "Unable to read file #{@@azure_json_path} #{e}" + @log.info "exception when initializing out_mdm #{e}" + ApplicationInsightsUtility.sendExceptionTelemetry(e, {"FeatureArea" => "MDM"}) @can_send_data_to_mdm = false return end - # Handle the case where the file read fails. Send Telemetry and exit the plugin? - @data_hash = JSON.parse(file) - @token_url = @@token_url_template % {tenant_id: @data_hash["tenantId"]} - @cached_access_token = get_access_token - aks_resource_id = ENV["AKS_RESOURCE_ID"] - aks_region = ENV["AKS_REGION"] - - if aks_resource_id.to_s.empty? - @log.info "Environment Variable AKS_RESOURCE_ID is not set.. " - @can_send_data_to_mdm = false - return - end - if aks_region.to_s.empty? - @log.info "Environment Variable AKS_REGION is not set.. 
" - @can_send_data_to_mdm = false - return - end - - aks_region = aks_region.gsub(" ","") - @@post_request_url = @@post_request_url_template % {aks_region: aks_region, aks_resource_id: aks_resource_id} - @post_request_uri = URI.parse(@@post_request_url) - @http_client = Net::HTTP.new(@post_request_uri.host, @post_request_uri.port) - @http_client.use_ssl = true - @log.info "POST Request url: #{@@post_request_url}" - ApplicationInsightsUtility.sendCustomEvent("AKSCustomMetricsMDMPluginStart", {}) end # get the access token only if the time to expiry is less than 5 minutes From 10a79c8c5546fcbcf21532594b6d25f4e269e76b Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Mon, 19 Aug 2019 12:54:53 -0700 Subject: [PATCH 115/160] Add Missing Handlers (#254) * Added Missing Handlers --- .../plugin/filter_cadvisor_health_node.rb | 32 ++++++++++++------- .../plugin/health/health_monitor_utils.rb | 2 +- source/code/plugin/in_kube_health.rb | 20 +++++++----- 3 files changed, 33 insertions(+), 21 deletions(-) diff --git a/source/code/plugin/filter_cadvisor_health_node.rb b/source/code/plugin/filter_cadvisor_health_node.rb index 627a525e7..ce57c2c62 100644 --- a/source/code/plugin/filter_cadvisor_health_node.rb +++ b/source/code/plugin/filter_cadvisor_health_node.rb @@ -47,21 +47,29 @@ def initialize end def configure(conf) - super - @log = HealthMonitorUtils.get_log_handle - @log.debug {'Starting filter_cadvisor2health plugin'} + begin + super + @log = HealthMonitorUtils.get_log_handle + @log.debug {'Starting filter_cadvisor2health plugin'} + rescue => e + ApplicationInsightsUtility.sendExceptionTelemetry(e, {"FeatureArea" => "Health"}) + end end def start - super - @metrics_to_collect_hash = HealthMonitorUtils.build_metrics_hash(@metrics_to_collect) - @log.debug "Calling ensure_cpu_memory_capacity_set cpu_capacity #{@cpu_capacity} memory_capacity #{@memory_capacity}" - node_capacity = HealthMonitorUtils.ensure_cpu_memory_capacity_set(@@hm_log, @cpu_capacity, 
@memory_capacity, @@hostName) - @cpu_capacity = node_capacity[0] - @memory_capacity = node_capacity[1] - @log.info "CPU Capacity #{@cpu_capacity} Memory Capacity #{@memory_capacity}" - #HealthMonitorUtils.refresh_kubernetes_api_data(@log, @@hostName) - ApplicationInsightsUtility.sendCustomEvent("filter_cadvisor_health Plugin Start", {}) + begin + super + @metrics_to_collect_hash = HealthMonitorUtils.build_metrics_hash(@metrics_to_collect) + @log.debug "Calling ensure_cpu_memory_capacity_set cpu_capacity #{@cpu_capacity} memory_capacity #{@memory_capacity}" + node_capacity = HealthMonitorUtils.ensure_cpu_memory_capacity_set(@@hm_log, @cpu_capacity, @memory_capacity, @@hostName) + @cpu_capacity = node_capacity[0] + @memory_capacity = node_capacity[1] + @log.info "CPU Capacity #{@cpu_capacity} Memory Capacity #{@memory_capacity}" + #HealthMonitorUtils.refresh_kubernetes_api_data(@log, @@hostName) + ApplicationInsightsUtility.sendCustomEvent("filter_cadvisor_health Plugin Start", {}) + rescue => e + ApplicationInsightsUtility.sendExceptionTelemetry(e, {"FeatureArea" => "Health"}) + end end def filter_stream(tag, es) diff --git a/source/code/plugin/health/health_monitor_utils.rb b/source/code/plugin/health/health_monitor_utils.rb index e9d59941e..b1c77a4a1 100644 --- a/source/code/plugin/health/health_monitor_utils.rb +++ b/source/code/plugin/health/health_monitor_utils.rb @@ -13,7 +13,7 @@ class HealthMonitorUtils $log.info "Error loading KubernetesApiClient #{e.message}" end - @@node_inventory = [] + @@nodeInventory = {} @log_path = "/var/opt/microsoft/docker-cimprov/log/health_monitors.log" diff --git a/source/code/plugin/in_kube_health.rb b/source/code/plugin/in_kube_health.rb index 045ddf7c7..199e03e56 100644 --- a/source/code/plugin/in_kube_health.rb +++ b/source/code/plugin/in_kube_health.rb @@ -18,14 +18,18 @@ class KubeHealthInput < Input @@clusterMemoryCapacity = 0.0 def initialize - super - require "yaml" - require "json" - - @@cluster_id = 
KubernetesApiClient.getClusterId - @resources = HealthKubernetesResources.instance - @provider = HealthMonitorProvider.new(@@cluster_id, HealthMonitorUtils.get_cluster_labels, @resources, @health_monitor_config_path) - @@cluster_health_model_enabled = HealthMonitorUtils.is_cluster_health_model_enabled + begin + super + require "yaml" + require "json" + + @@cluster_id = KubernetesApiClient.getClusterId + @resources = HealthKubernetesResources.instance + @provider = HealthMonitorProvider.new(@@cluster_id, HealthMonitorUtils.get_cluster_labels, @resources, @health_monitor_config_path) + @@cluster_health_model_enabled = HealthMonitorUtils.is_cluster_health_model_enabled + rescue => e + ApplicationInsightsUtility.sendExceptionTelemetry(e, {"FeatureArea" => "Health"}) + end end include HealthModel From 851ab4ec7a2062a85c5f58e1642971d8580f16aa Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Wed, 21 Aug 2019 14:03:52 -0700 Subject: [PATCH 116/160] Return MultiEventStream.new instead of empty array (#256) --- source/code/plugin/filter_cadvisor_health_node.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/code/plugin/filter_cadvisor_health_node.rb b/source/code/plugin/filter_cadvisor_health_node.rb index ce57c2c62..faa574993 100644 --- a/source/code/plugin/filter_cadvisor_health_node.rb +++ b/source/code/plugin/filter_cadvisor_health_node.rb @@ -75,7 +75,7 @@ def start def filter_stream(tag, es) if !@@cluster_health_model_enabled @log.info "Cluster Health Model disabled in filter_cadvisor_health_node" - return [] + return MultiEventStream.new end new_es = MultiEventStream.new #HealthMonitorUtils.refresh_kubernetes_api_data(@log, @hostName) From f20debb244c5ec2b9eba23e0588520dbe7a4490b Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Thu, 22 Aug 2019 17:58:17 -0700 Subject: [PATCH 117/160] Added explicit require_relative to avoid loading errors (#258) * Adding explicit require_relative --- 
source/code/plugin/health/agg_monitor_id_labels.rb | 2 ++ source/code/plugin/health/health_kube_api_down_handler.rb | 1 + source/code/plugin/health/health_kubernetes_resources.rb | 1 + source/code/plugin/health/health_missing_signal_generator.rb | 3 +++ source/code/plugin/health/health_model_builder.rb | 1 - source/code/plugin/health/health_monitor_helpers.rb | 1 + source/code/plugin/health/health_monitor_provider.rb | 2 ++ source/code/plugin/health/health_monitor_state.rb | 2 ++ source/code/plugin/health/health_monitor_utils.rb | 1 + source/code/plugin/health/health_signal_reducer.rb | 2 ++ source/code/plugin/health/monitor_factory.rb | 3 +++ source/code/plugin/health/node_monitor_hierarchy_reducer.rb | 1 + source/code/plugin/health/parent_monitor_provider.rb | 1 + source/code/plugin/health/unit_monitor.rb | 1 - 14 files changed, 20 insertions(+), 2 deletions(-) diff --git a/source/code/plugin/health/agg_monitor_id_labels.rb b/source/code/plugin/health/agg_monitor_id_labels.rb index 48ca46184..86a3381cd 100644 --- a/source/code/plugin/health/agg_monitor_id_labels.rb +++ b/source/code/plugin/health/agg_monitor_id_labels.rb @@ -1,3 +1,5 @@ +require_relative 'health_model_constants' + module HealthModel class AggregateMonitorInstanceIdLabels @@id_labels_mapping = { diff --git a/source/code/plugin/health/health_kube_api_down_handler.rb b/source/code/plugin/health/health_kube_api_down_handler.rb index 7f7ba1bd3..7f72360f8 100644 --- a/source/code/plugin/health/health_kube_api_down_handler.rb +++ b/source/code/plugin/health/health_kube_api_down_handler.rb @@ -1,3 +1,4 @@ +require_relative 'health_model_constants' module HealthModel class HealthKubeApiDownHandler def initialize diff --git a/source/code/plugin/health/health_kubernetes_resources.rb b/source/code/plugin/health/health_kubernetes_resources.rb index 53f879bf5..2f591722b 100644 --- a/source/code/plugin/health/health_kubernetes_resources.rb +++ b/source/code/plugin/health/health_kubernetes_resources.rb @@ -1,4 
+1,5 @@ require 'singleton' +require_relative 'health_model_constants' module HealthModel class HealthKubernetesResources diff --git a/source/code/plugin/health/health_missing_signal_generator.rb b/source/code/plugin/health/health_missing_signal_generator.rb index ff7f6a390..419680afa 100644 --- a/source/code/plugin/health/health_missing_signal_generator.rb +++ b/source/code/plugin/health/health_missing_signal_generator.rb @@ -1,3 +1,6 @@ +require_relative 'health_model_constants' +require_relative 'health_monitor_record' + module HealthModel class HealthMissingSignalGenerator attr_accessor :last_received_records, :current_received_records diff --git a/source/code/plugin/health/health_model_builder.rb b/source/code/plugin/health/health_model_builder.rb index 4cf802798..13813c8d9 100644 --- a/source/code/plugin/health/health_model_builder.rb +++ b/source/code/plugin/health/health_model_builder.rb @@ -1,4 +1,3 @@ -require_relative 'health_model_constants' require 'time' module HealthModel diff --git a/source/code/plugin/health/health_monitor_helpers.rb b/source/code/plugin/health/health_monitor_helpers.rb index 9e2977a0e..9f0315978 100644 --- a/source/code/plugin/health/health_monitor_helpers.rb +++ b/source/code/plugin/health/health_monitor_helpers.rb @@ -1,5 +1,6 @@ require 'logger' require 'digest' +require_relative 'health_model_constants' module HealthModel # static class that provides a bunch of utility methods diff --git a/source/code/plugin/health/health_monitor_provider.rb b/source/code/plugin/health/health_monitor_provider.rb index 0c1cbf7f2..60ad69d76 100644 --- a/source/code/plugin/health/health_monitor_provider.rb +++ b/source/code/plugin/health/health_monitor_provider.rb @@ -1,3 +1,5 @@ +require_relative 'health_model_constants' + module HealthModel class HealthMonitorProvider diff --git a/source/code/plugin/health/health_monitor_state.rb b/source/code/plugin/health/health_monitor_state.rb index e6205b481..498c75ec7 100644 --- 
a/source/code/plugin/health/health_monitor_state.rb +++ b/source/code/plugin/health/health_monitor_state.rb @@ -1,3 +1,5 @@ +require_relative 'health_model_constants' + module HealthModel HealthMonitorInstanceState = Struct.new(:prev_sent_record_time, :old_state, :new_state, :state_change_time, :prev_records, :is_state_change_consistent, :should_send) do diff --git a/source/code/plugin/health/health_monitor_utils.rb b/source/code/plugin/health/health_monitor_utils.rb index b1c77a4a1..e707651dc 100644 --- a/source/code/plugin/health/health_monitor_utils.rb +++ b/source/code/plugin/health/health_monitor_utils.rb @@ -1,5 +1,6 @@ require 'logger' require 'digest' +require_relative 'health_model_constants' module HealthModel # static class that provides a bunch of utility methods diff --git a/source/code/plugin/health/health_signal_reducer.rb b/source/code/plugin/health/health_signal_reducer.rb index 4cf53e82c..1d520da8d 100644 --- a/source/code/plugin/health/health_signal_reducer.rb +++ b/source/code/plugin/health/health_signal_reducer.rb @@ -1,3 +1,5 @@ +require_relative 'health_model_constants' + module HealthModel # this class # 1. 
dedupes daemonset signals and takes only the latest diff --git a/source/code/plugin/health/monitor_factory.rb b/source/code/plugin/health/monitor_factory.rb index e6ec9d2c3..5f2c3945c 100644 --- a/source/code/plugin/health/monitor_factory.rb +++ b/source/code/plugin/health/monitor_factory.rb @@ -1,3 +1,6 @@ +require_relative 'aggregate_monitor' +require_relative 'unit_monitor' + module HealthModel class MonitorFactory diff --git a/source/code/plugin/health/node_monitor_hierarchy_reducer.rb b/source/code/plugin/health/node_monitor_hierarchy_reducer.rb index aafbd07a8..0bad4517e 100644 --- a/source/code/plugin/health/node_monitor_hierarchy_reducer.rb +++ b/source/code/plugin/health/node_monitor_hierarchy_reducer.rb @@ -1,4 +1,5 @@ # frozen_string_literal: true +require_relative 'health_model_constants' module HealthModel class NodeMonitorHierarchyReducer diff --git a/source/code/plugin/health/parent_monitor_provider.rb b/source/code/plugin/health/parent_monitor_provider.rb index 6a27f11d8..4577abb99 100644 --- a/source/code/plugin/health/parent_monitor_provider.rb +++ b/source/code/plugin/health/parent_monitor_provider.rb @@ -1,3 +1,4 @@ +require_relative 'health_model_constants' module HealthModel class ParentMonitorProvider diff --git a/source/code/plugin/health/unit_monitor.rb b/source/code/plugin/health/unit_monitor.rb index 9af599321..64262aa2e 100644 --- a/source/code/plugin/health/unit_monitor.rb +++ b/source/code/plugin/health/unit_monitor.rb @@ -1,4 +1,3 @@ -require_relative 'health_model_constants' require 'json' module HealthModel From a8804df7c0ccc645dc8f51ea8fbf1f9431c13957 Mon Sep 17 00:00:00 2001 From: ganga1980 Date: Wed, 28 Aug 2019 11:38:36 -0700 Subject: [PATCH 118/160] Gangams/enable ai telemetry in mc (#252) * enable ai telemetry to configure different ikey and endpoint per cloud --- source/code/go/src/plugins/telemetry.go | 11 ++++++++++- source/code/plugin/ApplicationInsightsUtility.rb | 13 ++++++++++++- 2 files changed, 22 insertions(+), 2 
deletions(-) diff --git a/source/code/go/src/plugins/telemetry.go b/source/code/go/src/plugins/telemetry.go index 5fc0fa843..4f22b8c03 100644 --- a/source/code/go/src/plugins/telemetry.go +++ b/source/code/go/src/plugins/telemetry.go @@ -42,6 +42,7 @@ const ( envAKSResourceID = "AKS_RESOURCE_ID" envACSResourceName = "ACS_RESOURCE_NAME" envAppInsightsAuth = "APPLICATIONINSIGHTS_AUTH" + envAppInsightsEndpoint = "APPLICATIONINSIGHTS_ENDPOINT" metricNameAvgFlushRate = "ContainerLogAvgRecordsFlushedPerSec" metricNameAvgLogGenerationRate = "ContainerLogsGeneratedPerSec" metricNameLogSize = "ContainerLogsSize" @@ -141,7 +142,15 @@ func InitializeTelemetryClient(agentVersion string) (int, error) { return -1, err } - TelemetryClient = appinsights.NewTelemetryClient(string(decIkey)) + appInsightsEndpoint := os.Getenv(envAppInsightsEndpoint) + telemetryClientConfig := appinsights.NewTelemetryConfiguration(string(decIkey)) + // endpoint override required only for sovereign clouds + if appInsightsEndpoint != "" { + Log("Overriding the default AppInsights EndpointUrl with %s", appInsightsEndpoint) + telemetryClientConfig.EndpointUrl = envAppInsightsEndpoint + } + TelemetryClient = appinsights.NewTelemetryClientFromConfig(telemetryClientConfig) + telemetryOffSwitch := os.Getenv("DISABLE_TELEMETRY") if strings.Compare(strings.ToLower(telemetryOffSwitch), "true") == 0 { Log("Appinsights telemetry is disabled \n") diff --git a/source/code/plugin/ApplicationInsightsUtility.rb b/source/code/plugin/ApplicationInsightsUtility.rb index 5dc2bfab8..bb4831701 100644 --- a/source/code/plugin/ApplicationInsightsUtility.rb +++ b/source/code/plugin/ApplicationInsightsUtility.rb @@ -18,6 +18,7 @@ class ApplicationInsightsUtility @@EnvAksRegion = "AKS_REGION" @@EnvAgentVersion = "AGENT_VERSION" @@EnvApplicationInsightsKey = "APPLICATIONINSIGHTS_AUTH" + @@EnvApplicationInsightsEndpoint = "APPLICATIONINSIGHTS_ENDPOINT" @@EnvControllerType = "CONTROLLER_TYPE" @@CustomProperties = {} @@ -62,6 +63,7 
@@ def initializeUtility() @@CustomProperties["AgentVersion"] = ENV[@@EnvAgentVersion] @@CustomProperties["ControllerType"] = ENV[@@EnvControllerType] encodedAppInsightsKey = ENV[@@EnvApplicationInsightsKey] + appInsightsEndpoint = ENV[@@EnvApplicationInsightsEndpoint] #Check if telemetry is turned off telemetryOffSwitch = ENV["DISABLE_TELEMETRY"] @@ -70,7 +72,16 @@ def initializeUtility() @@Tc = ApplicationInsights::TelemetryClient.new elsif !encodedAppInsightsKey.nil? decodedAppInsightsKey = Base64.decode64(encodedAppInsightsKey) - @@Tc = ApplicationInsights::TelemetryClient.new decodedAppInsightsKey + #override ai endpoint if its available otherwise use default. + if appInsightsEndpoint && !appInsightsEndpoint.nil? && !appInsightsEndpoint.empty? + $log.info("AppInsightsUtility: Telemetry client uses overrided endpoint url : #{appInsightsEndpoint}") + telemetrySynchronousSender = ApplicationInsights::Channel::SynchronousSender.new appInsightsEndpoint + telemetrySynchronousQueue = ApplicationInsights::Channel::SynchronousQueue.new(telemetrySynchronousSender) + telemetryChannel = ApplicationInsights::Channel::TelemetryChannel.new nil, telemetrySynchronousQueue + @@Tc = ApplicationInsights::TelemetryClient.new decodedAppInsightsKey, telemetryChannel + else + @@Tc = ApplicationInsights::TelemetryClient.new decodedAppInsightsKey + end end rescue => errorStr $log.warn("Exception in AppInsightsUtility: initilizeUtility - error: #{errorStr}") From 8a5ebb037025fcf9576c9e3f92bc5614638ec548 Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Tue, 10 Sep 2019 15:18:45 -0700 Subject: [PATCH 119/160] Fixing null check out_mdm bug, tomlparser bug, exposing Replica Set service name as an ENV variable (#261) * Expose replica set service as an env variable * Fixing null check out_mdm bug, and tomlparser bug * Updating the env variable name to be more specific to health model --- installer/conf/container.conf | 2 +- installer/scripts/tomlparser.rb | 11 ++++++++--- 
source/code/plugin/out_mdm.rb | 3 ++- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/installer/conf/container.conf b/installer/conf/container.conf index 4cb9e6913..0b26357f0 100755 --- a/installer/conf/container.conf +++ b/installer/conf/container.conf @@ -76,7 +76,7 @@ heartbeat_type tcp - host healthmodel-replicaset-service.kube-system + host "#{ENV['HEALTHMODEL_REPLICASET_SERVICE_ENDPOINT']}" port 25227 diff --git a/installer/scripts/tomlparser.rb b/installer/scripts/tomlparser.rb index 067586629..b66e1257e 100644 --- a/installer/scripts/tomlparser.rb +++ b/installer/scripts/tomlparser.rb @@ -16,6 +16,7 @@ @logTailPath = "/var/log/containers/*.log" @logExclusionRegexPattern = "(^((?!stdout|stderr).)*$)" @excludePath = "*.csv2" #some invalid path +@enable_health_model = false # Use parser to parse the configmap toml file to a ruby structure def parseConfigMap(path) @@ -121,10 +122,12 @@ def populateSettingValuesFromConfigMap(parsedConfig) end begin - if !parsedConfig.nil? && !parsedConfig[:agent_settings][:health_model].nil? && !parsedConfig[:agent_settings][:health_model][:enabled].nil? + if !parsedConfig.nil? && !parsedConfig[:agent_settings].nil? && !parsedConfig[:agent_settings][:health_model].nil? && !parsedConfig[:agent_settings][:health_model][:enabled].nil? @enable_health_model = parsedConfig[:agent_settings][:health_model][:enabled] - puts "enable_health_model = #{@enable_health_model}" + else + @enable_health_model = false end + puts "enable_health_model = #{@enable_health_model}" rescue => errorStr puts "config::error:Exception while reading config settings for health_model enabled setting - #{errorStr}, using defaults" @enable_health_model = false @@ -140,7 +143,9 @@ def populateSettingValuesFromConfigMap(parsedConfig) Dir["/etc/config/settings/*settings"].each{|file| puts "Parsing File #{file}" settings = parseConfigMap(file) - configMapSettings = configMapSettings.merge(settings) + if !settings.nil? 
+ configMapSettings = configMapSettings.merge(settings) + end } if !configMapSettings.nil? diff --git a/source/code/plugin/out_mdm.rb b/source/code/plugin/out_mdm.rb index 4b9d50a29..b8d10090d 100644 --- a/source/code/plugin/out_mdm.rb +++ b/source/code/plugin/out_mdm.rb @@ -55,8 +55,9 @@ def start if aks_region.to_s.empty? @log.info "Environment Variable AKS_REGION is not set.. " @can_send_data_to_mdm = false + else + aks_region = aks_region.gsub(" ","") end - aks_region = aks_region.gsub(" ","") if @can_send_data_to_mdm @log.info "MDM Metrics supported in #{aks_region} region" From a939bf796ce2bc420d4862399d1312aa5e572e9e Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Tue, 10 Sep 2019 17:18:48 -0700 Subject: [PATCH 120/160] Changes for creating custom plugins with namespace settings for prometheus scraping (#262) * changes * changes * changes * changes * changes * changes * chnages * changes * telemetry changes * changes --- installer/conf/telegraf-rs.conf | 3 +- .../scripts/tomlparser-prom-customconfig.rb | 76 +++++++++++++++++-- source/code/plugin/in_kube_nodes.rb | 2 + 3 files changed, 75 insertions(+), 6 deletions(-) diff --git a/installer/conf/telegraf-rs.conf b/installer/conf/telegraf-rs.conf index ce60bfa04..3450ab88f 100644 --- a/installer/conf/telegraf-rs.conf +++ b/installer/conf/telegraf-rs.conf @@ -552,7 +552,7 @@ ## set this to `https` & most likely set the tls config. ## - prometheus.io/path: If the metrics path is not /metrics, define it with this annotation. 
## - prometheus.io/port: If port is not 9102 use this annotation - monitor_kubernetes_pods = $AZMON_RS_PROM_MONITOR_PODS + $AZMON_RS_PROM_MONITOR_PODS fieldpass = $AZMON_RS_PROM_FIELDPASS fielddrop = $AZMON_RS_PROM_FIELDDROP @@ -579,6 +579,7 @@ insecure_skip_verify = true #tagexclude = ["AgentVersion","AKS_RESOURCE_ID","ACS_RESOURCE_NAME", "Region", "ClusterName", "ClusterType", "Computer", "ControllerType"] +$AZMON_RS_PROM_PLUGINS_WITH_NAMESPACE_FILTER # [[inputs.exec]] # ## Commands array # interval = "15m" diff --git a/installer/scripts/tomlparser-prom-customconfig.rb b/installer/scripts/tomlparser-prom-customconfig.rb index d9fdf1cc2..d44bf3342 100644 --- a/installer/scripts/tomlparser-prom-customconfig.rb +++ b/installer/scripts/tomlparser-prom-customconfig.rb @@ -18,6 +18,14 @@ @defaultRsK8sServices = [] @defaultRsMonitorPods = false +#Configurations to be used for the auto-generated input prometheus plugins for namespace filtering +@metricVersion = 2 +@urlTag = "scrapeUrl" +@bearerToken = "/var/run/secrets/kubernetes.io/serviceaccount/token" +@responseTimeout = "15s" +@tlsCa = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" +@insecureSkipVerify = true + # Use parser to parse the configmap toml file to a ruby structure def parseConfigMap begin @@ -53,6 +61,48 @@ def checkForType(variable, varType) end end +def replaceDefaultMonitorPodSettings(new_contents, monitorKubernetesPods) + begin + new_contents = new_contents.gsub("$AZMON_RS_PROM_MONITOR_PODS", ("monitor_kubernetes_pods = #{monitorKubernetesPods}")) + new_contents = new_contents.gsub("$AZMON_RS_PROM_PLUGINS_WITH_NAMESPACE_FILTER", "") + rescue => errorStr + puts "config::error::Exception while replacing default pod monitor settings: #{errorStr}" + end + return new_contents +end + +def createPrometheusPluginsWithNamespaceSetting(monitorKubernetesPods, monitorKubernetesPodsNamespaces, new_contents, interval, fieldPassSetting, fieldDropSetting) + begin + new_contents = 
new_contents.gsub("$AZMON_RS_PROM_MONITOR_PODS", "# Commenting this out since new plugins will be created per namespace\n # $AZMON_RS_PROM_MONITOR_PODS") + pluginConfigsWithNamespaces = "" + monitorKubernetesPodsNamespaces.each do |namespace| + if !namespace.nil? + #Stripping namespaces to remove leading and trailing whitespaces + namespace.strip! + if namespace.length > 0 + pluginConfigsWithNamespaces += "\n[[inputs.prometheus]] + interval = \"#{interval}\" + monitor_kubernetes_pods = true + monitor_kubernetes_pods_namespace = \"#{namespace}\" + fieldpass = #{fieldPassSetting} + fielddrop = #{fieldDropSetting} + metric_version = #{@metricVersion} + url_tag = \"#{@urlTag}\" + bearer_token = \"#{@bearerToken}\" + response_timeout = \"#{@responseTimeout}\" + tls_ca = \"#{@tlsCa}\" + insecure_skip_verify = #{@insecureSkipVerify}\n" + end + end + end + new_contents = new_contents.gsub("$AZMON_RS_PROM_PLUGINS_WITH_NAMESPACE_FILTER", pluginConfigsWithNamespaces) + return new_contents + rescue => errorStr + puts "config::error::Exception while creating prometheus input plugins to filter namespaces: #{errorStr}, using defaults" + replaceDefaultMonitorPodSettings(new_contents, monitorKubernetesPods) + end +end + # Use the ruby structure created after config parsing to set the right values to be used as environment variables def populateSettingValuesFromConfigMap(parsedConfig) # Checking to see if this is the daemonset or replicaset to parse config accordingly @@ -68,6 +118,7 @@ def populateSettingValuesFromConfigMap(parsedConfig) urls = parsedConfig[:prometheus_data_collection_settings][:cluster][:urls] kubernetesServices = parsedConfig[:prometheus_data_collection_settings][:cluster][:kubernetes_services] monitorKubernetesPods = parsedConfig[:prometheus_data_collection_settings][:cluster][:monitor_kubernetes_pods] + monitorKubernetesPodsNamespaces = parsedConfig[:prometheus_data_collection_settings][:cluster][:monitor_kubernetes_pods_namespaces] # Check for the right 
datattypes to enforce right setting values if checkForType(interval, String) && @@ -75,7 +126,7 @@ def populateSettingValuesFromConfigMap(parsedConfig) checkForTypeArray(fieldDrop, String) && checkForTypeArray(kubernetesServices, String) && checkForTypeArray(urls, String) && - !monitorKubernetesPods.nil? && (!!monitorKubernetesPods == monitorKubernetesPods) #Checking for Boolean type, since 'Boolean' is not defined as a type in ruby + (monitorKubernetesPods.nil? || (!monitorKubernetesPods.nil? && (!!monitorKubernetesPods == monitorKubernetesPods))) #Checking for Boolean type, since 'Boolean' is not defined as a type in ruby puts "config::Successfully passed typecheck for config settings for replicaset" #if setting is nil assign default values interval = (interval.nil?) ? @defaultRsInterval : interval @@ -83,7 +134,7 @@ def populateSettingValuesFromConfigMap(parsedConfig) fieldDrop = (fieldDrop.nil?) ? @defaultRsFieldDrop : fieldDrop kubernetesServices = (kubernetesServices.nil?) ? @defaultRsK8sServices : kubernetesServices urls = (urls.nil?) ? @defaultRsPromUrls : urls - monitorKubernetesPods = (kubernetesServices.nil?) ? @defaultRsMonitorPods : monitorKubernetesPods + monitorKubernetesPods = (monitorKubernetesPods.nil?) ? @defaultRsMonitorPods : monitorKubernetesPods file_name = "/opt/telegraf-test-rs.conf" # Copy the telegraf config file to a temp file to run telegraf in test mode with this config @@ -93,11 +144,24 @@ def populateSettingValuesFromConfigMap(parsedConfig) #Replace the placeholder config values with values from custom config text = File.read(file_name) new_contents = text.gsub("$AZMON_RS_PROM_INTERVAL", interval) - new_contents = new_contents.gsub("$AZMON_RS_PROM_FIELDPASS", ((fieldPass.length > 0) ? ("[\"" + fieldPass.join("\",\"") + "\"]") : "[]")) - new_contents = new_contents.gsub("$AZMON_RS_PROM_FIELDDROP", ((fieldDrop.length > 0) ? ("[\"" + fieldDrop.join("\",\"") + "\"]") : "[]")) + fieldPassSetting = (fieldPass.length > 0) ? 
("[\"" + fieldPass.join("\",\"") + "\"]") : "[]" + new_contents = new_contents.gsub("$AZMON_RS_PROM_FIELDPASS", fieldPassSetting) + fieldDropSetting = (fieldDrop.length > 0) ? ("[\"" + fieldDrop.join("\",\"") + "\"]") : "[]" + new_contents = new_contents.gsub("$AZMON_RS_PROM_FIELDDROP", fieldDropSetting) new_contents = new_contents.gsub("$AZMON_RS_PROM_URLS", ((urls.length > 0) ? ("[\"" + urls.join("\",\"") + "\"]") : "[]")) new_contents = new_contents.gsub("$AZMON_RS_PROM_K8S_SERVICES", ((kubernetesServices.length > 0) ? ("[\"" + kubernetesServices.join("\",\"") + "\"]") : "[]")) - new_contents = new_contents.gsub("$AZMON_RS_PROM_MONITOR_PODS", (monitorKubernetesPods ? "true" : "false")) + + # Check to see if monitor_kubernetes_pods is set to true with a valid setting for monitor_kubernetes_namespaces to enable scraping for specific namespaces + # Adding nil check here as well since checkForTypeArray returns true even if setting is nil to accomodate for other settings to be able - + # - to use defaults in case of nil settings + if monitorKubernetesPods && !monitorKubernetesPodsNamespaces.nil? 
&& checkForTypeArray(monitorKubernetesPodsNamespaces, String) + new_contents = createPrometheusPluginsWithNamespaceSetting(monitorKubernetesPods, monitorKubernetesPodsNamespaces, new_contents, interval, fieldPassSetting, fieldDropSetting) + monitorKubernetesPodsNamespacesLength = monitorKubernetesPodsNamespaces.length + else + new_contents = replaceDefaultMonitorPodSettings(new_contents, monitorKubernetesPods) + monitorKubernetesPodsNamespacesLength = 0 + end + File.open(file_name, "w") { |file| file.puts new_contents } puts "config::Successfully substituted the placeholders in telegraf conf file for replicaset" #Set environment variables for telemetry @@ -110,6 +174,8 @@ def populateSettingValuesFromConfigMap(parsedConfig) file.write("export TELEMETRY_RS_PROM_K8S_SERVICES_LENGTH=#{kubernetesServices.length}\n") file.write("export TELEMETRY_RS_PROM_URLS_LENGTH=#{urls.length}\n") file.write("export TELEMETRY_RS_PROM_MONITOR_PODS=\"#{monitorKubernetesPods}\"\n") + file.write("export TELEMETRY_RS_PROM_MONITOR_PODS_NS_LENGTH=\"#{monitorKubernetesPodsNamespacesLength}\"\n") + # Close file after writing all environment variables file.close puts "config::Successfully created telemetry file for replicaset" diff --git a/source/code/plugin/in_kube_nodes.rb b/source/code/plugin/in_kube_nodes.rb index 24ab51d4c..7249957ab 100644 --- a/source/code/plugin/in_kube_nodes.rb +++ b/source/code/plugin/in_kube_nodes.rb @@ -15,6 +15,7 @@ class Kube_nodeInventory_Input < Input @@rsPromK8sServiceCount = ENV["TELEMETRY_RS_PROM_K8S_SERVICES_LENGTH"] @@rsPromUrlCount = ENV["TELEMETRY_RS_PROM_URLS_LENGTH"] @@rsPromMonitorPods = ENV["TELEMETRY_RS_PROM_MONITOR_PODS"] + @@rsPromMonitorPodsNamespaceLength = ENV["TELEMETRY_RS_PROM_MONITOR_PODS_NS_LENGTH"] def initialize super @@ -150,6 +151,7 @@ def enumerate properties["rsPromServ"] = @@rsPromK8sServiceCount properties["rsPromUrl"] = @@rsPromUrlCount properties["rsPromMonPods"] = @@rsPromMonitorPods + properties["rsPromMonPodsNs"] = 
@@rsPromMonitorPodsNamespaceLength end ApplicationInsightsUtility.sendMetricTelemetry("NodeCoreCapacity", capacityInfo["cpu"], properties) telemetrySent = true From 2a072332b105ddb57cfd77cbebd67e9ec7a728fa Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Wed, 11 Sep 2019 22:32:09 -0700 Subject: [PATCH 121/160] Cherry-pick hotfix 09092019 to ci_feature (#265) --- installer/conf/container-health.conf | 103 ++++++++++++++++++++++++ installer/conf/container.conf | 25 ------ installer/datafiles/base_container.data | 5 ++ 3 files changed, 108 insertions(+), 25 deletions(-) create mode 100644 installer/conf/container-health.conf diff --git a/installer/conf/container-health.conf b/installer/conf/container-health.conf new file mode 100644 index 000000000..4cb9e6913 --- /dev/null +++ b/installer/conf/container-health.conf @@ -0,0 +1,103 @@ +# Fluentd config file for OMS Docker - container components (non kubeAPI) + +# Forward port 25225 for container logs + + type forward + port 25225 + bind 127.0.0.1 + + +# Container inventory + + type containerinventory + tag oms.containerinsights.containerinventory + run_interval 60s + log_level debug + + +#cadvisor perf + + type cadvisorperf + tag oms.api.cadvisorperf + run_interval 60s + log_level debug + + + + type filter_cadvisor_health_node + log_level debug + + + +#custom_metrics_mdm filter plugin + + type filter_cadvisor2mdm + custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral + metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,memoryRssBytes + log_level info + + + + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_containerinventory*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + 
max_retry_wait 9m + + + + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_cadvisorperf*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + max_retry_wait 9m + + + + + @type forward + send_timeout 60s + recover_wait 10s + hard_timeout 60s + heartbeat_type tcp + + + host healthmodel-replicaset-service.kube-system + port 25227 + + + + @type file + path /var/opt/microsoft/docker-cimprov/log/fluent_forward_failed.log + + + + + type out_mdm + log_level debug + num_threads 5 + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/out_mdm_cdvisorperf*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + max_retry_wait 9m + retry_mdm_post_wait_minutes 60 + diff --git a/installer/conf/container.conf b/installer/conf/container.conf index 0b26357f0..e68e4ff64 100755 --- a/installer/conf/container.conf +++ b/installer/conf/container.conf @@ -23,12 +23,6 @@ log_level debug - - type filter_cadvisor_health_node - log_level debug - - - #custom_metrics_mdm filter plugin type filter_cadvisor2mdm @@ -67,25 +61,6 @@ max_retry_wait 9m - - - @type forward - send_timeout 60s - recover_wait 10s - hard_timeout 60s - heartbeat_type tcp - - - host "#{ENV['HEALTHMODEL_REPLICASET_SERVICE_ENDPOINT']}" - port 25227 - - - - @type file - path /var/opt/microsoft/docker-cimprov/log/fluent_forward_failed.log - - - type out_mdm log_level debug diff --git a/installer/datafiles/base_container.data b/installer/datafiles/base_container.data index 3dc1a18cd..0ea3bc984 100644 --- a/installer/datafiles/base_container.data +++ b/installer/datafiles/base_container.data @@ -30,6 +30,7 @@ MAINTAINER: 'Microsoft Corporation' /opt/microsoft/omsagent/plugin/KubernetesApiClient.rb; source/code/plugin/KubernetesApiClient.rb; 644; root; root 
/etc/opt/microsoft/docker-cimprov/container.conf; installer/conf/container.conf; 644; root; root +/etc/opt/microsoft/docker-cimprov/container-health.conf; installer/conf/container-health.conf; 644; root; root /opt/microsoft/omsagent/plugin/CAdvisorMetricsAPIClient.rb; source/code/plugin/CAdvisorMetricsAPIClient.rb; 644; root; root /opt/microsoft/omsagent/plugin/in_kube_perf.rb; source/code/plugin/in_kube_perf.rb; 644; root; root @@ -257,6 +258,9 @@ chown omsagent:omiusers /var/opt/microsoft/docker-cimprov/log/fluent_forward_fai mv /etc/opt/microsoft/docker-cimprov/container.conf /etc/opt/microsoft/omsagent/sysconf/omsagent.d/container.conf chown omsagent:omsagent /etc/opt/microsoft/omsagent/sysconf/omsagent.d/container.conf +mv /etc/opt/microsoft/docker-cimprov/container-health.conf /etc/opt/microsoft/omsagent/sysconf/omsagent.d/container-health.conf +chown omsagent:omsagent /etc/opt/microsoft/omsagent/sysconf/omsagent.d/container-health.conf + %Postuninstall_10 # If we're an upgrade, skip all of this cleanup if ${{PERFORMING_UPGRADE_NOT}}; then @@ -268,6 +272,7 @@ if ${{PERFORMING_UPGRADE_NOT}}; then rm -f /var/opt/microsoft/docker-cimprov/log/kubernetes_client_log.txt rm -f /var/opt/microsoft/docker-cimprov/log/kubernetes_perf_log.txt rm -f /etc/opt/microsoft/omsagent/conf/omsagent.d/container.conf + rm -f /etc/opt/microsoft/omsagent/conf/omsagent.d/container-health.conf rmdir /var/opt/microsoft/docker-cimprov/log 2> /dev/null rmdir /var/opt/microsoft/docker-cimprov/state/ContainerInventory 2> /dev/null rmdir /var/opt/microsoft/docker-cimprov/state/ImageInventory 2> /dev/null From 2fee9fd3c1cfa31d143cc4b2174d40f426b15b3a Mon Sep 17 00:00:00 2001 From: ganga1980 Date: Mon, 23 Sep 2019 15:03:12 -0700 Subject: [PATCH 122/160] Gangams/add telemetry hybrid (#264) * add telemetry to detect the cloud, distro and kernel version * add null check since providerId optional * detect azurestack cloud * rename to KubernetesProviderID since ProviderID name already used in LA * 
capture workspaceCloud to the telemetry * trim the domain read from file --- .../code/plugin/ApplicationInsightsUtility.rb | 28 +++++++++++++++++++ source/code/plugin/in_kube_nodes.rb | 15 ++++++++++ 2 files changed, 43 insertions(+) diff --git a/source/code/plugin/ApplicationInsightsUtility.rb b/source/code/plugin/ApplicationInsightsUtility.rb index bb4831701..85b424e69 100644 --- a/source/code/plugin/ApplicationInsightsUtility.rb +++ b/source/code/plugin/ApplicationInsightsUtility.rb @@ -64,6 +64,7 @@ def initializeUtility() @@CustomProperties["ControllerType"] = ENV[@@EnvControllerType] encodedAppInsightsKey = ENV[@@EnvApplicationInsightsKey] appInsightsEndpoint = ENV[@@EnvApplicationInsightsEndpoint] + @@CustomProperties["WorkspaceCloud"] = getWorkspaceCloud #Check if telemetry is turned off telemetryOffSwitch = ENV["DISABLE_TELEMETRY"] @@ -230,5 +231,32 @@ def getWorkspaceId() $log.warn("Exception in AppInsightsUtility: getWorkspaceId - error: #{errorStr}") end end + + def getWorkspaceCloud() + begin + adminConf = {} + confFile = File.open(@OmsAdminFilePath, "r") + confFile.each_line do |line| + splitStrings = line.split("=") + adminConf[splitStrings[0]] = splitStrings[1] + end + workspaceDomain = adminConf["URL_TLD"].strip + workspaceCloud = "AzureCloud" + if workspaceDomain.casecmp("opinsights.azure.com") == 0 + workspaceCloud = "AzureCloud" + elsif workspaceDomain.casecmp("opinsights.azure.cn") == 0 + workspaceCloud = "AzureChinaCloud" + elsif workspaceDomain.casecmp("opinsights.azure.us") == 0 + workspaceCloud = "AzureUSGovernment" + elsif workspaceDomain.casecmp("opinsights.azure.de") == 0 + workspaceCloud = "AzureGermanCloud" + else + workspaceCloud = "Unknown" + end + return workspaceCloud + rescue => errorStr + $log.warn("Exception in AppInsightsUtility: getWorkspaceCloud - error: #{errorStr}") + end + end end end diff --git a/source/code/plugin/in_kube_nodes.rb b/source/code/plugin/in_kube_nodes.rb index 7249957ab..42bc13b68 100644 --- 
a/source/code/plugin/in_kube_nodes.rb +++ b/source/code/plugin/in_kube_nodes.rb @@ -8,6 +8,7 @@ class Kube_nodeInventory_Input < Input @@ContainerNodeInventoryTag = "oms.containerinsights.ContainerNodeInventory" @@MDMKubeNodeInventoryTag = "mdm.kubenodeinventory" @@promConfigMountPath = "/etc/config/settings/prometheus-data-collection-settings" + @@AzStackCloudFileName = "/etc/kubernetes/host/azurestackcloud.json" @@rsPromInterval = ENV["TELEMETRY_RS_PROM_INTERVAL"] @@rsPromFieldPassCount = ENV["TELEMETRY_RS_PROM_FIELDPASS_LENGTH"] @@ -84,6 +85,17 @@ def enumerate record["Labels"] = [items["metadata"]["labels"]] record["Status"] = "" + if !items["spec"]["providerID"].nil? && !items["spec"]["providerID"].empty? + if File.file?(@@AzStackCloudFileName) # existence of this file indicates agent running on azstack + record["KubernetesProviderID"] = "azurestack" + else + record["KubernetesProviderID"] = items["spec"]["providerID"] + end + else + record["KubernetesProviderID"] = "onprem" + end + + # Refer to https://kubernetes.io/docs/concepts/architecture/nodes/#condition for possible node conditions. # We check the status of each condition e.g. {"type": "OutOfDisk","status": "False"} . Based on this we # populate the KubeNodeInventory Status field. 
A possible value for this field could be "Ready OutofDisk" @@ -139,6 +151,9 @@ def enumerate properties["KubeletVersion"] = record["KubeletVersion"] properties["OperatingSystem"] = nodeInfo["operatingSystem"] properties["DockerVersion"] = dockerVersion + properties["KubernetesProviderID"] = record["KubernetesProviderID"] + properties["KernelVersion"] = nodeInfo["kernelVersion"] + properties["OSImage"] = nodeInfo["osImage"] capacityInfo = items["status"]["capacity"] ApplicationInsightsUtility.sendMetricTelemetry("NodeMemory", capacityInfo["memory"], properties) From 5eea104b5b5fc1b29ab978c2b2a501530efd6b6e Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Tue, 1 Oct 2019 17:00:46 -0700 Subject: [PATCH 123/160] KubeMonAgentEvents changes to collect configuration events (#267) * changes * changes * changes * changes * changes * changes * env changes * changes * changes * changes * reverting * changes * cahnges * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * chnages * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes --- 
installer/conf/td-agent-bit.conf | 2 +- installer/datafiles/base_container.data | 1 + installer/scripts/ConfigParseErrorLogger.rb | 21 ++ .../scripts/td-agent-bit-conf-customizer.rb | 3 +- .../scripts/tomlparser-prom-customconfig.rb | 19 +- installer/scripts/tomlparser.rb | 42 +-- source/code/go/src/plugins/oms.go | 313 +++++++++++++++++- source/code/go/src/plugins/out_oms.go | 5 +- source/code/go/src/plugins/telemetry.go | 10 +- 9 files changed, 378 insertions(+), 38 deletions(-) create mode 100644 installer/scripts/ConfigParseErrorLogger.rb diff --git a/installer/conf/td-agent-bit.conf b/installer/conf/td-agent-bit.conf index 4e3de6c46..6a1bf3e3e 100644 --- a/installer/conf/td-agent-bit.conf +++ b/installer/conf/td-agent-bit.conf @@ -28,6 +28,7 @@ Path /var/log/containers/omsagent*.log DB /var/opt/microsoft/docker-cimprov/state/omsagent-ai.db DB.Sync Off + Parser docker Mem_Buf_Limit 1m Path_Key filepath Skip_Long_Lines On @@ -51,7 +52,6 @@ [FILTER] Name grep Match oms.container.log.flbplugin.* - Exclude log E! 
[\[]inputs.prometheus[\]] [OUTPUT] Name oms diff --git a/installer/datafiles/base_container.data b/installer/datafiles/base_container.data index 0ea3bc984..159550a90 100644 --- a/installer/datafiles/base_container.data +++ b/installer/datafiles/base_container.data @@ -116,6 +116,7 @@ MAINTAINER: 'Microsoft Corporation' /opt/tomlparser.rb; installer/scripts/tomlparser.rb; 755; root; root /opt/tomlparser-prom-customconfig.rb; installer/scripts/tomlparser-prom-customconfig.rb; 755; root; root /opt/td-agent-bit-conf-customizer.rb; installer/scripts/td-agent-bit-conf-customizer.rb; 755; root; root +/opt/ConfigParseErrorLogger.rb; installer/scripts/ConfigParseErrorLogger.rb; 755; root; root diff --git a/installer/scripts/ConfigParseErrorLogger.rb b/installer/scripts/ConfigParseErrorLogger.rb new file mode 100644 index 000000000..5d6db8016 --- /dev/null +++ b/installer/scripts/ConfigParseErrorLogger.rb @@ -0,0 +1,21 @@ +#!/usr/local/bin/ruby +# frozen_string_literal: true + +class ConfigParseErrorLogger + require "json" + + def initialize + end + + class << self + def logError(message) + begin + errorMessage = "config::error::" + message + jsonMessage = errorMessage.to_json + STDERR.puts jsonMessage + rescue => errorStr + puts "Error in ConfigParserErrorLogger::logError: #{errorStr}" + end + end + end +end diff --git a/installer/scripts/td-agent-bit-conf-customizer.rb b/installer/scripts/td-agent-bit-conf-customizer.rb index 1e62e3cc2..fae3acb36 100644 --- a/installer/scripts/td-agent-bit-conf-customizer.rb +++ b/installer/scripts/td-agent-bit-conf-customizer.rb @@ -1,4 +1,5 @@ #!/usr/local/bin/ruby +require_relative "ConfigParseErrorLogger" @td_agent_bit_conf_path = "/etc/opt/microsoft/docker-cimprov/td-agent-bit.conf" @@ -40,7 +41,7 @@ def substituteFluentBitPlaceHolders File.open(@td_agent_bit_conf_path, "w") { |file| file.puts new_contents } puts "config::Successfully substituted the placeholders in td-agent-bit.conf file" rescue => errorStr - puts 
"td-agent-bit-config-customizer: error while substituting values: #{errorStr}" + ConfigParseErrorLogger.logError("td-agent-bit-config-customizer: error while substituting values in td-agent-bit.conf file: #{errorStr}") end end diff --git a/installer/scripts/tomlparser-prom-customconfig.rb b/installer/scripts/tomlparser-prom-customconfig.rb index d44bf3342..ab868f1a9 100644 --- a/installer/scripts/tomlparser-prom-customconfig.rb +++ b/installer/scripts/tomlparser-prom-customconfig.rb @@ -1,6 +1,7 @@ #!/usr/local/bin/ruby require_relative "tomlrb" +require_relative "ConfigParseErrorLogger" require "fileutils" @promConfigMapMountPath = "/etc/config/settings/prometheus-data-collection-settings" @@ -40,7 +41,7 @@ def parseConfigMap return nil end rescue => errorStr - puts "config::error::Exception while parsing toml config file for prometheus config: #{errorStr}, using defaults" + ConfigParseErrorLogger.logError("Exception while parsing config map for prometheus config: #{errorStr}, using defaults, please check config map for errors") return nil end end @@ -66,7 +67,7 @@ def replaceDefaultMonitorPodSettings(new_contents, monitorKubernetesPods) new_contents = new_contents.gsub("$AZMON_RS_PROM_MONITOR_PODS", ("monitor_kubernetes_pods = #{monitorKubernetesPods}")) new_contents = new_contents.gsub("$AZMON_RS_PROM_PLUGINS_WITH_NAMESPACE_FILTER", "") rescue => errorStr - puts "config::error::Exception while replacing default pod monitor settings: #{errorStr}" + puts "Exception while replacing default pod monitor settings: #{errorStr}" end return new_contents end @@ -98,7 +99,7 @@ def createPrometheusPluginsWithNamespaceSetting(monitorKubernetesPods, monitorKu new_contents = new_contents.gsub("$AZMON_RS_PROM_PLUGINS_WITH_NAMESPACE_FILTER", pluginConfigsWithNamespaces) return new_contents rescue => errorStr - puts "config::error::Exception while creating prometheus input plugins to filter namespaces: #{errorStr}, using defaults" + puts "Exception while creating prometheus input 
plugins to filter namespaces: #{errorStr}, using defaults" replaceDefaultMonitorPodSettings(new_contents, monitorKubernetesPods) end end @@ -181,10 +182,10 @@ def populateSettingValuesFromConfigMap(parsedConfig) puts "config::Successfully created telemetry file for replicaset" end else - puts "config::Typecheck failed for prometheus config settings for replicaset, using defaults" + ConfigParseErrorLogger.logError("Typecheck failed for prometheus config settings for replicaset, using defaults, please use right types for all settings") end # end of type check condition rescue => errorStr - puts "config::error::Exception while parsing config file for prometheus config for replicaset: #{errorStr}, using defaults" + ConfigParseErrorLogger.logError("Exception while parsing config file for prometheus config for replicaset: #{errorStr}, using defaults") setRsPromDefaults puts "****************End Prometheus Config Processing********************" end @@ -236,16 +237,16 @@ def populateSettingValuesFromConfigMap(parsedConfig) puts "config::Successfully created telemetry file for daemonset" end else - puts "config::Typecheck failed for prometheus config settings for daemonset, using defaults" + ConfigParseErrorLogger.logError("Typecheck failed for prometheus config settings for daemonset, using defaults, please use right types for all settings") end # end of type check condition rescue => errorStr - puts "config::error::Exception while parsing config file for prometheus config for daemonset: #{errorStr}, using defaults" + ConfigParseErrorLogger.logError("Exception while parsing config file for prometheus config for daemonset: #{errorStr}, using defaults, please check correctness of configmap") puts "****************End Prometheus Config Processing********************" end end # end of controller type check end else - puts "config::error:: Controller undefined while processing prometheus config, using defaults" + ConfigParseErrorLogger.logError("Controller undefined while 
processing prometheus config, using defaults") end end @@ -258,7 +259,7 @@ def populateSettingValuesFromConfigMap(parsedConfig) end else if (File.file?(@promConfigMapMountPath)) - puts "config::unsupported/missing config schema version - '#{@configSchemaVersion}' , using defaults" + ConfigParseErrorLogger.logError("config::unsupported/missing config schema version - '#{@configSchemaVersion}' , using defaults, please use supported version") else puts "config::No configmap mounted for prometheus custom config, using defaults" end diff --git a/installer/scripts/tomlparser.rb b/installer/scripts/tomlparser.rb index b66e1257e..523f8c307 100644 --- a/installer/scripts/tomlparser.rb +++ b/installer/scripts/tomlparser.rb @@ -1,7 +1,8 @@ #!/usr/local/bin/ruby require_relative "tomlrb" -require 'json' +require_relative "ConfigParseErrorLogger" +require "json" @log_settings_config_map_mount_path = "/etc/config/settings/log-data-collection-settings" @agent_settings_config_map_mount_path = "/etc/config/settings/agent-settings" @@ -33,7 +34,7 @@ def parseConfigMap(path) return nil end rescue => errorStr - puts "config::error::Exception while parsing toml config file: #{errorStr}, using defaults" + ConfigParseErrorLogger.logError("Exception while parsing config map for log collection/env variable settings: #{errorStr}, using defaults, please check config map for errors") @excludePath = "*_kube-system_*.log" return nil end @@ -70,7 +71,7 @@ def populateSettingValuesFromConfigMap(parsedConfig) end end rescue => errorStr - puts "config::error::Exception while reading config settings for stdout log collection - #{errorStr}, using defaults" + ConfigParseErrorLogger.logError("Exception while reading config map settings for stdout log collection - #{errorStr}, using defaults, please check config map for errors") end #Get stderr log config settings @@ -107,7 +108,7 @@ def populateSettingValuesFromConfigMap(parsedConfig) end end rescue => errorStr - puts "config::error:Exception while 
reading config settings for stderr log collection - #{errorStr}, using defaults" + ConfigParseErrorLogger.logError("Exception while reading config map settings for stderr log collection - #{errorStr}, using defaults, please check config map for errors") end #Get environment variables log config settings @@ -117,42 +118,43 @@ def populateSettingValuesFromConfigMap(parsedConfig) puts "config::Using config map setting for cluster level environment variable collection" end rescue => errorStr - puts "config::error::Exception while reading config settings for cluster level environment variable collection - #{errorStr}, using defaults" + ConfigParseErrorLogger.logError("Exception while reading config map settings for cluster level environment variable collection - #{errorStr}, using defaults, please check config map for errors") end end begin if !parsedConfig.nil? && !parsedConfig[:agent_settings].nil? && !parsedConfig[:agent_settings][:health_model].nil? && !parsedConfig[:agent_settings][:health_model][:enabled].nil? - @enable_health_model = parsedConfig[:agent_settings][:health_model][:enabled] + @enable_health_model = parsedConfig[:agent_settings][:health_model][:enabled] else - @enable_health_model = false + @enable_health_model = false end puts "enable_health_model = #{@enable_health_model}" rescue => errorStr - puts "config::error:Exception while reading config settings for health_model enabled setting - #{errorStr}, using defaults" + ConfigParseErrorLogger.logError("Exception while reading config map settings for health_model enabled setting - #{errorStr}, using defaults, please check config map for errors") @enable_health_model = false end end @configSchemaVersion = ENV["AZMON_AGENT_CFG_SCHEMA_VERSION"] puts "****************Start Config Processing********************" + if !@configSchemaVersion.nil? && !@configSchemaVersion.empty? 
&& @configSchemaVersion.strip.casecmp("v1") == 0 #note v1 is the only supported schema version , so hardcoding it - configMapSettings = {} + configMapSettings = {} - #iterate over every *settings file and build a hash of settings - Dir["/etc/config/settings/*settings"].each{|file| - puts "Parsing File #{file}" - settings = parseConfigMap(file) - if !settings.nil? - configMapSettings = configMapSettings.merge(settings) - end - } + #iterate over every *settings file and build a hash of settings + Dir["/etc/config/settings/*settings"].each { |file| + puts "Parsing File #{file}" + settings = parseConfigMap(file) + if !settings.nil? + configMapSettings = configMapSettings.merge(settings) + end + } if !configMapSettings.nil? populateSettingValuesFromConfigMap(configMapSettings) end else - puts "config::unsupported/missing config schema version - '#{@configSchemaVersion}' , using defaults" + ConfigParseErrorLogger.logError("config::unsupported/missing config schema version - '#{@configSchemaVersion}' , using defaults, please use supported schema version") @excludePath = "*_kube-system_*.log" end @@ -178,13 +180,13 @@ def populateSettingValuesFromConfigMap(parsedConfig) file.write("export AZMON_STDERR_EXCLUDED_NAMESPACES=#{@stderrExcludeNamespaces}\n") file.write("export AZMON_CLUSTER_COLLECT_ENV_VAR=#{@collectClusterEnvVariables}\n") file.write("export AZMON_CLUSTER_LOG_TAIL_EXCLUDE_PATH=#{@excludePath}\n") - #health_model settings + #health_model settings file.write("export AZMON_CLUSTER_ENABLE_HEALTH_MODEL=#{@enable_health_model}\n") # Close file after writing all environment variables file.close puts "Both stdout & stderr log collection are turned off for namespaces: '#{@excludePath}' " puts "****************End Config Processing********************" else - puts "config::error::Exception while opening file for writing config environment variables" + puts "Exception while opening file for writing config environment variables" puts "****************End Config 
Processing********************" end diff --git a/source/code/go/src/plugins/oms.go b/source/code/go/src/plugins/oms.go index c5ad307d8..6d78455bd 100644 --- a/source/code/go/src/plugins/oms.go +++ b/source/code/go/src/plugins/oms.go @@ -28,6 +28,9 @@ const ContainerLogDataType = "CONTAINER_LOG_BLOB" // DataType for Insights metric const InsightsMetricsDataType = "INSIGHTS_METRICS_BLOB" +// DataType for KubeMonAgentEvent +const KubeMonAgentEventDataType = "KUBE_MON_AGENT_EVENTS_BLOB" + //env varibale which has ResourceId for LA const ResourceIdEnv = "AKS_RESOURCE_ID" @@ -46,6 +49,20 @@ const TelegrafTagClusterName = "clusterName" // clusterId tag const TelegrafTagClusterID = "clusterId" +const ConfigErrorEventCategory = "container.azm.ms/configmap" + +const PromScrapingErrorEventCategory = "container.azm.ms/promscraping" + +const NoErrorEventCategory = "container.azm.ms/noerror" + +const KubeMonAgentEventError = "Error" + +const KubeMonAgentEventWarning = "Warning" + +const KubeMonAgentEventInfo = "Info" + +const KubeMonAgentEventsFlushedEvent = "KubeMonAgentEventsFlushed" + // ContainerLogPluginConfFilePath --> config file path for container log plugin const DaemonSetContainerLogPluginConfFilePath = "/etc/opt/microsoft/docker-cimprov/out_oms.conf" const ReplicaSetContainerLogPluginConfFilePath = "/etc/opt/microsoft/docker-cimprov/out_oms.conf" @@ -54,6 +71,8 @@ const ReplicaSetContainerLogPluginConfFilePath = "/etc/opt/microsoft/docker-cimp const IPName = "Containers" const defaultContainerInventoryRefreshInterval = 60 +const kubeMonAgentConfigEventFlushInterval = 60 + var ( // PluginConfiguration the plugins configuration PluginConfiguration map[string]string @@ -71,6 +90,8 @@ var ( ResourceCentric bool //ResourceName ResourceName string + //KubeMonAgentEvents skip first flush + skipKubeMonEventsFlush bool ) var ( @@ -88,11 +109,19 @@ var ( ContainerLogTelemetryMutex = &sync.Mutex{} // ClientSet for querying KubeAPIs ClientSet *kubernetes.Clientset + // Config 
error hash + ConfigErrorEvent map[string]KubeMonAgentEventTags + // Prometheus scraping error hash + PromScrapeErrorEvent map[string]KubeMonAgentEventTags + // EventHashUpdateMutex read and write mutex access to the event hash + EventHashUpdateMutex = &sync.Mutex{} ) var ( // ContainerImageNameRefreshTicker updates the container image and names periodically ContainerImageNameRefreshTicker *time.Ticker + // KubeMonAgentConfigEventsSendTicker to send config events every hour + KubeMonAgentConfigEventsSendTicker *time.Ticker ) var ( @@ -142,6 +171,41 @@ type ContainerLogBlob struct { DataItems []DataItem `json:"DataItems"` } +// Config Error message to be sent to Log Analytics +type laKubeMonAgentEvents struct { + Computer string `json:"Computer"` + CollectionTime string `json:"CollectionTime"` //mapped to TimeGenerated + Category string `json:"Category"` + Level string `json:"Level"` + ClusterId string `json:"ClusterId"` + ClusterName string `json:"ClusterName"` + Message string `json:"Message"` + Tags string `json:"Tags"` +} + +type KubeMonAgentEventTags struct { + PodName string + ContainerId string + FirstOccurance string + LastOccurance string + Count int +} + +type KubeMonAgentEventBlob struct { + DataType string `json:"DataType"` + IPName string `json:"IPName"` + DataItems []laKubeMonAgentEvents `json:"DataItems"` +} + +// KubeMonAgentEventType to be used as enum +type KubeMonAgentEventType int + +const ( + // KubeMonAgentEventType to be used as enum for ConfigError and ScrapingError + ConfigError KubeMonAgentEventType = iota + PromScrapingError +) + func createLogger() *log.Logger { var logfile *os.File path := "/var/opt/microsoft/docker-cimprov/log/fluent-bit-out-oms-runtime.log" @@ -262,6 +326,223 @@ func convert(in interface{}) (float64, bool) { } } +// PostConfigErrorstoLA sends config/prometheus scraping error log lines to LA +func populateKubeMonAgentEventHash(record map[interface{}]interface{}, errType KubeMonAgentEventType) { + var logRecordString = 
ToString(record["log"]) + var eventTimeStamp = ToString(record["time"]) + containerID, _, podName := GetContainerIDK8sNamespacePodNameFromFileName(ToString(record["filepath"])) + + Log("Locked EventHashUpdateMutex for updating hash \n ") + EventHashUpdateMutex.Lock() + switch errType { + case ConfigError: + // Doing this since the error logger library is adding quotes around the string and a newline to the end because + // we are converting string to json to log lines in different lines as one record + logRecordString = strings.TrimSuffix(logRecordString, "\n") + logRecordString = logRecordString[1 : len(logRecordString)-1] + + if val, ok := ConfigErrorEvent[logRecordString]; ok { + Log("In config error existing hash update\n") + eventCount := val.Count + eventFirstOccurance := val.FirstOccurance + + ConfigErrorEvent[logRecordString] = KubeMonAgentEventTags{ + PodName: podName, + ContainerId: containerID, + FirstOccurance: eventFirstOccurance, + LastOccurance: eventTimeStamp, + Count: eventCount + 1, + } + } else { + ConfigErrorEvent[logRecordString] = KubeMonAgentEventTags{ + PodName: podName, + ContainerId: containerID, + FirstOccurance: eventTimeStamp, + LastOccurance: eventTimeStamp, + Count: 1, + } + } + + case PromScrapingError: + // Splitting this based on the string 'E! [inputs.prometheus]: ' since the log entry has timestamp and we want to remove that before building the hash + var scrapingSplitString = strings.Split(logRecordString, "E! 
[inputs.prometheus]: ") + if scrapingSplitString != nil && len(scrapingSplitString) == 2 { + var splitString = scrapingSplitString[1] + // Trimming the newline character at the end since this is being added as the key + splitString = strings.TrimSuffix(splitString, "\n") + if splitString != "" { + if val, ok := PromScrapeErrorEvent[splitString]; ok { + Log("In config error existing hash update\n") + eventCount := val.Count + eventFirstOccurance := val.FirstOccurance + + PromScrapeErrorEvent[splitString] = KubeMonAgentEventTags{ + PodName: podName, + ContainerId: containerID, + FirstOccurance: eventFirstOccurance, + LastOccurance: eventTimeStamp, + Count: eventCount + 1, + } + } else { + PromScrapeErrorEvent[splitString] = KubeMonAgentEventTags{ + PodName: podName, + ContainerId: containerID, + FirstOccurance: eventTimeStamp, + LastOccurance: eventTimeStamp, + Count: 1, + } + } + } + } + } + EventHashUpdateMutex.Unlock() + Log("Unlocked EventHashUpdateMutex after updating hash \n ") +} + +// Function to get config error log records after iterating through the two hashes +func flushKubeMonAgentEventRecords() { + for ; true; <-KubeMonAgentConfigEventsSendTicker.C { + if skipKubeMonEventsFlush != true { + Log("In flushConfigErrorRecords\n") + start := time.Now() + var resp *http.Response + var postError error + var elapsed time.Duration + var laKubeMonAgentEventsRecords []laKubeMonAgentEvents + telemetryDimensions := make(map[string]string) + + telemetryDimensions["ConfigErrorEventCount"] = strconv.Itoa(len(ConfigErrorEvent)) + telemetryDimensions["PromScrapeErrorEventCount"] = strconv.Itoa(len(PromScrapeErrorEvent)) + + if (len(ConfigErrorEvent) > 0) || (len(PromScrapeErrorEvent) > 0) { + EventHashUpdateMutex.Lock() + Log("Locked EventHashUpdateMutex for reading hashes\n") + for k, v := range ConfigErrorEvent { + tagJson, err := json.Marshal(v) + + if err != nil { + message := fmt.Sprintf("Error while Marshalling config error event tags: %s", err.Error()) + 
Log(message) + SendException(message) + } else { + laKubeMonAgentEventsRecord := laKubeMonAgentEvents{ + Computer: Computer, + CollectionTime: start.Format(time.RFC3339), + Category: ConfigErrorEventCategory, + Level: KubeMonAgentEventError, + ClusterId: ResourceID, + ClusterName: ResourceName, + Message: k, + Tags: fmt.Sprintf("%s", tagJson), + } + laKubeMonAgentEventsRecords = append(laKubeMonAgentEventsRecords, laKubeMonAgentEventsRecord) + } + } + + for k, v := range PromScrapeErrorEvent { + tagJson, err := json.Marshal(v) + if err != nil { + message := fmt.Sprintf("Error while Marshalling prom scrape error event tags: %s", err.Error()) + Log(message) + SendException(message) + } else { + laKubeMonAgentEventsRecord := laKubeMonAgentEvents{ + Computer: Computer, + CollectionTime: start.Format(time.RFC3339), + Category: PromScrapingErrorEventCategory, + Level: KubeMonAgentEventWarning, + ClusterId: ResourceID, + ClusterName: ResourceName, + Message: k, + Tags: fmt.Sprintf("%s", tagJson), + } + laKubeMonAgentEventsRecords = append(laKubeMonAgentEventsRecords, laKubeMonAgentEventsRecord) + } + } + + //Clearing out the prometheus scrape hash so that it can be rebuilt with the errors in the next hour + for k := range PromScrapeErrorEvent { + delete(PromScrapeErrorEvent, k) + } + Log("PromScrapeErrorEvent cache cleared\n") + EventHashUpdateMutex.Unlock() + Log("Unlocked EventHashUpdateMutex for reading hashes\n") + } else { + //Sending a record in case there are no errors to be able to differentiate between no data vs no errors + tagsValue := KubeMonAgentEventTags{} + + tagJson, err := json.Marshal(tagsValue) + if err != nil { + message := fmt.Sprintf("Error while Marshalling no error tags: %s", err.Error()) + Log(message) + SendException(message) + } else { + laKubeMonAgentEventsRecord := laKubeMonAgentEvents{ + Computer: Computer, + CollectionTime: start.Format(time.RFC3339), + Category: NoErrorEventCategory, + Level: KubeMonAgentEventInfo, + ClusterId: ResourceID, 
+ ClusterName: ResourceName, + Message: "No errors", + Tags: fmt.Sprintf("%s", tagJson), + } + laKubeMonAgentEventsRecords = append(laKubeMonAgentEventsRecords, laKubeMonAgentEventsRecord) + } + } + + if len(laKubeMonAgentEventsRecords) > 0 { + kubeMonAgentEventEntry := KubeMonAgentEventBlob{ + DataType: KubeMonAgentEventDataType, + IPName: IPName, + DataItems: laKubeMonAgentEventsRecords} + + marshalled, err := json.Marshal(kubeMonAgentEventEntry) + + if err != nil { + message := fmt.Sprintf("Error while marshalling kubemonagentevent entry: %s", err.Error()) + Log(message) + SendException(message) + } else { + req, _ := http.NewRequest("POST", OMSEndpoint, bytes.NewBuffer(marshalled)) + req.Header.Set("Content-Type", "application/json") + //expensive to do string len for every request, so use a flag + if ResourceCentric == true { + req.Header.Set("x-ms-AzureResourceId", ResourceID) + } + + resp, postError = HTTPClient.Do(req) + elapsed = time.Since(start) + + if postError != nil { + message := fmt.Sprintf("Error when sending kubemonagentevent request %s \n", err.Error()) + Log(message) + Log("Failed to flush %d records after %s", len(laKubeMonAgentEventsRecords), elapsed) + } else if resp == nil || resp.StatusCode != 200 { + if resp != nil { + Log("Status %s Status Code %d", resp.Status, resp.StatusCode) + } + Log("Failed to flush %d records after %s", len(laKubeMonAgentEventsRecords), elapsed) + } else { + numRecords := len(laKubeMonAgentEventsRecords) + Log("Successfully flushed %d records in %s", numRecords, elapsed) + + // Send telemetry to AppInsights resource + SendEvent(KubeMonAgentEventsFlushedEvent, telemetryDimensions) + + } + if resp != nil && resp.Body != nil { + defer resp.Body.Close() + } + } + } + } else { + // Setting this to false to allow for subsequent flushes after the first hour + skipKubeMonEventsFlush = false + } + } +} + //Translates telegraf time series to one or more Azure loganalytics metric(s) func translateTelegrafMetrics(m 
map[interface{}]interface{}) ([]*laTelegrafMetric, error) { @@ -431,7 +712,7 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { DataUpdateMutex.Unlock() for _, record := range tailPluginRecords { - containerID, k8sNamespace := GetContainerIDK8sNamespaceFromFileName(ToString(record["filepath"])) + containerID, k8sNamespace, _ := GetContainerIDK8sNamespacePodNameFromFileName(ToString(record["filepath"])) logEntrySource := ToString(record["stream"]) if strings.EqualFold(logEntrySource, "stdout") { @@ -502,6 +783,7 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { SendException(message) return output.FLB_OK } + req, _ := http.NewRequest("POST", OMSEndpoint, bytes.NewBuffer(marshalled)) req.Header.Set("Content-Type", "application/json") //expensive to do string len for every request, so use a flag @@ -552,11 +834,12 @@ func containsKey(currentMap map[string]bool, key string) bool { return c } -// GetContainerIDK8sNamespaceFromFileName Gets the container ID From the file Name +// GetContainerIDK8sNamespacePodNameFromFileName Gets the container ID, k8s namespace and pod name From the file Name // sample filename kube-proxy-dgcx7_kube-system_kube-proxy-8df7e49e9028b60b5b0d0547f409c455a9567946cf763267b7e6fa053ab8c182.log -func GetContainerIDK8sNamespaceFromFileName(filename string) (string, string) { +func GetContainerIDK8sNamespacePodNameFromFileName(filename string) (string, string, string) { id := "" ns := "" + podName := "" start := strings.LastIndex(filename, "-") end := strings.LastIndex(filename, ".") @@ -576,7 +859,16 @@ func GetContainerIDK8sNamespaceFromFileName(filename string) (string, string) { ns = filename[start+1 : end] } - return id, ns + start = strings.Index(filename, "/containers/") + end = strings.Index(filename, "_") + + if start >= end || start == -1 || end == -1 { + podName = "" + } else { + podName = filename[(start + len("/containers/")):end] + } + + return id, ns, podName } // InitializePlugin 
reads and populates plugin configuration @@ -586,6 +878,12 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { StderrIgnoreNsSet = make(map[string]bool) ImageIDMap = make(map[string]string) NameIDMap = make(map[string]string) + // Keeping the two error hashes separate since we need to keep the config error hash for the lifetime of the container + // whereas the prometheus scrape error hash needs to be refreshed every hour + ConfigErrorEvent = make(map[string]KubeMonAgentEventTags) + PromScrapeErrorEvent = make(map[string]KubeMonAgentEventTags) + // Initilizing this to true to skip the first kubemonagentevent flush since the errors are not populated at this time + skipKubeMonEventsFlush = true pluginConfig, err := ReadConfiguration(pluginConfPath) if err != nil { @@ -640,6 +938,9 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { Log("containerInventoryRefreshInterval = %d \n", containerInventoryRefreshInterval) ContainerImageNameRefreshTicker = time.NewTicker(time.Second * time.Duration(containerInventoryRefreshInterval)) + Log("kubeMonAgentConfigEventFlushInterval = %d \n", kubeMonAgentConfigEventFlushInterval) + KubeMonAgentConfigEventsSendTicker = time.NewTicker(time.Minute * time.Duration(kubeMonAgentConfigEventFlushInterval)) + // Populate Computer field containerHostName, err := ioutil.ReadFile(pluginConfig["container_host_file_path"]) if err != nil { @@ -682,7 +983,11 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { populateExcludedStdoutNamespaces() populateExcludedStderrNamespaces() go updateContainerImageNameMaps() + + // Flush config error records every hour + go flushKubeMonAgentEventRecords() } else { Log("Running in replicaset. 
Disabling container enrichment caching & updates \n") } + } diff --git a/source/code/go/src/plugins/out_oms.go b/source/code/go/src/plugins/out_oms.go index e9e7124b7..1f1915798 100644 --- a/source/code/go/src/plugins/out_oms.go +++ b/source/code/go/src/plugins/out_oms.go @@ -1,14 +1,14 @@ package main import ( - "github.com/fluent/fluent-bit-go/output" "github.com/Microsoft/ApplicationInsights-Go/appinsights" + "github.com/fluent/fluent-bit-go/output" ) import ( "C" + "os" "strings" "unsafe" - "os" ) //export FLBPluginRegister @@ -61,6 +61,7 @@ func FLBPluginFlush(data unsafe.Pointer, length C.int, tag *C.char) int { incomingTag := strings.ToLower(C.GoString(tag)) if strings.Contains(incomingTag, "oms.container.log.flbplugin") { + // This will also include populating cache to be sent as for config events return PushToAppInsightsTraces(records, appinsights.Information, incomingTag) } else if strings.Contains(incomingTag, "oms.container.perf.telegraf") { return PostTelegrafMetricsToLA(records) diff --git a/source/code/go/src/plugins/telemetry.go b/source/code/go/src/plugins/telemetry.go index 4f22b8c03..d5675187f 100644 --- a/source/code/go/src/plugins/telemetry.go +++ b/source/code/go/src/plugins/telemetry.go @@ -198,7 +198,15 @@ func InitializeTelemetryClient(agentVersion string) (int, error) { func PushToAppInsightsTraces(records []map[interface{}]interface{}, severityLevel contracts.SeverityLevel, tag string) int { var logLines []string for _, record := range records { - logLines = append(logLines, ToString(record["log"])) + // If record contains config error or prometheus scraping errors send it to KubeMonAgentEvents table + var logEntry = ToString(record["log"]) + if strings.Contains(logEntry, "config::error") { + populateKubeMonAgentEventHash(record, ConfigError) + } else if strings.Contains(logEntry, "E! 
[inputs.prometheus]") { + populateKubeMonAgentEventHash(record, PromScrapingError) + } else { + logLines = append(logLines, logEntry) + } } traceEntry := strings.Join(logLines, "\n") From c472b120c473f75e3895e3a5bd1adea96b95e250 Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Thu, 26 Sep 2019 11:00:50 -0700 Subject: [PATCH 124/160] Fix the Dupe Perf Data Issue from the DaemonSet (#266) * Dupe Perf Record Fix --- installer/conf/container-health.conf | 4 ++-- installer/conf/kube.conf | 4 ++-- source/code/plugin/filter_health_model_builder.rb | 6 +++--- source/code/plugin/in_cadvisor_perf.rb | 4 ++-- source/code/plugin/in_kube_health.rb | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/installer/conf/container-health.conf b/installer/conf/container-health.conf index 4cb9e6913..e6edf41df 100644 --- a/installer/conf/container-health.conf +++ b/installer/conf/container-health.conf @@ -23,7 +23,7 @@ log_level debug - + type filter_cadvisor_health_node log_level debug @@ -68,7 +68,7 @@ - + @type forward send_timeout 60s recover_wait 10s diff --git a/installer/conf/kube.conf b/installer/conf/kube.conf index 3cbc3ff17..8e1f6ae88 100644 --- a/installer/conf/kube.conf +++ b/installer/conf/kube.conf @@ -55,7 +55,7 @@ #Kubernetes health type kubehealth - tag oms.api.KubeHealth.ReplicaSet + tag kubehealth.ReplicaSet run_interval 60s log_level debug @@ -82,7 +82,7 @@ log_level info - + type filter_health_model_builder diff --git a/source/code/plugin/filter_health_model_builder.rb b/source/code/plugin/filter_health_model_builder.rb index 0c1b378a0..39452cb7e 100644 --- a/source/code/plugin/filter_health_model_builder.rb +++ b/source/code/plugin/filter_health_model_builder.rb @@ -84,7 +84,7 @@ def filter_stream(tag, es) new_es = MultiEventStream.new time = Time.now - if tag.start_with?("oms.api.KubeHealth.DaemonSet") + if tag.start_with?("kubehealth.DaemonSet") records = [] if !es.nil? 
es.each{|time, record| @@ -93,7 +93,7 @@ def filter_stream(tag, es) @buffer.add_to_buffer(records) end return [] - elsif tag.start_with?("oms.api.KubeHealth.ReplicaSet") + elsif tag.start_with?("kubehealth.ReplicaSet") @log.info "TAG #{tag}" records = [] es.each{|time, record| @@ -220,7 +220,7 @@ def filter_stream(tag, es) # this filter also acts as a pass through as we are rewriting the tag and emitting to the fluent stream es else - raise 'Invalid tag #{tag} received' + raise "Invalid tag #{tag} received" end rescue => e diff --git a/source/code/plugin/in_cadvisor_perf.rb b/source/code/plugin/in_cadvisor_perf.rb index 1702877a2..ce205322d 100644 --- a/source/code/plugin/in_cadvisor_perf.rb +++ b/source/code/plugin/in_cadvisor_perf.rb @@ -19,8 +19,8 @@ def initialize config_param :run_interval, :time, :default => "1m" config_param :tag, :string, :default => "oms.api.cadvisorperf" config_param :mdmtag, :string, :default => "mdm.cadvisorperf" - config_param :nodehealthtag, :string, :default => "oms.api.KubeHealth.DaemonSet.Node" - #config_param :containerhealthtag, :string, :default => "oms.api.KubeHealth.DaemonSet.Container" + config_param :nodehealthtag, :string, :default => "kubehealth.DaemonSet.Node" + #config_param :containerhealthtag, :string, :default => "kubehealth.DaemonSet.Container" def configure(conf) super diff --git a/source/code/plugin/in_kube_health.rb b/source/code/plugin/in_kube_health.rb index 199e03e56..5d29eb035 100644 --- a/source/code/plugin/in_kube_health.rb +++ b/source/code/plugin/in_kube_health.rb @@ -34,7 +34,7 @@ def initialize include HealthModel config_param :run_interval, :time, :default => "1m" - config_param :tag, :string, :default => "oms.api.KubeHealth.ReplicaSet" + config_param :tag, :string, :default => "kubehealth.ReplicaSet" def configure(conf) super From 98e4114bb499943fe60b06d5719f906f7b2b7b0d Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Thu, 3 Oct 2019 15:48:35 -0700 Subject: [PATCH 125/160] PR for 1. 
Container Memory CPU monitor 2. Configuration for Node Conditions 3. Fixed Type Changes 4. Use Env variable, and health_forward (that handles network errors at init) 5. Unit Tests (#268) --- installer/conf/container-health.conf | 103 - installer/conf/container.conf | 31 + installer/conf/health_model_definition.json | 91 +- installer/conf/healthmonitorconfig.json | 29 +- installer/conf/kube.conf | 15 +- installer/datafiles/base_container.data | 13 +- installer/scripts/tomlparser.rb | 50 +- .../filter_cadvisor_health_container.rb | 233 +- .../plugin/filter_cadvisor_health_node.rb | 131 +- .../plugin/filter_health_model_builder.rb | 78 +- .../plugin/health/agg_monitor_id_labels.rb | 11 +- .../health_container_cpu_memory_aggregator.rb | 258 + ...h_container_cpu_memory_record_formatter.rb | 34 + .../health/health_kube_api_down_handler.rb | 12 +- .../health/health_kubernetes_resources.rb | 291 +- .../health/health_missing_signal_generator.rb | 8 +- .../plugin/health/health_model_constants.rb | 88 +- .../plugin/health/health_monitor_helpers.rb | 42 +- .../plugin/health/health_monitor_provider.rb | 22 +- .../plugin/health/health_monitor_utils.rb | 41 +- .../plugin/health/health_signal_reducer.rb | 1 - .../plugin/health/parent_monitor_provider.rb | 5 +- source/code/plugin/health/unit_monitor.rb | 1 + source/code/plugin/in_cadvisor_perf.rb | 7 +- source/code/plugin/in_kube_events.rb | 2 +- source/code/plugin/in_kube_health.rb | 61 +- source/code/plugin/out_health_forward.rb | 677 ++ test/code/plugin/health/cadvisor_perf.json | 2540 +++++++ test/code/plugin/health/deployments.json | 1385 ++++ ...th_container_cpu_memory_aggregator_spec.rb | 190 + ...tainer_cpu_memory_record_formatter_spec.rb | 58 + .../health/health_kubernetes_resource_spec.rb | 26 +- .../health/health_model_builder_test.rb | 162 +- test/code/plugin/health/nodes.json | 1966 ++++++ .../health/parent_monitor_provider_spec.rb | 10 +- test/code/plugin/health/pods.json | 5987 +++++++++++++++++ 36 files changed, 
13988 insertions(+), 671 deletions(-) delete mode 100644 installer/conf/container-health.conf create mode 100644 source/code/plugin/health/health_container_cpu_memory_aggregator.rb create mode 100644 source/code/plugin/health/health_container_cpu_memory_record_formatter.rb create mode 100644 source/code/plugin/out_health_forward.rb create mode 100644 test/code/plugin/health/cadvisor_perf.json create mode 100644 test/code/plugin/health/deployments.json create mode 100644 test/code/plugin/health/health_container_cpu_memory_aggregator_spec.rb create mode 100644 test/code/plugin/health/health_container_cpu_memory_record_formatter_spec.rb create mode 100644 test/code/plugin/health/nodes.json create mode 100644 test/code/plugin/health/pods.json diff --git a/installer/conf/container-health.conf b/installer/conf/container-health.conf deleted file mode 100644 index e6edf41df..000000000 --- a/installer/conf/container-health.conf +++ /dev/null @@ -1,103 +0,0 @@ -# Fluentd config file for OMS Docker - container components (non kubeAPI) - -# Forward port 25225 for container logs - - type forward - port 25225 - bind 127.0.0.1 - - -# Container inventory - - type containerinventory - tag oms.containerinsights.containerinventory - run_interval 60s - log_level debug - - -#cadvisor perf - - type cadvisorperf - tag oms.api.cadvisorperf - run_interval 60s - log_level debug - - - - type filter_cadvisor_health_node - log_level debug - - - -#custom_metrics_mdm filter plugin - - type filter_cadvisor2mdm - custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral - metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,memoryRssBytes - log_level info - - - - type out_oms - log_level debug - num_threads 5 - buffer_chunk_limit 20m - buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_containerinventory*.buffer - buffer_queue_limit 
20 - buffer_queue_full_action drop_oldest_chunk - flush_interval 20s - retry_limit 10 - retry_wait 30s - max_retry_wait 9m - - - - type out_oms - log_level debug - num_threads 5 - buffer_chunk_limit 20m - buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_cadvisorperf*.buffer - buffer_queue_limit 20 - buffer_queue_full_action drop_oldest_chunk - flush_interval 20s - retry_limit 10 - retry_wait 30s - max_retry_wait 9m - - - - - @type forward - send_timeout 60s - recover_wait 10s - hard_timeout 60s - heartbeat_type tcp - - - host healthmodel-replicaset-service.kube-system - port 25227 - - - - @type file - path /var/opt/microsoft/docker-cimprov/log/fluent_forward_failed.log - - - - - type out_mdm - log_level debug - num_threads 5 - buffer_chunk_limit 20m - buffer_type file - buffer_path %STATE_DIR_WS%/out_mdm_cdvisorperf*.buffer - buffer_queue_limit 20 - buffer_queue_full_action drop_oldest_chunk - flush_interval 20s - retry_limit 10 - retry_wait 30s - max_retry_wait 9m - retry_mdm_post_wait_minutes 60 - diff --git a/installer/conf/container.conf b/installer/conf/container.conf index e68e4ff64..5f08043c7 100755 --- a/installer/conf/container.conf +++ b/installer/conf/container.conf @@ -23,6 +23,16 @@ log_level debug + + type filter_cadvisor_health_node + log_level debug + + + + type filter_cadvisor_health_container + log_level debug + + #custom_metrics_mdm filter plugin type filter_cadvisor2mdm @@ -61,6 +71,27 @@ max_retry_wait 9m + + + @type health_forward + send_timeout 60s + recover_wait 10s + hard_timeout 60s + heartbeat_type tcp + skip_network_error_at_init true + expire_dns_cache 600s + + + host "#{ENV['HEALTHMODEL_REPLICASET_SERVICE_SERVICE_HOST']}" + port "#{ENV['HEALTHMODEL_REPLICASET_SERVICE_SERVICE_PORT']}" + + + + @type file + path /var/opt/microsoft/docker-cimprov/log/fluent_forward_failed.log + + + type out_mdm log_level debug diff --git a/installer/conf/health_model_definition.json b/installer/conf/health_model_definition.json index 
1112fe158..e6c9e1808 100644 --- a/installer/conf/health_model_definition.json +++ b/installer/conf/health_model_definition.json @@ -23,6 +23,61 @@ "container.azm.ms/cluster-name" ] }, + { + "monitor_id": "container", + "labels": [ + "container.azm.ms/namespace", + "container.azm.ms/workload-name", + "container.azm.ms/workload-kind", + "container.azm.ms/container", + "container.azm.ms/cluster-region", + "container.azm.ms/cluster-subscription-id", + "container.azm.ms/cluster-resource-group", + "container.azm.ms/cluster-name" + ], + "parent_monitor_id": [ + { + "label": "container.azm.ms/namespace", + "operator": "==", + "value": "kube-system", + "id": "system_workload" + }, + { + "label": "container.azm.ms/namespace", + "operator": "!=", + "value": "kube-system", + "id": "user_workload" + } + ] + }, + { + "monitor_id": "container_cpu_utilization", + "parent_monitor_id": "container", + "labels": [ + "container.azm.ms/namespace", + "container.azm.ms/workload-name", + "container.azm.ms/workload-kind", + "container.azm.ms/container", + "container.azm.ms/cluster-region", + "container.azm.ms/cluster-subscription-id", + "container.azm.ms/cluster-resource-group", + "container.azm.ms/cluster-name" + ] + }, + { + "monitor_id": "container_memory_utilization", + "parent_monitor_id": "container", + "labels": [ + "container.azm.ms/namespace", + "container.azm.ms/workload-name", + "container.azm.ms/workload-kind", + "container.azm.ms/container", + "container.azm.ms/cluster-region", + "container.azm.ms/cluster-subscription-id", + "container.azm.ms/cluster-resource-group", + "container.azm.ms/cluster-name" + ] + }, { "monitor_id": "system_workload_pods_ready", "parent_monitor_id": "system_workload", @@ -104,6 +159,9 @@ "kubernetes.io/hostname", "agentpool", "kubernetes.io/role", + "node-role.kubernetes.io/master", + "node-role.kubernetes.io/compute", + "node-role.kubernetes.io/infra", "container.azm.ms/cluster-region", "container.azm.ms/cluster-subscription-id", 
"container.azm.ms/cluster-resource-group", @@ -117,6 +175,9 @@ "kubernetes.io/hostname", "agentpool", "kubernetes.io/role", + "node-role.kubernetes.io/master", + "node-role.kubernetes.io/compute", + "node-role.kubernetes.io/infra", "container.azm.ms/cluster-region", "container.azm.ms/cluster-subscription-id", "container.azm.ms/cluster-resource-group", @@ -130,6 +191,9 @@ "kubernetes.io/hostname", "agentpool", "kubernetes.io/role", + "node-role.kubernetes.io/master", + "node-role.kubernetes.io/compute", + "node-role.kubernetes.io/infra", "container.azm.ms/cluster-region", "container.azm.ms/cluster-subscription-id", "container.azm.ms/cluster-resource-group", @@ -143,12 +207,33 @@ "kubernetes.io/hostname", "agentpool", "kubernetes.io/role", + "node-role.kubernetes.io/master", + "node-role.kubernetes.io/compute", + "node-role.kubernetes.io/infra", "container.azm.ms/cluster-region", "container.azm.ms/cluster-subscription-id", "container.azm.ms/cluster-resource-group", "container.azm.ms/cluster-name" ], "parent_monitor_id": [ + { + "label": "node-role.kubernetes.io/master", + "operator": "==", + "value": "true", + "id": "master_node_pool" + }, + { + "label": "node-role.kubernetes.io/compute", + "operator": "==", + "value": "true", + "id": "agent_node_pool" + }, + { + "label": "node-role.kubernetes.io/infra", + "operator": "==", + "value": "true", + "id": "agent_node_pool" + }, { "label": "kubernetes.io/role", "operator": "==", @@ -161,14 +246,16 @@ "value": "agent", "id": "agent_node_pool" } - ] + ], + "default_parent_monitor_id": "agent_node_pool" }, { "monitor_id": "master_node_pool", "aggregation_algorithm": "percentage", "aggregation_algorithm_params": { "critical_threshold": 80.0, - "warning_threshold": 90.0 + "warning_threshold": 90.0, + "state_threshold": 80.0 }, "parent_monitor_id": "all_nodes", "labels": [ diff --git a/installer/conf/healthmonitorconfig.json b/installer/conf/healthmonitorconfig.json index 28d562652..ea6b23856 100644 --- 
a/installer/conf/healthmonitorconfig.json +++ b/installer/conf/healthmonitorconfig.json @@ -2,30 +2,41 @@ "node_cpu_utilization": { "WarnThresholdPercentage": 80.0, "FailThresholdPercentage": 90.0, - "ConsecutiveSamplesForStateTransition": 3 + "ConsecutiveSamplesForStateTransition": 3, + "Operator": ">" }, "node_memory_utilization": { "WarnThresholdPercentage": 80.0, "FailThresholdPercentage": 90.0, - "ConsecutiveSamplesForStateTransition": 3 + "ConsecutiveSamplesForStateTransition": 3, + "Operator": ">" }, "container_cpu_utilization": { "WarnThresholdPercentage": 80.0, "FailThresholdPercentage": 90.0, - "ConsecutiveSamplesForStateTransition": 3 + "StateThresholdPercentage": 90.0, + "ConsecutiveSamplesForStateTransition": 3, + "Operator": ">" }, "container_memory_utilization": { "WarnThresholdPercentage": 80.0, "FailThresholdPercentage": 90.0, - "ConsecutiveSamplesForStateTransition": 3 + "StateThresholdPercentage": 90.0, + "ConsecutiveSamplesForStateTransition": 3, + "Operator": ">" }, "user_workload_pods_ready": { - "WarnThresholdPercentage": 0.0, - "FailThresholdPercentage": 10.0, - "ConsecutiveSamplesForStateTransition": 2 + "WarnThresholdPercentage": 100.0, + "FailThresholdPercentage": 90.0, + "ConsecutiveSamplesForStateTransition": 2, + "Operator": "<" }, "system_workload_pods_ready": { - "FailThresholdPercentage": 0.0, - "ConsecutiveSamplesForStateTransition": 2 + "FailThresholdPercentage": 100.0, + "ConsecutiveSamplesForStateTransition": 2, + "Operator": "<" + }, + "node_condition": { + "NodeConditionTypesForFailedState": "outofdisk,networkunavailable" } } \ No newline at end of file diff --git a/installer/conf/kube.conf b/installer/conf/kube.conf index 8e1f6ae88..40f4ac880 100644 --- a/installer/conf/kube.conf +++ b/installer/conf/kube.conf @@ -1,7 +1,7 @@ # Fluentd config file for OMS Docker - cluster components (kubeAPI) type forward - port 25227 + port "#{ENV['HEALTHMODEL_REPLICASET_SERVICE_SERVICE_PORT']}" bind 0.0.0.0 @@ -234,14 +234,17 @@ 
max_retry_wait 9m - - type out_oms_api + + type out_oms log_level debug - buffer_chunk_limit 10m + num_threads 5 + buffer_chunk_limit 20m buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_api_KubeHealth*.buffer - buffer_queue_limit 10 + buffer_path %STATE_DIR_WS%/out_oms_kubehealth*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk flush_interval 20s retry_limit 10 retry_wait 30s + max_retry_wait 9m \ No newline at end of file diff --git a/installer/datafiles/base_container.data b/installer/datafiles/base_container.data index 159550a90..981f51f4c 100644 --- a/installer/datafiles/base_container.data +++ b/installer/datafiles/base_container.data @@ -30,7 +30,6 @@ MAINTAINER: 'Microsoft Corporation' /opt/microsoft/omsagent/plugin/KubernetesApiClient.rb; source/code/plugin/KubernetesApiClient.rb; 644; root; root /etc/opt/microsoft/docker-cimprov/container.conf; installer/conf/container.conf; 644; root; root -/etc/opt/microsoft/docker-cimprov/container-health.conf; installer/conf/container-health.conf; 644; root; root /opt/microsoft/omsagent/plugin/CAdvisorMetricsAPIClient.rb; source/code/plugin/CAdvisorMetricsAPIClient.rb; 644; root; root /opt/microsoft/omsagent/plugin/in_kube_perf.rb; source/code/plugin/in_kube_perf.rb; 644; root; root @@ -119,18 +118,20 @@ MAINTAINER: 'Microsoft Corporation' /opt/ConfigParseErrorLogger.rb; installer/scripts/ConfigParseErrorLogger.rb; 755; root; root - +/opt/microsoft/omsagent/plugin/filter_cadvisor_health_container.rb; source/code/plugin/filter_cadvisor_health_container.rb; 644; root; root /opt/microsoft/omsagent/plugin/filter_cadvisor_health_node.rb; source/code/plugin/filter_cadvisor_health_node.rb; 644; root; root /opt/microsoft/omsagent/plugin/filter_health_model_builder.rb; source/code/plugin/filter_health_model_builder.rb; 644; root; root /opt/microsoft/omsagent/plugin/in_kube_health.rb; source/code/plugin/in_kube_health.rb; 644; root; root +/opt/microsoft/omsagent/plugin/out_health_forward.rb; 
source/code/plugin/out_health_forward.rb; 644; root; root /etc/opt/microsoft/docker-cimprov/health/healthmonitorconfig.json; installer/conf/healthmonitorconfig.json; 644; root; root /etc/opt/microsoft/docker-cimprov/health/health_model_definition.json; installer/conf/health_model_definition.json; 644; root; root - /opt/microsoft/omsagent/plugin/health/aggregate_monitor.rb; source/code/plugin/health/aggregate_monitor.rb; 644; root; root -/opt/microsoft/omsagent/plugin/health/agg_monitor_id_labels.rb; source/code/plugin/health/agg_monitor_id_labels.rb; 644; root; root +/opt/microsoft/omsagent/plugin/health/agg_monitor_id_labels.rb; source/code/plugin/health/agg_monitor_id_labels.rb; 644; root; root /opt/microsoft/omsagent/plugin/health/aggregate_monitor_state_finalizer.rb; source/code/plugin/health/aggregate_monitor_state_finalizer.rb; 644; root; root /opt/microsoft/omsagent/plugin/health/cluster_health_state.rb; source/code/plugin/health/cluster_health_state.rb; 644; root; root +/opt/microsoft/omsagent/plugin/health/health_container_cpu_memory_aggregator.rb; source/code/plugin/health/health_container_cpu_memory_aggregator.rb; 644; root; root +/opt/microsoft/omsagent/plugin/health/health_container_cpu_memory_record_formatter.rb; source/code/plugin/health/health_container_cpu_memory_record_formatter.rb; 644; root; root /opt/microsoft/omsagent/plugin/health/health_hierarchy_builder.rb; source/code/plugin/health/health_hierarchy_builder.rb; 644; root; root /opt/microsoft/omsagent/plugin/health/health_kubernetes_resources.rb; source/code/plugin/health/health_kubernetes_resources.rb; 644; root; root /opt/microsoft/omsagent/plugin/health/health_kube_api_down_handler.rb; source/code/plugin/health/health_kube_api_down_handler.rb; 644; root; root @@ -259,9 +260,6 @@ chown omsagent:omiusers /var/opt/microsoft/docker-cimprov/log/fluent_forward_fai mv /etc/opt/microsoft/docker-cimprov/container.conf /etc/opt/microsoft/omsagent/sysconf/omsagent.d/container.conf chown 
omsagent:omsagent /etc/opt/microsoft/omsagent/sysconf/omsagent.d/container.conf -mv /etc/opt/microsoft/docker-cimprov/container-health.conf /etc/opt/microsoft/omsagent/sysconf/omsagent.d/container-health.conf -chown omsagent:omsagent /etc/opt/microsoft/omsagent/sysconf/omsagent.d/container-health.conf - %Postuninstall_10 # If we're an upgrade, skip all of this cleanup if ${{PERFORMING_UPGRADE_NOT}}; then @@ -273,7 +271,6 @@ if ${{PERFORMING_UPGRADE_NOT}}; then rm -f /var/opt/microsoft/docker-cimprov/log/kubernetes_client_log.txt rm -f /var/opt/microsoft/docker-cimprov/log/kubernetes_perf_log.txt rm -f /etc/opt/microsoft/omsagent/conf/omsagent.d/container.conf - rm -f /etc/opt/microsoft/omsagent/conf/omsagent.d/container-health.conf rmdir /var/opt/microsoft/docker-cimprov/log 2> /dev/null rmdir /var/opt/microsoft/docker-cimprov/state/ContainerInventory 2> /dev/null rmdir /var/opt/microsoft/docker-cimprov/state/ImageInventory 2> /dev/null diff --git a/installer/scripts/tomlparser.rb b/installer/scripts/tomlparser.rb index 523f8c307..cd16cbf9b 100644 --- a/installer/scripts/tomlparser.rb +++ b/installer/scripts/tomlparser.rb @@ -2,10 +2,8 @@ require_relative "tomlrb" require_relative "ConfigParseErrorLogger" -require "json" -@log_settings_config_map_mount_path = "/etc/config/settings/log-data-collection-settings" -@agent_settings_config_map_mount_path = "/etc/config/settings/agent-settings" +@configMapMountPath = "/etc/config/settings/log-data-collection-settings" @configVersion = "" @configSchemaVersion = "" # Setting default values which will be used in case they are not set in the configmap or if configmap doesnt exist @@ -17,19 +15,18 @@ @logTailPath = "/var/log/containers/*.log" @logExclusionRegexPattern = "(^((?!stdout|stderr).)*$)" @excludePath = "*.csv2" #some invalid path -@enable_health_model = false # Use parser to parse the configmap toml file to a ruby structure -def parseConfigMap(path) +def parseConfigMap begin # Check to see if config map is created - 
if (File.file?(path)) - puts "config::configmap container-azm-ms-agentconfig for settings mounted, parsing values from #{path}" - parsedConfig = Tomlrb.load_file(path, symbolize_keys: true) - puts "config::Successfully parsed mounted config map from #{path}" + if (File.file?(@configMapMountPath)) + puts "config::configmap container-azm-ms-agentconfig for settings mounted, parsing values" + parsedConfig = Tomlrb.load_file(@configMapMountPath, symbolize_keys: true) + puts "config::Successfully parsed mounted config map" return parsedConfig else - puts "config::configmap container-azm-ms-agentconfig for settings not mounted, using defaults for #{path}" + puts "config::configmap container-azm-ms-agentconfig for settings not mounted, using defaults" @excludePath = "*_kube-system_*.log" return nil end @@ -121,40 +118,19 @@ def populateSettingValuesFromConfigMap(parsedConfig) ConfigParseErrorLogger.logError("Exception while reading config map settings for cluster level environment variable collection - #{errorStr}, using defaults, please check config map for errors") end end - - begin - if !parsedConfig.nil? && !parsedConfig[:agent_settings].nil? && !parsedConfig[:agent_settings][:health_model].nil? && !parsedConfig[:agent_settings][:health_model][:enabled].nil? - @enable_health_model = parsedConfig[:agent_settings][:health_model][:enabled] - else - @enable_health_model = false - end - puts "enable_health_model = #{@enable_health_model}" - rescue => errorStr - ConfigParseErrorLogger.logError("Exception while reading config map settings for health_model enabled setting - #{errorStr}, using defaults, please check config map for errors") - @enable_health_model = false - end end @configSchemaVersion = ENV["AZMON_AGENT_CFG_SCHEMA_VERSION"] puts "****************Start Config Processing********************" - if !@configSchemaVersion.nil? && !@configSchemaVersion.empty? 
&& @configSchemaVersion.strip.casecmp("v1") == 0 #note v1 is the only supported schema version , so hardcoding it - configMapSettings = {} - - #iterate over every *settings file and build a hash of settings - Dir["/etc/config/settings/*settings"].each { |file| - puts "Parsing File #{file}" - settings = parseConfigMap(file) - if !settings.nil? - configMapSettings = configMapSettings.merge(settings) - end - } - + configMapSettings = parseConfigMap if !configMapSettings.nil? populateSettingValuesFromConfigMap(configMapSettings) end else - ConfigParseErrorLogger.logError("config::unsupported/missing config schema version - '#{@configSchemaVersion}' , using defaults, please use supported schema version") + if (File.file?(@configMapMountPath)) + ConfigParseErrorLogger.logError("config::unsupported/missing config schema version - '#{@configSchemaVersion}' , using defaults, please use supported schema version") + end @excludePath = "*_kube-system_*.log" end @@ -180,8 +156,6 @@ def populateSettingValuesFromConfigMap(parsedConfig) file.write("export AZMON_STDERR_EXCLUDED_NAMESPACES=#{@stderrExcludeNamespaces}\n") file.write("export AZMON_CLUSTER_COLLECT_ENV_VAR=#{@collectClusterEnvVariables}\n") file.write("export AZMON_CLUSTER_LOG_TAIL_EXCLUDE_PATH=#{@excludePath}\n") - #health_model settings - file.write("export AZMON_CLUSTER_ENABLE_HEALTH_MODEL=#{@enable_health_model}\n") # Close file after writing all environment variables file.close puts "Both stdout & stderr log collection are turned off for namespaces: '#{@excludePath}' " @@ -189,4 +163,4 @@ def populateSettingValuesFromConfigMap(parsedConfig) else puts "Exception while opening file for writing config environment variables" puts "****************End Config Processing********************" -end +end \ No newline at end of file diff --git a/source/code/plugin/filter_cadvisor_health_container.rb b/source/code/plugin/filter_cadvisor_health_container.rb index 4090092a9..2eccd125f 100644 --- 
a/source/code/plugin/filter_cadvisor_health_container.rb +++ b/source/code/plugin/filter_cadvisor_health_container.rb @@ -5,66 +5,57 @@ module Fluent require 'logger' require 'json' require_relative 'oms_common' - require_relative 'HealthMonitorUtils' - require_relative 'HealthMonitorState' require_relative "ApplicationInsightsUtility" + Dir[File.join(__dir__, './health', '*.rb')].each { |file| require file } class CAdvisor2ContainerHealthFilter < Filter + include HealthModel Fluent::Plugin.register_filter('filter_cadvisor_health_container', self) config_param :log_path, :string, :default => '/var/opt/microsoft/docker-cimprov/log/health_monitors.log' config_param :metrics_to_collect, :string, :default => 'cpuUsageNanoCores,memoryRssBytes' config_param :container_resource_refresh_interval_minutes, :integer, :default => 5 - @@object_name_k8s_node = 'K8SNode' @@object_name_k8s_container = 'K8SContainer' - @@counter_name_cpu = 'cpuusagenanocores' @@counter_name_memory_rss = 'memoryrssbytes' - @@health_monitor_config = {} - - @@hostName = (OMS::Common.get_hostname) - @@clusterName = KubernetesApiClient.getClusterName - @@clusterId = KubernetesApiClient.getClusterId - @@clusterRegion = KubernetesApiClient.getClusterRegion - @@cluster_health_model_enabled = HealthMonitorUtils.is_cluster_health_model_enabled - def initialize - super - @cpu_capacity = 0.0 - @memory_capacity = 0.0 - @last_resource_refresh = DateTime.now.to_time.to_i - @metrics_to_collect_hash = {} + begin + super + @metrics_to_collect_hash = {} + @formatter = HealthContainerCpuMemoryRecordFormatter.new + rescue => e + @log.info "Error in filter_cadvisor_health_container initialize #{e.backtrace}" + ApplicationInsightsUtility.sendExceptionTelemetry(e, {"FeatureArea" => "Health"}) + end end def configure(conf) - super - @log = HealthMonitorUtils.getLogHandle - @log.debug {'Starting filter_cadvisor2health plugin'} + begin + super + @log = HealthMonitorUtils.get_log_handle + @log.debug {'Starting 
filter_cadvisor2health plugin'} + rescue => e + @log.info "Error in filter_cadvisor_health_container configure #{e.backtrace}" + ApplicationInsightsUtility.sendExceptionTelemetry(e, {"FeatureArea" => "Health"}) + end end def start - super - @metrics_to_collect_hash = HealthMonitorUtils.build_metrics_hash(@metrics_to_collect) - @log.debug "Calling ensure_cpu_memory_capacity_set cpu_capacity #{@cpu_capacity} memory_capacity #{@memory_capacity}" - node_capacity = HealthMonitorUtils.ensure_cpu_memory_capacity_set(@@hm_log, @cpu_capacity, @memory_capacity, @@hostName) - @cpu_capacity = node_capacity[0] - @memory_capacity = node_capacity[1] - @log.info "CPU Capacity #{@cpu_capacity} Memory Capacity #{@memory_capacity}" - #HealthMonitorUtils.refresh_kubernetes_api_data(@log, @@hostName) - @@health_monitor_config = HealthMonitorUtils.getHealthMonitorConfig - ApplicationInsightsUtility.sendCustomEvent("filter_cadvisor_health Plugin Start", {}) + begin + super + @metrics_to_collect_hash = HealthMonitorUtils.build_metrics_hash(@metrics_to_collect) + ApplicationInsightsUtility.sendCustomEvent("filter_cadvisor_health_container Plugin Start", {}) + rescue => e + @log.info "Error in filter_cadvisor_health_container start #{e.backtrace}" + ApplicationInsightsUtility.sendExceptionTelemetry(e, {"FeatureArea" => "Health"}) + end end def filter_stream(tag, es) - if !@@cluster_health_model_enabled - @log.info "Cluster Health Model disabled in filter_cadvisor_health_container" - return [] - end new_es = MultiEventStream.new - #HealthMonitorUtils.refresh_kubernetes_api_data(@log, @hostName) records_count = 0 es.each { |time, record| begin @@ -74,10 +65,11 @@ def filter_stream(tag, es) records_count += 1 end rescue => e - router.emit_error_event(tag, time, record, e) + @log.info "Error in filter_cadvisor_health_container filter_stream #{e.backtrace}" + ApplicationInsightsUtility.sendExceptionTelemetry(e, {"FeatureArea" => "Health"}) end } - @log.debug "Filter Records Count 
#{records_count}" + @log.debug "filter_cadvisor_health_container Records Count #{records_count}" new_es end @@ -88,176 +80,19 @@ def filter(tag, time, record) end object_name = record['DataItems'][0]['ObjectName'] counter_name = record['DataItems'][0]['Collections'][0]['CounterName'].downcase - if @metrics_to_collect_hash.key?(counter_name.downcase) - metric_value = record['DataItems'][0]['Collections'][0]['Value'] - case object_name - when @@object_name_k8s_container - case counter_name.downcase - when @@counter_name_cpu - # @log.debug "Object Name #{object_name}" - # @log.debug "Counter Name #{counter_name}" - # @log.debug "Metric Value #{metric_value}" - #return process_container_cpu_record(record, metric_value) - when @@counter_name_memory_rss - #return process_container_memory_record(record, metric_value) - end - when @@object_name_k8s_node - case counter_name.downcase - when @@counter_name_cpu - #process_node_cpu_record(record, metric_value) - when @@counter_name_memory_rss - #process_node_memory_record(record, metric_value) - end + if @metrics_to_collect_hash.key?(counter_name) + if object_name == @@object_name_k8s_container + return @formatter.get_record_from_cadvisor_record(record) end end + return nil rescue => e @log.debug "Error in filter #{e}" @log.debug "record #{record}" @log.debug "backtrace #{e.backtrace}" - ApplicationInsightsUtility.sendExceptionTelemetry(e) - return nil - end - end - - def process_container_cpu_record(record, metric_value) - monitor_id = HealthMonitorConstants::WORKLOAD_CONTAINER_CPU_PERCENTAGE_MONITOR_ID - @log.debug "processing container cpu record" - if record.nil? - return nil - else - instance_name = record['DataItems'][0]['InstanceName'] - key = HealthMonitorUtils.getContainerKeyFromInstanceName(instance_name) - container_metadata = HealthMonitorUtils.getContainerMetadata(key) - if !container_metadata.nil? - cpu_limit = container_metadata['cpuLimit'] - end - - if cpu_limit.to_s.empty? 
- #@log.info "CPU Limit is nil" - cpu_limit = @cpu_capacity - end - - #@log.info "cpu limit #{cpu_limit}" - - percent = (metric_value.to_f/cpu_limit*100).round(2) - #@log.debug "Container #{key} | Percentage of CPU limit: #{percent}" - state = HealthMonitorState.computeHealthMonitorState(@log, monitor_id, percent, @@health_monitor_config[HealthMonitorConstants::WORKLOAD_CONTAINER_CPU_PERCENTAGE_MONITOR_ID]) - #@log.debug "Computed State : #{state}" - timestamp = record['DataItems'][0]['Timestamp'] - health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"cpuUsageMillicores" => metric_value/1000000.to_f, "cpuUtilizationPercentage" => percent}} - #@log.info health_monitor_record - - monitor_instance_id = HealthMonitorUtils.get_monitor_instance_id(@log, monitor_id, [@@clusterId, @@hostName, key]) - #@log.info "Monitor Instance Id: #{monitor_instance_id}" - temp = record.nil? ? "Nil" : record["MonitorInstanceId"] - @log.info "Processed Container CPU #{temp}" - return record - end - return nil - end - - def process_container_memory_record(record, metric_value) - monitor_id = HealthMonitorConstants::WORKLOAD_CONTAINER_MEMORY_PERCENTAGE_MONITOR_ID - #@log.debug "processing container memory record" - if record.nil? + ApplicationInsightsUtility.sendExceptionTelemetry(e, {"FeatureArea" => "Health"}) return nil - else - instance_name = record['DataItems'][0]['InstanceName'] - key = HealthMonitorUtils.getContainerKeyFromInstanceName(instance_name) - container_metadata = HealthMonitorUtils.getContainerMetadata(key) - if !container_metadata.nil? - memory_limit = container_metadata['memoryLimit'] - end - - if memory_limit.to_s.empty? 
- #@log.info "Memory Limit is nil" - memory_limit = @memory_capacity - end - - #@log.info "memory limit #{memory_limit}" - - percent = (metric_value.to_f/memory_limit*100).round(2) - #@log.debug "Container #{key} | Percentage of Memory limit: #{percent}" - state = HealthMonitorState.computeHealthMonitorState(@log, monitor_id, percent, @@health_monitor_config[HealthMonitorConstants::WORKLOAD_CONTAINER_MEMORY_PERCENTAGE_MONITOR_ID]) - #@log.debug "Computed State : #{state}" - timestamp = record['DataItems'][0]['Timestamp'] - health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"memoryRssBytes" => metric_value.to_f, "memoryUtilizationPercentage" => percent}} - #@log.info health_monitor_record - - monitor_instance_id = HealthMonitorUtils.get_monitor_instance_id(@log, monitor_id, [@@clusterId, @@hostName, key]) - #@log.info "Monitor Instance Id: #{monitor_instance_id}" - temp = record.nil? ? "Nil" : record["MonitorInstanceId"] - @log.info "Processed Container Memory #{temp}" - return record - end - return nil - end - - def process_node_cpu_record(record, metric_value) - monitor_id = HealthMonitorConstants::NODE_CPU_MONITOR_ID - #@log.debug "processing node cpu record" - if record.nil? 
- return nil - else - instance_name = record['DataItems'][0]['InstanceName'] - #@log.info "CPU capacity #{@cpu_capacity}" - - percent = (metric_value.to_f/@cpu_capacity*100).round(2) - #@log.debug "Percentage of CPU limit: #{percent}" - state = HealthMonitorState.computeHealthMonitorState(@log, monitor_id, percent, @@health_monitor_config[HealthMonitorConstants::NODE_CPU_MONITOR_ID]) - #@log.debug "Computed State : #{state}" - timestamp = record['DataItems'][0]['Timestamp'] - health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"cpuUsageMillicores" => metric_value/1000000.to_f, "cpuUtilizationPercentage" => percent}} - - monitor_instance_id = HealthMonitorUtils.get_monitor_instance_id(@log, monitor_id, [@@clusterId, @@hostName]) - # record = HealthMonitorSignalReducer.reduceSignal(@log, monitor_id, monitor_instance_id, @@health_monitor_config[monitor_id], node_name: @@hostName) - # temp = record.nil? ? "Nil" : record["MonitorInstanceId"] - health_record = {} - time_now = Time.now.utc.iso8601 - health_record[HealthMonitorRecordFields::MONITOR_ID] = monitor_id - health_record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] = monitor_instance_id - health_record[HealthMonitorRecordFields::DETAILS] = health_monitor_record - health_record[HealthMonitorRecordFields::AGENT_COLLECTION_TIME] = time_now - health_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_now - health_record[HealthMonitorRecordFields::NODE_NAME] = @@hostName - @log.info "Processed Node CPU" - return health_record - end - return nil - end - - def process_node_memory_record(record, metric_value) - monitor_id = HealthMonitorConstants::NODE_MEMORY_MONITOR_ID - #@log.debug "processing node memory record" - if record.nil? 
- return nil - else - instance_name = record['DataItems'][0]['InstanceName'] - #@log.info "Memory capacity #{@memory_capacity}" - - percent = (metric_value.to_f/@memory_capacity*100).round(2) - #@log.debug "Percentage of Memory limit: #{percent}" - state = HealthMonitorState.computeHealthMonitorState(@log, monitor_id, percent, @@health_monitor_config[HealthMonitorConstants::NODE_MEMORY_MONITOR_ID]) - #@log.debug "Computed State : #{state}" - timestamp = record['DataItems'][0]['Timestamp'] - health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"memoryRssBytes" => metric_value.to_f, "memoryUtilizationPercentage" => percent}} - #@log.info health_monitor_record - - monitor_instance_id = HealthMonitorUtils.get_monitor_instance_id(monitor_id, [@@clusterId, @@hostName]) - #@log.info "Monitor Instance Id: #{monitor_instance_id}" - # temp = record.nil? ? "Nil" : record["MonitorInstanceId"] - health_record = {} - time_now = Time.now.utc.iso8601 - health_record[HealthMonitorRecordFields::MONITOR_ID] = monitor_id - health_record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] = monitor_instance_id - health_record[HealthMonitorRecordFields::DETAILS] = health_monitor_record - health_record[HealthMonitorRecordFields::AGENT_COLLECTION_TIME] = time_now - health_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_now - health_record[HealthMonitorRecordFields::NODE_NAME] = @@hostName - @log.info "Processed Node Memory" - return health_record end - return nil end end end diff --git a/source/code/plugin/filter_cadvisor_health_node.rb b/source/code/plugin/filter_cadvisor_health_node.rb index faa574993..d2f735cd1 100644 --- a/source/code/plugin/filter_cadvisor_health_node.rb +++ b/source/code/plugin/filter_cadvisor_health_node.rb @@ -30,13 +30,10 @@ class CAdvisor2NodeHealthFilter < Filter @@clusterName = KubernetesApiClient.getClusterName @@clusterId = KubernetesApiClient.getClusterId @@clusterRegion = KubernetesApiClient.getClusterRegion - 
@@cluster_health_model_enabled = HealthMonitorUtils.is_cluster_health_model_enabled def initialize begin super - @cpu_capacity = 0.0 - @memory_capacity = 0.0 @last_resource_refresh = DateTime.now.to_time.to_i @metrics_to_collect_hash = {} @resources = HealthKubernetesResources.instance # this doesnt require node and pod inventory. So no need to populate them @@ -59,6 +56,8 @@ def configure(conf) def start begin super + @cpu_capacity = 1.0 #avoid divide by zero error in case of network issues accessing kube-api + @memory_capacity = 1.0 @metrics_to_collect_hash = HealthMonitorUtils.build_metrics_hash(@metrics_to_collect) @log.debug "Calling ensure_cpu_memory_capacity_set cpu_capacity #{@cpu_capacity} memory_capacity #{@memory_capacity}" node_capacity = HealthMonitorUtils.ensure_cpu_memory_capacity_set(@@hm_log, @cpu_capacity, @memory_capacity, @@hostName) @@ -73,27 +72,26 @@ def start end def filter_stream(tag, es) - if !@@cluster_health_model_enabled - @log.info "Cluster Health Model disabled in filter_cadvisor_health_node" - return MultiEventStream.new - end - new_es = MultiEventStream.new - #HealthMonitorUtils.refresh_kubernetes_api_data(@log, @hostName) - records_count = 0 - es.each { |time, record| - begin + begin + node_capacity = HealthMonitorUtils.ensure_cpu_memory_capacity_set(@@hm_log, @cpu_capacity, @memory_capacity, @@hostName) + @cpu_capacity = node_capacity[0] + @memory_capacity = node_capacity[1] + new_es = MultiEventStream.new + records_count = 0 + es.each { |time, record| filtered_record = filter(tag, time, record) if !filtered_record.nil? 
new_es.add(time, filtered_record) records_count += 1 end - rescue => e - @log.info "Error in filter_stream for filter_cadvisor_health_node #{e.message}" + } + @log.debug "Filter Records Count #{records_count}" + return new_es + rescue => e + @log.info "Error in filter_cadvisor_health_node filter_stream #{e.backtrace}" ApplicationInsightsUtility.sendExceptionTelemetry(e, {"FeatureArea" => "Health"}) - end - } - @log.debug "Filter Records Count #{records_count}" - new_es + return MultiEventStream.new + end end def filter(tag, time, record) @@ -101,21 +99,12 @@ def filter(tag, time, record) if record.key?("MonitorLabels") return record end + object_name = record['DataItems'][0]['ObjectName'] counter_name = record['DataItems'][0]['Collections'][0]['CounterName'].downcase if @metrics_to_collect_hash.key?(counter_name.downcase) metric_value = record['DataItems'][0]['Collections'][0]['Value'] case object_name - when @@object_name_k8s_container - case counter_name.downcase - when @@counter_name_cpu - # @log.debug "Object Name #{object_name}" - # @log.debug "Counter Name #{counter_name}" - # @log.debug "Metric Value #{metric_value}" - #return process_container_cpu_record(record, metric_value) - when @@counter_name_memory_rss - #return process_container_memory_record(record, metric_value) - end when @@object_name_k8s_node case counter_name.downcase when @@counter_name_cpu @@ -134,82 +123,8 @@ def filter(tag, time, record) end end - def process_container_cpu_record(record, metric_value) - monitor_id = HealthMonitorConstants::CONTAINER_CPU_MONITOR_ID - @log.debug "processing container cpu record" - if record.nil? - return nil - else - instance_name = record['DataItems'][0]['InstanceName'] - key = HealthMonitorUtils.getContainerKeyFromInstanceName(instance_name) - container_metadata = HealthMonitorUtils.getContainerMetadata(key) - if !container_metadata.nil? - cpu_limit = container_metadata['cpuLimit'] - end - - if cpu_limit.to_s.empty? 
- #@log.info "CPU Limit is nil" - cpu_limit = @cpu_capacity - end - - #@log.info "cpu limit #{cpu_limit}" - - percent = (metric_value.to_f/cpu_limit*100).round(2) - #@log.debug "Container #{key} | Percentage of CPU limit: #{percent}" - state = HealthMonitorUtils.compute_percentage_state(percent, @provider.get_config(monitor_id)) - #@log.debug "Computed State : #{state}" - timestamp = record['DataItems'][0]['Timestamp'] - health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"cpuUsageMillicores" => metric_value/1000000.to_f, "cpuUtilizationPercentage" => percent}} - #@log.info health_monitor_record - - monitor_instance_id = HealthMonitorUtils.get_monitor_instance_id(monitor_id, [@@clusterId, @@hostName, key]) - #@log.info "Monitor Instance Id: #{monitor_instance_id}" - temp = record.nil? ? "Nil" : record["MonitorInstanceId"] - @log.info "Processed Container CPU #{temp}" - return record - end - return nil - end - - def process_container_memory_record(record, metric_value) - monitor_id = HealthMonitorConstants::CONTAINER_MEMORY_MONITOR_ID - #@log.debug "processing container memory record" - if record.nil? - return nil - else - instance_name = record['DataItems'][0]['InstanceName'] - key = HealthMonitorUtils.getContainerKeyFromInstanceName(instance_name) - container_metadata = HealthMonitorUtils.getContainerMetadata(key) - if !container_metadata.nil? - memory_limit = container_metadata['memoryLimit'] - end - - if memory_limit.to_s.empty? 
- #@log.info "Memory Limit is nil" - memory_limit = @memory_capacity - end - - #@log.info "memory limit #{memory_limit}" - - percent = (metric_value.to_f/memory_limit*100).round(2) - #@log.debug "Container #{key} | Percentage of Memory limit: #{percent}" - state = HealthMonitorUtils.compute_percentage_state(percent, @provider.get_config(HealthMonitorConstants::CONTAINER_MEMORY_MONITOR_ID)) - #@log.debug "Computed State : #{state}" - timestamp = record['DataItems'][0]['Timestamp'] - health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"memoryRssBytes" => metric_value.to_f, "memoryUtilizationPercentage" => percent}} - #@log.info health_monitor_record - - monitor_instance_id = HealthMonitorUtils.get_monitor_instance_id(monitor_id, [@@clusterId, @@hostName, key]) - #@log.info "Monitor Instance Id: #{monitor_instance_id}" - temp = record.nil? ? "Nil" : record["MonitorInstanceId"] - @log.info "Processed Container Memory #{temp}" - return record - end - return nil - end - def process_node_cpu_record(record, metric_value) - monitor_id = HealthMonitorConstants::NODE_CPU_MONITOR_ID + monitor_id = MonitorId::NODE_CPU_MONITOR_ID #@log.debug "processing node cpu record" if record.nil? 
return nil @@ -219,7 +134,7 @@ def process_node_cpu_record(record, metric_value) percent = (metric_value.to_f/@cpu_capacity*100).round(2) #@log.debug "Percentage of CPU limit: #{percent}" - state = HealthMonitorUtils.compute_percentage_state(percent, @provider.get_config(HealthMonitorConstants::NODE_CPU_MONITOR_ID)) + state = HealthMonitorUtils.compute_percentage_state(percent, @provider.get_config(MonitorId::NODE_CPU_MONITOR_ID)) #@log.debug "Computed State : #{state}" timestamp = record['DataItems'][0]['Timestamp'] health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"cpuUsageMillicores" => metric_value/1000000.to_f, "cpuUtilizationPercentage" => percent}} @@ -231,7 +146,7 @@ def process_node_cpu_record(record, metric_value) health_record[HealthMonitorRecordFields::MONITOR_ID] = monitor_id health_record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] = monitor_instance_id health_record[HealthMonitorRecordFields::DETAILS] = health_monitor_record - health_record[HealthMonitorRecordFields::AGENT_COLLECTION_TIME] = time_now + health_record[HealthMonitorRecordFields::TIME_GENERATED] = time_now health_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_now health_record[HealthMonitorRecordFields::NODE_NAME] = @@hostName @log.info "Processed Node CPU" @@ -241,7 +156,7 @@ def process_node_cpu_record(record, metric_value) end def process_node_memory_record(record, metric_value) - monitor_id = HealthMonitorConstants::NODE_MEMORY_MONITOR_ID + monitor_id = MonitorId::NODE_MEMORY_MONITOR_ID #@log.debug "processing node memory record" if record.nil? 
return nil @@ -251,7 +166,7 @@ def process_node_memory_record(record, metric_value) percent = (metric_value.to_f/@memory_capacity*100).round(2) #@log.debug "Percentage of Memory limit: #{percent}" - state = HealthMonitorUtils.compute_percentage_state(percent, @provider.get_config(HealthMonitorConstants::NODE_MEMORY_MONITOR_ID)) + state = HealthMonitorUtils.compute_percentage_state(percent, @provider.get_config(MonitorId::NODE_MEMORY_MONITOR_ID)) #@log.debug "Computed State : #{state}" timestamp = record['DataItems'][0]['Timestamp'] health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"memoryRssBytes" => metric_value.to_f, "memoryUtilizationPercentage" => percent}} @@ -263,7 +178,7 @@ def process_node_memory_record(record, metric_value) health_record[HealthMonitorRecordFields::MONITOR_ID] = monitor_id health_record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] = monitor_instance_id health_record[HealthMonitorRecordFields::DETAILS] = health_monitor_record - health_record[HealthMonitorRecordFields::AGENT_COLLECTION_TIME] = time_now + health_record[HealthMonitorRecordFields::TIME_GENERATED] = time_now health_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_now health_record[HealthMonitorRecordFields::NODE_NAME] = @@hostName @log.info "Processed Node Memory" diff --git a/source/code/plugin/filter_health_model_builder.rb b/source/code/plugin/filter_health_model_builder.rb index 39452cb7e..5aa7f610e 100644 --- a/source/code/plugin/filter_health_model_builder.rb +++ b/source/code/plugin/filter_health_model_builder.rb @@ -19,11 +19,10 @@ class FilterHealthModelBuilder < Filter attr_reader :buffer, :model_builder, :health_model_definition, :monitor_factory, :state_finalizers, :monitor_set, :model_builder, :hierarchy_builder, :resources, :kube_api_down_handler, :provider, :reducer, :state, :generator include HealthModel - @@rewrite_tag = 'oms.api.KubeHealth.AgentCollectionTime' + @@rewrite_tag = 'kubehealth.Signals' @@cluster_id = 
KubernetesApiClient.getClusterId @@token_file_path = "/var/run/secrets/kubernetes.io/serviceaccount/token" @@cert_file_path = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" - @@cluster_health_model_enabled = HealthMonitorUtils.is_cluster_health_model_enabled def initialize begin @@ -49,6 +48,7 @@ def initialize @state.initialize_state(deserialized_state_info) @cluster_old_state = 'none' @cluster_new_state = 'none' + @container_cpu_memory_records = [] rescue => e ApplicationInsightsUtility.sendExceptionTelemetry(e, {"FeatureArea" => "Health"}) end @@ -77,31 +77,45 @@ def shutdown def filter_stream(tag, es) begin - if !@@cluster_health_model_enabled - @log.info "Cluster Health Model disabled in filter_health_model_builder" - return [] - end new_es = MultiEventStream.new time = Time.now - if tag.start_with?("kubehealth.DaemonSet") - records = [] + if tag.start_with?("kubehealth.DaemonSet.Node") + node_records = [] + if !es.nil? + es.each{|time, record| + node_records.push(record) + } + @buffer.add_to_buffer(node_records) + end + return MultiEventStream.new + elsif tag.start_with?("kubehealth.DaemonSet.Container") + container_records = [] if !es.nil? 
es.each{|time, record| - records.push(record) + container_records.push(record) } - @buffer.add_to_buffer(records) end - return [] + container_records_aggregator = HealthContainerCpuMemoryAggregator.new(@resources, @provider) + deduped_records = container_records_aggregator.dedupe_records(container_records) + @container_cpu_memory_records.push(*deduped_records) # push the records for aggregation later + return MultiEventStream.new elsif tag.start_with?("kubehealth.ReplicaSet") - @log.info "TAG #{tag}" records = [] es.each{|time, record| records.push(record) } @buffer.add_to_buffer(records) + + container_records_aggregator = HealthContainerCpuMemoryAggregator.new(@resources, @provider) + container_records_aggregator.aggregate(@container_cpu_memory_records) + container_records_aggregator.compute_state + aggregated_container_records = container_records_aggregator.get_records + @buffer.add_to_buffer(aggregated_container_records) + records_to_process = @buffer.get_buffer @buffer.reset_buffer + @container_cpu_memory_records = [] health_monitor_records = [] records_to_process.each do |record| @@ -117,7 +131,6 @@ def filter_stream(tag, es) @provider.get_config(monitor_id), record[HealthMonitorRecordFields::DETAILS] ) - health_monitor_records.push(health_monitor_record) #puts "#{monitor_instance_id} #{instance_state.new_state} #{instance_state.old_state} #{instance_state.should_send}" end @@ -159,6 +172,8 @@ def filter_stream(tag, es) @log.info "after Adding missing signals all_records.size #{all_records.size}" + HealthMonitorHelpers.add_agentpool_node_label_if_not_present(all_records) + # build the health model @model_builder.process_records(all_records) all_monitors = @model_builder.finalize_model @@ -185,23 +200,36 @@ def filter_stream(tag, es) @log.info "after optimizing health signals all_monitors.size #{all_monitors.size}" + current_time = Time.now + emit_time = current_time.to_f # for each key in monitor.keys, # get the state from health_monitor_state # generate the 
record to send all_monitors.keys.each{|key| record = @provider.get_record(all_monitors[key], state) - if record[HealthMonitorRecordFields::MONITOR_ID] == MonitorId::CLUSTER && all_monitors.size > 1 - old_state = record[HealthMonitorRecordFields::OLD_STATE] - new_state = record[HealthMonitorRecordFields::NEW_STATE] - if old_state != new_state && @cluster_old_state != old_state && @cluster_new_state != new_state - ApplicationInsightsUtility.sendCustomEvent("HealthModel_ClusterStateChanged",{"old_state" => old_state , "new_state" => new_state, "monitor_count" => all_monitors.size}) - @log.info "sent telemetry for cluster state change from #{record['OldState']} to #{record['NewState']}" - @cluster_old_state = old_state - @cluster_new_state = new_state + if record[HealthMonitorRecordFields::MONITOR_ID] == MonitorId::CLUSTER + if !record[HealthMonitorRecordFields::DETAILS].nil? + details = JSON.parse(record[HealthMonitorRecordFields::DETAILS]) + details[HealthMonitorRecordFields::HEALTH_MODEL_DEFINITION_VERSION] = "#{ENV['HEALTH_MODEL_DEFINITION_VERSION']}" + record[HealthMonitorRecordFields::DETAILS] = details.to_json + end + if all_monitors.size > 1 + old_state = record[HealthMonitorRecordFields::OLD_STATE] + new_state = record[HealthMonitorRecordFields::NEW_STATE] + if old_state != new_state && @cluster_old_state != old_state && @cluster_new_state != new_state + ApplicationInsightsUtility.sendCustomEvent("HealthModel_ClusterStateChanged",{"old_state" => old_state , "new_state" => new_state, "monitor_count" => all_monitors.size}) + @log.info "sent telemetry for cluster state change from #{record['OldState']} to #{record['NewState']}" + @cluster_old_state = old_state + @cluster_new_state = new_state + end end end - #@log.info "#{record["Details"]} #{record["MonitorInstanceId"]} #{record["OldState"]} #{record["NewState"]}" - new_es.add(time, record) + record_wrapper = { + "DataType" => "KUBE_HEALTH_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [record.each { 
|k, v| record[k] = v }], + } + new_es.add(emit_time, record_wrapper) } #emit the stream @@ -215,8 +243,8 @@ def filter_stream(tag, es) @cluster_health_state.update_state(@state.to_h) # return an empty event stream, else the match will throw a NoMethodError - return [] - elsif tag.start_with?("oms.api.KubeHealth.AgentCollectionTime") + return MultiEventStream.new + elsif tag.start_with?("kubehealth.Signals") # this filter also acts as a pass through as we are rewriting the tag and emitting to the fluent stream es else diff --git a/source/code/plugin/health/agg_monitor_id_labels.rb b/source/code/plugin/health/agg_monitor_id_labels.rb index 86a3381cd..bb016adb4 100644 --- a/source/code/plugin/health/agg_monitor_id_labels.rb +++ b/source/code/plugin/health/agg_monitor_id_labels.rb @@ -1,5 +1,3 @@ -require_relative 'health_model_constants' - module HealthModel class AggregateMonitorInstanceIdLabels @@id_labels_mapping = { @@ -8,12 +6,9 @@ class AggregateMonitorInstanceIdLabels MonitorId::NODE => [HealthMonitorLabels::AGENTPOOL, HealthMonitorLabels::ROLE, HealthMonitorLabels::HOSTNAME], MonitorId::NAMESPACE => [HealthMonitorLabels::NAMESPACE], MonitorId::AGENT_NODE_POOL => [HealthMonitorLabels::AGENTPOOL], - # MonitorId::ALL_AGENT_NODE_POOLS => [], - # MonitorId::ALL_NODE_POOLS => [], - # MonitorId::ALL_NODES => [], - # MonitorId::K8S_INFRASTRUCTURE => [], - # MonitorId::CLUSTER => [], - # MonitorId::WORKLOAD => [] + MonitorId::CONTAINER => [HealthMonitorLabels::NAMESPACE, HealthMonitorLabels::WORKLOAD_NAME, HealthMonitorLabels::CONTAINER], + MonitorId::CONTAINER_CPU_MONITOR_ID => [HealthMonitorLabels::NAMESPACE, HealthMonitorLabels::WORKLOAD_NAME], + MonitorId::CONTAINER_MEMORY_MONITOR_ID => [HealthMonitorLabels::NAMESPACE, HealthMonitorLabels::WORKLOAD_NAME], } def self.get_labels_for(monitor_id) diff --git a/source/code/plugin/health/health_container_cpu_memory_aggregator.rb b/source/code/plugin/health/health_container_cpu_memory_aggregator.rb new file mode 100644 
index 000000000..e98c288b3 --- /dev/null +++ b/source/code/plugin/health/health_container_cpu_memory_aggregator.rb @@ -0,0 +1,258 @@ +require_relative 'health_model_constants' +=begin + @cpu_records/@memory_records + [ + { + "namespace_workload_container_name" : { + "limit" : limit, #number + "limit_set" : limit_set, #bool + "record_count" : record_count, #number + "workload_name": workload_name, + "workload_kind": workload_kind, + "namespace" : namespace, + "container": container, + records:[ + { + "counter_value": counter_value, + "pod_name": pod_name, + "container": container, + "state" : state + }, + { + "counter_value": counter_value, + "pod_name": pod_name, + "container": container, + "state" : state + } + ] + } + } + ] +=end +module HealthModel + # this class aggregates the records at the container level + class HealthContainerCpuMemoryAggregator + + attr_reader :pod_uid_lookup, :workload_container_count, :cpu_records, :memory_records, :provider + + @@memory_counter_name = 'memoryRssBytes' + @@cpu_counter_name = 'cpuUsageNanoCores' + def initialize(resources, provider) + @pod_uid_lookup = resources.get_pod_uid_lookup + @workload_container_count = resources.get_workload_container_count + @cpu_records = {} + @memory_records = {} + @log = HealthMonitorHelpers.get_log_handle + @provider = provider + end + + def dedupe_records(container_records) + cpu_deduped_instances = {} + memory_deduped_instances = {} + container_records = container_records.select{|record| record['CounterName'] == @@memory_counter_name || record['CounterName'] == @@cpu_counter_name} + + container_records.each do |record| + begin + instance_name = record["InstanceName"] + counter_name = record["CounterName"] + case counter_name + when @@memory_counter_name + resource_instances = memory_deduped_instances + when @@cpu_counter_name + resource_instances = cpu_deduped_instances + else + @log.info "Unexpected Counter Name #{counter_name}" + next + end + if !resource_instances.key?(instance_name) + 
resource_instances[instance_name] = record + else + r = resource_instances[instance_name] + if record["Timestamp"] > r["Timestamp"] + @log.info "Dropping older record" + resource_instances[instance_name] = record + end + end + rescue => e + @log.info "Exception when deduping record #{record}" + end + end + return cpu_deduped_instances.values.concat(memory_deduped_instances.values) + end + + def aggregate(container_records) + #filter and select only cpuUsageNanoCores and memoryRssBytes + container_records = container_records.select{|record| record['CounterName'] == @@memory_counter_name || record['CounterName'] == @@cpu_counter_name} + # poduid lookup has poduid/cname --> workload_name, namespace, cpu_limit, memory limit mapping + # from the container records, extract the poduid/cname, get the values from poduid_lookup, and aggregate based on namespace_workload_cname + container_records.each do |record| + begin + instance_name = record["InstanceName"] + lookup_key = instance_name.split('/').last(2).join('/') + if !@pod_uid_lookup.key?(lookup_key) + next + end + namespace = @pod_uid_lookup[lookup_key]['namespace'] + workload_name = @pod_uid_lookup[lookup_key]['workload_name'] + cname = lookup_key.split('/')[1] + counter_name = record["CounterName"] + case counter_name + when @@memory_counter_name + resource_hash = @memory_records + resource_type = 'memory' + when @@cpu_counter_name + resource_hash = @cpu_records + resource_type = 'cpu' + else + @log.info "Unexpected Counter Name #{counter_name}" + next + end + + # this is used as a look up from the pod_uid_lookup in kubernetes_health_resources object + resource_hash_key = "#{namespace}_#{workload_name.split('~~')[1]}_#{cname}" + + # if the resource map doesnt contain the key, add limit, count and records + if !resource_hash.key?(resource_hash_key) + resource_hash[resource_hash_key] = {} + resource_hash[resource_hash_key]["limit"] = @pod_uid_lookup[lookup_key]["#{resource_type}_limit"] + 
resource_hash[resource_hash_key]["limit_set"] = @pod_uid_lookup[lookup_key]["#{resource_type}_limit_set"] + resource_hash[resource_hash_key]["record_count"] = @workload_container_count[resource_hash_key] + resource_hash[resource_hash_key]["workload_name"] = @pod_uid_lookup[lookup_key]["workload_name"] + resource_hash[resource_hash_key]["workload_kind"] = @pod_uid_lookup[lookup_key]["workload_kind"] + resource_hash[resource_hash_key]["namespace"] = @pod_uid_lookup[lookup_key]["namespace"] + resource_hash[resource_hash_key]["container"] = @pod_uid_lookup[lookup_key]["container"] + resource_hash[resource_hash_key]["records"] = [] + end + + container_instance_record = {} + + pod_name = @pod_uid_lookup[lookup_key]["pod_name"] + #append the record to the hash + # append only if the record is not a duplicate record + container_instance_record["pod_name"] = pod_name + container_instance_record["counter_value"] = record["CounterValue"] + container_instance_record["container"] = @pod_uid_lookup[lookup_key]["container"] + container_instance_record["state"] = calculate_container_instance_state( + container_instance_record["counter_value"], + resource_hash[resource_hash_key]["limit"], + @provider.get_config(MonitorId::CONTAINER_MEMORY_MONITOR_ID)) + resource_hash[resource_hash_key]["records"].push(container_instance_record) + rescue => e + @log.info "Error in HealthContainerCpuMemoryAggregator aggregate #{e.backtrace} #{e.message} #{record}" + end + end + end + + def compute_state() + # if missing records, set state to unknown + # if limits not set, set state to warning + # if all records present, sort in descending order of metric, compute index based on StateThresholdPercentage, get the state (pass/fail/warn) based on monitor state (Using [Fail/Warn]ThresholdPercentage, and set the state) + @memory_records.each{|k,v| + calculate_monitor_state(v, @provider.get_config(MonitorId::CONTAINER_MEMORY_MONITOR_ID)) + } + + @cpu_records.each{|k,v| + calculate_monitor_state(v, 
@provider.get_config(MonitorId::CONTAINER_CPU_MONITOR_ID)) + } + + @log.info "Finished computing state" + end + + def get_records + time_now = Time.now.utc.iso8601 + container_cpu_memory_records = [] + + @cpu_records.each{|resource_key, record| + health_monitor_record = { + "timestamp" => time_now, + "state" => record["state"], + "details" => { + "cpu_limit_millicores" => record["limit"]/1000000.to_f, + "cpu_usage_instances" => record["records"].map{|r| r.each {|k,v| + k == "counter_value" ? r[k] = r[k] / 1000000.to_f : r[k] + }}, + "workload_name" => record["workload_name"], + "workload_kind" => record["workload_kind"], + "namespace" => record["namespace"], + "container" => record["container"], + "limit_set" => record["limit_set"] + } + } + + monitor_instance_id = HealthMonitorHelpers.get_monitor_instance_id(MonitorId::CONTAINER_CPU_MONITOR_ID, resource_key.split('_')) #container_cpu_utilization-namespace-workload-container + + health_record = {} + health_record[HealthMonitorRecordFields::MONITOR_ID] = MonitorId::CONTAINER_CPU_MONITOR_ID + health_record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] = monitor_instance_id + health_record[HealthMonitorRecordFields::DETAILS] = health_monitor_record + health_record[HealthMonitorRecordFields::TIME_GENERATED] = time_now + health_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_now + container_cpu_memory_records.push(health_record) + } + + @memory_records.each{|resource_key, record| + health_monitor_record = { + "timestamp" => time_now, + "state" => record["state"], + "details" => { + "memory_limit_bytes" => record["limit"], + "memory_usage_instances" => record["records"], + "workload_name" => record["workload_name"], + "workload_kind" => record["workload_kind"], + "namespace" => record["namespace"], + "container" => record["container"] + } + } + + monitor_instance_id = HealthMonitorHelpers.get_monitor_instance_id(MonitorId::CONTAINER_MEMORY_MONITOR_ID, resource_key.split('_')) 
#container_cpu_utilization-namespace-workload-container + + health_record = {} + health_record[HealthMonitorRecordFields::MONITOR_ID] = MonitorId::CONTAINER_MEMORY_MONITOR_ID + health_record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] = monitor_instance_id + health_record[HealthMonitorRecordFields::DETAILS] = health_monitor_record + health_record[HealthMonitorRecordFields::TIME_GENERATED] = time_now + health_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_now + container_cpu_memory_records.push(health_record) + } + return container_cpu_memory_records + end + + private + def calculate_monitor_state(v, config) + if !v['limit_set'] && v['namespace'] != 'kube-system' + v["state"] = HealthMonitorStates::WARNING + else + # sort records by descending order of metric + v["records"] = v["records"].sort_by{|record| record["counter_value"]}.reverse + size = v["records"].size + if size < v["record_count"] + unknown_count = v["record_count"] - size + for i in unknown_count.downto(1) + # it requires a lot of computation to figure out which actual pod is not sending the signal + v["records"].insert(0, {"counter_value" => -1, "container" => v["container"], "pod_name" => "???", "state" => HealthMonitorStates::UNKNOWN }) #insert -1 for unknown records + end + end + + if size == 1 + state_index = 0 + else + state_threshold = config['StateThresholdPercentage'].to_f + count = ((state_threshold*size)/100).ceil + state_index = size - count + end + v["state"] = v["records"][state_index]["state"] + end + end + + def calculate_container_instance_state(counter_value, limit, config) + percent_value = counter_value * 100 / limit + if percent_value > config['FailThresholdPercentage'] + return HealthMonitorStates::FAIL + elsif percent_value > config['WarnThresholdPercentage'] + return HealthMonitorStates::WARN + else + return HealthMonitorStates::PASS + end + end + end +end \ No newline at end of file diff --git 
a/source/code/plugin/health/health_container_cpu_memory_record_formatter.rb b/source/code/plugin/health/health_container_cpu_memory_record_formatter.rb new file mode 100644 index 000000000..5c7db82d9 --- /dev/null +++ b/source/code/plugin/health/health_container_cpu_memory_record_formatter.rb @@ -0,0 +1,34 @@ +module HealthModel + class HealthContainerCpuMemoryRecordFormatter + + @@health_container_cpu_memory_record_template = '{ + "InstanceName": "%{instance_name}", + "CounterName" : "%{counter_name}", + "CounterValue" : %{metric_value}, + "Timestamp" : "%{timestamp}" + }' + def initialize + @log = HealthMonitorHelpers.get_log_handle + end + + def get_record_from_cadvisor_record(cadvisor_record) + begin + instance_name = cadvisor_record['DataItems'][0]['InstanceName'] + counter_name = cadvisor_record['DataItems'][0]['Collections'][0]['CounterName'] + metric_value = cadvisor_record['DataItems'][0]['Collections'][0]['Value'] + timestamp = cadvisor_record['DataItems'][0]['Timestamp'] + + health_container_cpu_memory_record = @@health_container_cpu_memory_record_template % { + instance_name: instance_name, + counter_name: counter_name, + metric_value: metric_value, + timestamp: timestamp + } + return JSON.parse(health_container_cpu_memory_record) + rescue => e + @log.info "Error in get_record_from_cadvisor_record #{e.message} #{e.backtrace}" + return nil + end + end + end +end \ No newline at end of file diff --git a/source/code/plugin/health/health_kube_api_down_handler.rb b/source/code/plugin/health/health_kube_api_down_handler.rb index 7f72360f8..a87c43ef1 100644 --- a/source/code/plugin/health/health_kube_api_down_handler.rb +++ b/source/code/plugin/health/health_kube_api_down_handler.rb @@ -2,11 +2,11 @@ module HealthModel class HealthKubeApiDownHandler def initialize - @@monitors_to_change = [HealthMonitorConstants::WORKLOAD_CPU_OVERSUBSCRIBED_MONITOR_ID, - HealthMonitorConstants::WORKLOAD_MEMORY_OVERSUBSCRIBED_MONITOR_ID, - 
HealthMonitorConstants::NODE_CONDITION_MONITOR_ID, - HealthMonitorConstants::USER_WORKLOAD_PODS_READY_MONITOR_ID, - HealthMonitorConstants::SYSTEM_WORKLOAD_PODS_READY_MONITOR_ID] + @@monitors_to_change = [MonitorId::WORKLOAD_CPU_OVERSUBSCRIBED_MONITOR_ID, + MonitorId::WORKLOAD_MEMORY_OVERSUBSCRIBED_MONITOR_ID, + MonitorId::NODE_CONDITION_MONITOR_ID, + MonitorId::USER_WORKLOAD_PODS_READY_MONITOR_ID, + MonitorId::SYSTEM_WORKLOAD_PODS_READY_MONITOR_ID] end # update kube-api dependent monitors to be 'unknown' if kube-api is down or monitor is unavailable @@ -14,7 +14,7 @@ def handle_kube_api_down(health_monitor_records) health_monitor_records_map = {} health_monitor_records.map{|record| health_monitor_records_map[record.monitor_instance_id] = record} - if !health_monitor_records_map.key?(HealthMonitorConstants::KUBE_API_STATUS) || (health_monitor_records_map.key?(HealthMonitorConstants::KUBE_API_STATUS) && health_monitor_records_map[HealthMonitorConstants::KUBE_API_STATUS].state != 'pass') + if !health_monitor_records_map.key?(MonitorId::KUBE_API_STATUS) || (health_monitor_records_map.key?(MonitorId::KUBE_API_STATUS) && health_monitor_records_map[MonitorId::KUBE_API_STATUS].state != 'pass') #iterate over the map and set the state to unknown for related monitors health_monitor_records.each{|health_monitor_record| if @@monitors_to_change.include?(health_monitor_record.monitor_id) diff --git a/source/code/plugin/health/health_kubernetes_resources.rb b/source/code/plugin/health/health_kubernetes_resources.rb index 2f591722b..30a9ac7ca 100644 --- a/source/code/plugin/health/health_kubernetes_resources.rb +++ b/source/code/plugin/health/health_kubernetes_resources.rb @@ -5,8 +5,8 @@ module HealthModel class HealthKubernetesResources include Singleton - attr_accessor :node_inventory, :pod_inventory, :deployment_inventory - attr_reader :nodes, :pods, :workloads + attr_accessor :node_inventory, :pod_inventory, :deployment_inventory, :pod_uid_lookup, :workload_container_count + 
attr_reader :nodes, :pods, :workloads, :deployment_lookup def initialize @node_inventory = [] @@ -16,6 +16,9 @@ def initialize @pods = [] @workloads = [] @log = HealthMonitorHelpers.get_log_handle + @pod_uid_lookup = {} + @deployment_lookup = {} + @workload_container_count = {} end def get_node_inventory @@ -33,71 +36,255 @@ def get_nodes return @nodes end - def get_pod_inventory - return @pod_inventory - end - - def get_pods - return @pods + def set_deployment_inventory(deployments) + @deployment_inventory = deployments + @deployment_lookup = {} end def get_workload_names - @pods = [] workload_names = {} - deployment_lookup = {} - @deployment_inventory['items'].each do |deployment| - match_labels = deployment['spec']['selector']['matchLabels'].to_h - namespace = deployment['metadata']['namespace'] - match_labels.each{|k,v| - deployment_lookup["#{namespace}-#{k}=#{v}"] = "#{deployment['metadata']['namespace']}~~#{deployment['metadata']['name']}" - } + @pod_inventory['items'].each do |pod| + workload_name = get_workload_name(pod) + workload_names[workload_name] = true if workload_name end + return workload_names.keys + end + + def build_pod_uid_lookup + @workload_container_count = {} @pod_inventory['items'].each do |pod| begin - has_owner = !pod['metadata']['ownerReferences'].nil? 
- owner_kind = '' - if has_owner - owner_kind = pod['metadata']['ownerReferences'][0]['kind'] - controller_name = pod['metadata']['ownerReferences'][0]['name'] - else - owner_kind = pod['kind'] - controller_name = pod['metadata']['name'] + namespace = pod['metadata']['namespace'] + poduid = pod['metadata']['uid'] + pod_name = pod['metadata']['name'] + workload_name = get_workload_name(pod) + workload_kind = get_workload_kind(pod) + # we don't show jobs in container health + if workload_kind.casecmp('job') == 0 + next + end + pod['spec']['containers'].each do |container| + cname = container['name'] + key = "#{poduid}/#{cname}" + cpu_limit_set = true + memory_limit_set = true + begin + cpu_limit = get_numeric_value('cpu', container['resources']['limits']['cpu']) + rescue => exception + #@log.info "Exception getting container cpu limit #{container['resources']}" + cpu_limit = get_node_capacity(pod['spec']['nodeName'], 'cpu') + cpu_limit_set = false + end + begin + memory_limit = get_numeric_value('memory', container['resources']['limits']['memory']) + rescue => exception + #@log.info "Exception getting container memory limit #{container['resources']}" + memory_limit = get_node_capacity(pod['spec']['nodeName'], 'memory') + memory_limit_set = false + end + @pod_uid_lookup[key] = {"workload_kind" => workload_kind, "workload_name" => workload_name, "namespace" => namespace, "cpu_limit" => cpu_limit, "memory_limit" => memory_limit, "cpu_limit_set" => cpu_limit_set, "memory_limit_set" => memory_limit_set, "container" => cname, "pod_name" => pod_name} + container_count_key = "#{namespace}_#{workload_name.split('~~')[1]}_#{cname}" + if !@workload_container_count.key?(container_count_key) + @workload_container_count[container_count_key] = 1 + else + count = @workload_container_count[container_count_key] + @workload_container_count[container_count_key] = count + 1 + end end + rescue => e + @log.info "Error in build_pod_uid_lookup #{pod} #{e.message}" + end + end + end - 
namespace = pod['metadata']['namespace'] + def get_pod_uid_lookup + return @pod_uid_lookup + end - workload_name = '' - if owner_kind.nil? - owner_kind = 'Pod' - end - case owner_kind.downcase - when 'job' - # we are excluding jobs - next - when 'replicaset' - # get the labels, and see if there is a match. If there is, it is the deployment. If not, use replica set name/controller name - labels = pod['metadata']['labels'].to_h - labels.each {|k,v| - lookup_key = "#{namespace}-#{k}=#{v}" - if deployment_lookup.key?(lookup_key) - workload_name = deployment_lookup[lookup_key] - break - end - } - if workload_name.empty? - workload_name = "#{namespace}~~#{controller_name}" + def get_workload_container_count + return @workload_container_count + end + + private + def get_workload_name(pod) + + if @deployment_lookup.empty? + @deployment_inventory['items'].each do |deployment| + match_labels = deployment['spec']['selector']['matchLabels'].to_h + namespace = deployment['metadata']['namespace'] + match_labels.each{|k,v| + @deployment_lookup["#{namespace}-#{k}=#{v}"] = "#{deployment['metadata']['namespace']}~~#{deployment['metadata']['name']}" + } + end + end + + begin + has_owner = !pod['metadata']['ownerReferences'].nil? + owner_kind = '' + if has_owner + owner_kind = pod['metadata']['ownerReferences'][0]['kind'] + controller_name = pod['metadata']['ownerReferences'][0]['name'] + else + owner_kind = pod['kind'] + controller_name = pod['metadata']['name'] + end + namespace = pod['metadata']['namespace'] + + workload_name = '' + if owner_kind.nil? + owner_kind = 'Pod' + end + case owner_kind.downcase + when 'job' + # we are excluding jobs + return nil + when 'replicaset' + # get the labels, and see if there is a match. If there is, it is the deployment. 
If not, use replica set name/controller name + labels = pod['metadata']['labels'].to_h + labels.each {|k,v| + lookup_key = "#{namespace}-#{k}=#{v}" + if @deployment_lookup.key?(lookup_key) + workload_name = @deployment_lookup[lookup_key] + break end - when 'daemonset' + } + if workload_name.empty? workload_name = "#{namespace}~~#{controller_name}" - else - workload_name = "#{namespace}~~#{pod['metadata']['name']}" end - rescue => e - @log.info "Error when processing pod #{pod['metadata']['name']} #{e.message}" + when 'daemonset' + workload_name = "#{namespace}~~#{controller_name}" + else + workload_name = "#{namespace}~~#{pod['metadata']['name']}" end - workload_names[workload_name] = true + return workload_name + rescue => e + @log.info "Error in get_workload_name(pod) #{e.message}" + return nil + end + end + + def get_workload_kind(pod) + if @deployment_lookup.empty? + @deployment_inventory['items'].each do |deployment| + match_labels = deployment['spec']['selector']['matchLabels'].to_h + namespace = deployment['metadata']['namespace'] + match_labels.each{|k,v| + @deployment_lookup["#{namespace}-#{k}=#{v}"] = "#{deployment['metadata']['namespace']}~~#{deployment['metadata']['name']}" + } + end + end + + begin + has_owner = !pod['metadata']['ownerReferences'].nil? + owner_kind = '' + if has_owner + owner_kind = pod['metadata']['ownerReferences'][0]['kind'] + else + owner_kind = pod['kind'] + end + + if owner_kind.nil? + owner_kind = 'Pod' + end + return owner_kind + rescue => e + @log.info "Error in get_workload_kind(pod) #{e.message}" + return nil end - return workload_names.keys end + + def get_node_capacity(node_name, type) + if node_name.nil? #unscheduled pods will not have a node name + return -1 + end + begin + @node_inventory["items"].each do |node| + if (!node["status"]["capacity"].nil?) 
&& node["metadata"]["name"].casecmp(node_name.downcase) == 0 + return get_numeric_value(type, node["status"]["capacity"][type]) + end + end + rescue => e + @log.info "Error in get_node_capacity(pod, #{type}) #{e.backtrace} #{e.message}" + return -1 + end + end + + #Cannot reuse the code from KubernetesApiClient, for unit testing reasons. KubernetesApiClient has a dependency on oms_common.rb etc. + def get_numeric_value(metricName, metricVal) + metricValue = metricVal.downcase + begin + case metricName + when "memory" #convert to bytes for memory + #https://kubernetes.io/docs/tasks/configure-pod-container/assign-memory-resource/ + if (metricValue.end_with?("ki")) + metricValue.chomp!("ki") + metricValue = Float(metricValue) * 1024.0 ** 1 + elsif (metricValue.end_with?("mi")) + metricValue.chomp!("mi") + metricValue = Float(metricValue) * 1024.0 ** 2 + elsif (metricValue.end_with?("gi")) + metricValue.chomp!("gi") + metricValue = Float(metricValue) * 1024.0 ** 3 + elsif (metricValue.end_with?("ti")) + metricValue.chomp!("ti") + metricValue = Float(metricValue) * 1024.0 ** 4 + elsif (metricValue.end_with?("pi")) + metricValue.chomp!("pi") + metricValue = Float(metricValue) * 1024.0 ** 5 + elsif (metricValue.end_with?("ei")) + metricValue.chomp!("ei") + metricValue = Float(metricValue) * 1024.0 ** 6 + elsif (metricValue.end_with?("zi")) + metricValue.chomp!("zi") + metricValue = Float(metricValue) * 1024.0 ** 7 + elsif (metricValue.end_with?("yi")) + metricValue.chomp!("yi") + metricValue = Float(metricValue) * 1024.0 ** 8 + elsif (metricValue.end_with?("k")) + metricValue.chomp!("k") + metricValue = Float(metricValue) * 1000.0 ** 1 + elsif (metricValue.end_with?("m")) + metricValue.chomp!("m") + metricValue = Float(metricValue) * 1000.0 ** 2 + elsif (metricValue.end_with?("g")) + metricValue.chomp!("g") + metricValue = Float(metricValue) * 1000.0 ** 3 + elsif (metricValue.end_with?("t")) + metricValue.chomp!("t") + metricValue = Float(metricValue) * 1000.0 ** 4 + 
elsif (metricValue.end_with?("p")) + metricValue.chomp!("p") + metricValue = Float(metricValue) * 1000.0 ** 5 + elsif (metricValue.end_with?("e")) + metricValue.chomp!("e") + metricValue = Float(metricValue) * 1000.0 ** 6 + elsif (metricValue.end_with?("z")) + metricValue.chomp!("z") + metricValue = Float(metricValue) * 1000.0 ** 7 + elsif (metricValue.end_with?("y")) + metricValue.chomp!("y") + metricValue = Float(metricValue) * 1000.0 ** 8 + else #assuming there are no units specified, it is bytes (the below conversion will fail for other unsupported 'units') + metricValue = Float(metricValue) + end + when "cpu" #convert to nanocores for cpu + #https://kubernetes.io/docs/tasks/configure-pod-container/assign-cpu-resource/ + if (metricValue.end_with?("m")) + metricValue.chomp!("m") + metricValue = Float(metricValue) * 1000.0 ** 2 + else #assuming no units specified, it is cores that we are converting to nanocores (the below conversion will fail for other unsupported 'units') + metricValue = Float(metricValue) * 1000.0 ** 3 + end + else + @Log.warn("getMetricNumericValue: Unsupported metric #{metricName}. Returning 0 for metric value") + metricValue = 0 + end #case statement + rescue => error + @Log.warn("getMetricNumericValue failed: #{error} for metric #{metricName} with value #{metricVal}. 
Returning 0 formetric value") + return 0 + end + return metricValue + end + end end \ No newline at end of file diff --git a/source/code/plugin/health/health_missing_signal_generator.rb b/source/code/plugin/health/health_missing_signal_generator.rb index 419680afa..1827a0190 100644 --- a/source/code/plugin/health/health_missing_signal_generator.rb +++ b/source/code/plugin/health/health_missing_signal_generator.rb @@ -24,14 +24,14 @@ def get_missing_signals(cluster_id, health_monitor_records, health_k8s_inventory node_signals_hash = {} nodes.each{|node| - node_signals_hash[node] = [HealthMonitorConstants::NODE_CPU_MONITOR_ID, HealthMonitorConstants::NODE_MEMORY_MONITOR_ID, HealthMonitorConstants::NODE_CONDITION_MONITOR_ID] + node_signals_hash[node] = [MonitorId::NODE_MEMORY_MONITOR_ID, MonitorId::NODE_CPU_MONITOR_ID, MonitorId::NODE_CONDITION_MONITOR_ID] } log = HealthMonitorHelpers.get_log_handle log.info "last_received_records #{@last_received_records.size} nodes #{nodes}" @last_received_records.each{|monitor_instance_id, monitor| if !health_monitor_records_map.key?(monitor_instance_id) if HealthMonitorHelpers.is_node_monitor(monitor.monitor_id) - node_name = monitor.labels['kubernetes.io/hostname'] + node_name = monitor.labels[HealthMonitorLabels::HOSTNAME] new_monitor = HealthMonitorRecord.new( monitor.monitor_id, monitor.monitor_instance_id, @@ -83,7 +83,7 @@ def get_missing_signals(cluster_id, health_monitor_records, health_k8s_inventory health_monitor_records.each{|health_monitor_record| # remove signals from the list of expected signals if we see them in the list of current signals if HealthMonitorHelpers.is_node_monitor(health_monitor_record.monitor_id) - node_name = health_monitor_record.labels['kubernetes.io/hostname'] + node_name = health_monitor_record.labels[HealthMonitorLabels::HOSTNAME] if node_signals_hash.key?(node_name) signals = node_signals_hash[node_name] signals.delete(health_monitor_record.monitor_id) @@ -111,7 +111,7 @@ def 
get_missing_signals(cluster_id, health_monitor_records, health_k8s_inventory {"timestamp" => Time.now.utc.iso8601, "state" => HealthMonitorStates::UNKNOWN, "details" => "no signal received from node #{node}"} ) missing_signals_map[monitor_instance_id] = new_monitor - log.info "Added missing signal when node_signals_hash was not empty #{new_monitor.monitor_instance_id} #{new_monitor.state}" + log.info "Added missing signal when node_signals_hash was not empty #{new_monitor.monitor_instance_id} #{new_monitor.state} #{new_monitor.labels.keys}" } } end diff --git a/source/code/plugin/health/health_model_constants.rb b/source/code/plugin/health/health_model_constants.rb index 82ae569f3..0922c7ff2 100644 --- a/source/code/plugin/health/health_model_constants.rb +++ b/source/code/plugin/health/health_model_constants.rb @@ -2,80 +2,80 @@ module HealthModel class MonitorState CRITICAL = "fail" ERROR = "err" - WARNING = "warn" - NONE = "none" HEALTHY = "pass" + NONE = "none" UNKNOWN = "unknown" + WARNING = "warn" end class AggregationAlgorithm - WORSTOF = "worstOf" PERCENTAGE = "percentage" + WORSTOF = "worstOf" end class MonitorId - CLUSTER = 'cluster'; - ALL_NODES = 'all_nodes'; - K8S_INFRASTRUCTURE = 'k8s_infrastructure' - - NODE = 'node'; AGENT_NODE_POOL = 'agent_node_pool' - MASTER_NODE_POOL = 'master_node_pool' ALL_AGENT_NODE_POOLS = 'all_agent_node_pools' - ALL_NODE_POOLS = 'all_node_pools'; - - WORKLOAD = 'all_workloads'; - CAPACITY = 'capacity'; - - USER_WORKLOAD = 'user_workload'; - SYSTEM_WORKLOAD = 'system_workload' + ALL_NODE_POOLS = 'all_node_pools' + ALL_NODES = 'all_nodes' + CAPACITY = 'capacity' + CLUSTER = 'cluster' + CONTAINER = 'container' + CONTAINER_CPU_MONITOR_ID = "container_cpu_utilization" + CONTAINER_MEMORY_MONITOR_ID = "container_memory_utilization" + K8S_INFRASTRUCTURE = 'k8s_infrastructure' + KUBE_API_STATUS = "kube_api_status" + MASTER_NODE_POOL = 'master_node_pool' NAMESPACE = 'namespace'; + NODE = 'node'; + NODE_CONDITION_MONITOR_ID = 
"node_condition" + NODE_CPU_MONITOR_ID = "node_cpu_utilization" + NODE_MEMORY_MONITOR_ID = "node_memory_utilization" + SYSTEM_WORKLOAD = 'system_workload' + SYSTEM_WORKLOAD_PODS_READY_MONITOR_ID = "system_workload_pods_ready" + USER_WORKLOAD = 'user_workload'; + USER_WORKLOAD_PODS_READY_MONITOR_ID = "user_workload_pods_ready" + WORKLOAD = 'all_workloads'; + WORKLOAD_CONTAINER_CPU_PERCENTAGE_MONITOR_ID = "container_cpu_utilization" + WORKLOAD_CONTAINER_MEMORY_PERCENTAGE_MONITOR_ID = "container_memory_utilization" + WORKLOAD_CPU_OVERSUBSCRIBED_MONITOR_ID = "subscribed_capacity_cpu" + WORKLOAD_MEMORY_OVERSUBSCRIBED_MONITOR_ID = "subscribed_capacity_memory" end class HealthMonitorRecordFields CLUSTER_ID = "ClusterId" - MONITOR_ID = "MonitorId" - MONITOR_INSTANCE_ID = "MonitorInstanceId" - MONITOR_LABELS = "MonitorLabels" DETAILS = "Details" + HEALTH_MODEL_DEFINITION_VERSION = "HealthModelDefinitionVersion" MONITOR_CONFIG = "MonitorConfig" - OLD_STATE = "OldState" + MONITOR_ID = "MonitorTypeId" + MONITOR_INSTANCE_ID = "MonitorInstanceId" + MONITOR_LABELS = "MonitorLabels" NEW_STATE = "NewState" - AGENT_COLLECTION_TIME = "AgentCollectionTime" - TIME_FIRST_OBSERVED = "TimeFirstObserved" NODE_NAME = "NodeName" - NAMESPACE = "Namespace" - end - - class HealthMonitorConstants - NODE_CPU_MONITOR_ID = "node_cpu_utilization" - NODE_MEMORY_MONITOR_ID = "node_memory_utilization" - CONTAINER_CPU_MONITOR_ID = "container_cpu_utilization" - CONTAINER_MEMORY_MONITOR_ID = "container_memory_utilization" - NODE_CONDITION_MONITOR_ID = "node_condition" - WORKLOAD_CPU_OVERSUBSCRIBED_MONITOR_ID = "subscribed_capacity_cpu" - WORKLOAD_MEMORY_OVERSUBSCRIBED_MONITOR_ID = "subscribed_capacity_memory" - WORKLOAD_CONTAINER_CPU_PERCENTAGE_MONITOR_ID = "container_cpu_utilization" - WORKLOAD_CONTAINER_MEMORY_PERCENTAGE_MONITOR_ID = "container_memory_utilization" - KUBE_API_STATUS = "kube_api_status" - USER_WORKLOAD_PODS_READY_MONITOR_ID = "user_workload_pods_ready" - 
SYSTEM_WORKLOAD_PODS_READY_MONITOR_ID = "system_workload_pods_ready" + OLD_STATE = "OldState" + PARENT_MONITOR_INSTANCE_ID = "ParentMonitorInstanceId" + TIME_FIRST_OBSERVED = "TimeFirstObserved" + TIME_GENERATED = "TimeGenerated" end class HealthMonitorStates - PASS = "pass" FAIL = "fail" - WARNING = "warn" NONE = "none" + PASS = "pass" UNKNOWN = "unknown" + WARNING = "warn" end class HealthMonitorLabels - WORKLOAD_NAME = "container.azm.ms/workload-name" - WORKLOAD_KIND = "container.azm.ms/workload-kind" - NAMESPACE = "container.azm.ms/namespace" AGENTPOOL = "agentpool" - ROLE = "kubernetes.io/role" + CONTAINER = "container.azm.ms/container" HOSTNAME = "kubernetes.io/hostname" + NAMESPACE = "container.azm.ms/namespace" + ROLE = "kubernetes.io/role" + WORKLOAD_KIND = "container.azm.ms/workload-kind" + WORKLOAD_NAME = "container.azm.ms/workload-name" + MASTERROLE = "node-role.kubernetes.io/master" + COMPUTEROLE = "node-role.kubernetes.io/compute" + INFRAROLE = "node-role.kubernetes.io/infra" end end \ No newline at end of file diff --git a/source/code/plugin/health/health_monitor_helpers.rb b/source/code/plugin/health/health_monitor_helpers.rb index 9f0315978..4efd4c608 100644 --- a/source/code/plugin/health/health_monitor_helpers.rb +++ b/source/code/plugin/health/health_monitor_helpers.rb @@ -16,11 +16,11 @@ class HealthMonitorHelpers class << self def is_node_monitor(monitor_id) - return (monitor_id == HealthMonitorConstants::NODE_CPU_MONITOR_ID || monitor_id == HealthMonitorConstants::NODE_MEMORY_MONITOR_ID || monitor_id == HealthMonitorConstants::NODE_CONDITION_MONITOR_ID) + return (monitor_id == MonitorId::NODE_CPU_MONITOR_ID || monitor_id == MonitorId::NODE_MEMORY_MONITOR_ID || monitor_id == MonitorId::NODE_CONDITION_MONITOR_ID) end def is_pods_ready_monitor(monitor_id) - return (monitor_id == HealthMonitorConstants::USER_WORKLOAD_PODS_READY_MONITOR_ID || monitor_id == HealthMonitorConstants::SYSTEM_WORKLOAD_PODS_READY_MONITOR_ID) + return (monitor_id == 
MonitorId::USER_WORKLOAD_PODS_READY_MONITOR_ID || monitor_id == MonitorId::SYSTEM_WORKLOAD_PODS_READY_MONITOR_ID) end def get_log_handle @@ -31,6 +31,44 @@ def get_monitor_instance_id(monitor_id, args = []) string_to_hash = args.join("/") return "#{monitor_id}-#{Digest::MD5.hexdigest(string_to_hash)}" end + + def add_agentpool_node_label_if_not_present(records) + records.each{|record| + # continue if it is not a node monitor + if !is_node_monitor(record.monitor_id) + #@log.info "#{record.monitor_id} is not a NODE MONITOR" + next + end + labels_keys = record.labels.keys + + if labels_keys.include?(HealthMonitorLabels::AGENTPOOL) + @log.info "#{record.monitor_id} includes agentpool label. Value = #{record.labels[HealthMonitorLabels::AGENTPOOL]}" + @log.info "Labels present = #{labels_keys}" + next + else + #@log.info "#{record} does not include agentpool label." + @log.info "Labels present = #{labels_keys}" + role_name = 'unknown' + if record.labels.include?(HealthMonitorLabels::ROLE) + role_name = record.labels[HealthMonitorLabels::ROLE] + elsif record.labels.include?(HealthMonitorLabels::MASTERROLE) + if !record.labels[HealthMonitorLabels::MASTERROLE].empty? + role_name = 'master' + end + elsif record.labels.include?(HealthMonitorLabels::COMPUTEROLE) + if !record.labels[HealthMonitorLabels::COMPUTEROLE].empty? + role_name = 'compute' + end + elsif record.labels.include?(HealthMonitorLabels::INFRAROLE) + if !record.labels[HealthMonitorLabels::INFRAROLE].empty? 
+ role_name = 'infra' + end + end + @log.info "Adding agentpool label #{role_name}_node_pool for #{record.monitor_id}" + record.labels[HealthMonitorLabels::AGENTPOOL] = "#{role_name}_node_pool" + end + } + end end end diff --git a/source/code/plugin/health/health_monitor_provider.rb b/source/code/plugin/health/health_monitor_provider.rb index 60ad69d76..e75824268 100644 --- a/source/code/plugin/health/health_monitor_provider.rb +++ b/source/code/plugin/health/health_monitor_provider.rb @@ -66,8 +66,9 @@ def get_record(health_monitor_record, health_monitor_state) monitor_record[HealthMonitorRecordFields::OLD_STATE] = old_state monitor_record[HealthMonitorRecordFields::DETAILS] = details.to_json monitor_record[HealthMonitorRecordFields::MONITOR_CONFIG] = config.to_json - monitor_record[HealthMonitorRecordFields::AGENT_COLLECTION_TIME] = Time.now.utc.iso8601 + monitor_record[HealthMonitorRecordFields::TIME_GENERATED] = Time.now.utc.iso8601 monitor_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_first_observed + monitor_record[HealthMonitorRecordFields::PARENT_MONITOR_INSTANCE_ID] = '' return monitor_record end @@ -87,17 +88,28 @@ def get_labels(health_monitor_record) } monitor_id = health_monitor_record[HealthMonitorRecordFields::MONITOR_ID] case monitor_id - when HealthMonitorConstants::CONTAINER_CPU_MONITOR_ID, HealthMonitorConstants::CONTAINER_MEMORY_MONITOR_ID, HealthMonitorConstants::USER_WORKLOAD_PODS_READY_MONITOR_ID, HealthMonitorConstants::SYSTEM_WORKLOAD_PODS_READY_MONITOR_ID + when MonitorId::CONTAINER_CPU_MONITOR_ID, MonitorId::CONTAINER_MEMORY_MONITOR_ID, MonitorId::USER_WORKLOAD_PODS_READY_MONITOR_ID, MonitorId::SYSTEM_WORKLOAD_PODS_READY_MONITOR_ID namespace = health_monitor_record[HealthMonitorRecordFields::DETAILS]['details']['namespace'] - workload_name = health_monitor_record[HealthMonitorRecordFields::DETAILS]['details']['workloadName'] - workload_kind = 
health_monitor_record[HealthMonitorRecordFields::DETAILS]['details']['workloadKind'] + workload_name = health_monitor_record[HealthMonitorRecordFields::DETAILS]['details']['workload_name'] + workload_kind = health_monitor_record[HealthMonitorRecordFields::DETAILS]['details']['workload_kind'] monitor_labels[HealthMonitorLabels::WORKLOAD_NAME] = workload_name.split('~~')[1] monitor_labels[HealthMonitorLabels::WORKLOAD_KIND] = workload_kind monitor_labels[HealthMonitorLabels::NAMESPACE] = namespace - when HealthMonitorConstants::NODE_CPU_MONITOR_ID, HealthMonitorConstants::NODE_MEMORY_MONITOR_ID, HealthMonitorConstants::NODE_CONDITION_MONITOR_ID + # add the container name for container memory/cpu + if monitor_id == MonitorId::CONTAINER_CPU_MONITOR_ID || monitor_id == MonitorId::CONTAINER_MEMORY_MONITOR_ID + container = health_monitor_record[HealthMonitorRecordFields::DETAILS]['details']['container'] + monitor_labels[HealthMonitorLabels::CONTAINER] = container + end + + #TODO: This doesn't belong here. Move this elsewhere + health_monitor_record[HealthMonitorRecordFields::DETAILS]['details'].delete('namespace') + health_monitor_record[HealthMonitorRecordFields::DETAILS]['details'].delete('workload_name') + health_monitor_record[HealthMonitorRecordFields::DETAILS]['details'].delete('workload_kind') + + when MonitorId::NODE_CPU_MONITOR_ID, MonitorId::NODE_MEMORY_MONITOR_ID, MonitorId::NODE_CONDITION_MONITOR_ID node_name = health_monitor_record[HealthMonitorRecordFields::NODE_NAME] @health_kubernetes_resources.get_node_inventory['items'].each do |node| if !node_name.nil? && !node['metadata']['name'].nil? 
&& node_name == node['metadata']['name'] diff --git a/source/code/plugin/health/health_monitor_utils.rb b/source/code/plugin/health/health_monitor_utils.rb index e707651dc..27e9b9a6e 100644 --- a/source/code/plugin/health/health_monitor_utils.rb +++ b/source/code/plugin/health/health_monitor_utils.rb @@ -36,21 +36,31 @@ def compute_percentage_state(value, config) end fail_percentage = config['FailThresholdPercentage'].to_f - if value > fail_percentage - return HealthMonitorStates::FAIL - elsif !warn_percentage.nil? && value > warn_percentage - return HealthMonitorStates::WARNING + if !config.nil? && !config['Operator'].nil? && config['Operator'] == '<' + if value < fail_percentage + return HealthMonitorStates::FAIL + elsif !warn_percentage.nil? && value < warn_percentage + return HealthMonitorStates::WARNING + else + return HealthMonitorStates::PASS + end else - return HealthMonitorStates::PASS + if value > fail_percentage + return HealthMonitorStates::FAIL + elsif !warn_percentage.nil? 
&& value > warn_percentage + return HealthMonitorStates::WARNING + else + return HealthMonitorStates::PASS + end end end def is_node_monitor(monitor_id) - return (monitor_id == HealthMonitorConstants::NODE_CPU_MONITOR_ID || monitor_id == HealthMonitorConstants::NODE_MEMORY_MONITOR_ID || monitor_id == HealthMonitorConstants::NODE_CONDITION_MONITOR_ID) + return (monitor_id == MonitorId::NODE_CPU_MONITOR_ID || monitor_id == MonitorId::NODE_MEMORY_MONITOR_ID || monitor_id == MonitorId::NODE_CONDITION_MONITOR_ID) end def is_pods_ready_monitor(monitor_id) - return (monitor_id == HealthMonitorConstants::USER_WORKLOAD_PODS_READY_MONITOR_ID || monitor_id == HealthMonitorConstants::SYSTEM_WORKLOAD_PODS_READY_MONITOR_ID) + return (monitor_id == MonitorId::USER_WORKLOAD_PODS_READY_MONITOR_ID || monitor_id == MonitorId::SYSTEM_WORKLOAD_PODS_READY_MONITOR_ID) end def is_cluster_health_model_enabled @@ -136,13 +146,23 @@ def get_pods_ready_hash(pod_inventory, deployment_inventory) return pods_ready_percentage_hash end - def get_node_state_from_node_conditions(node_conditions) + def get_node_state_from_node_conditions(monitor_config, node_conditions) pass = false + failtypes = ['outofdisk', 'networkunavailable'].to_set #default fail types + if !monitor_config.nil? && !monitor_config["NodeConditionTypesForFailedState"].nil? + failtypes = monitor_config["NodeConditionTypesForFailedState"] + if !failtypes.nil? + failtypes = failtypes.split(',').map{|x| x.downcase}.map{|x| x.gsub(" ","")}.to_set + end + end + log = get_log_handle + #log.info "Fail Types #{failtypes.inspect}" node_conditions.each do |condition| type = condition['type'] status = condition['status'] - if ((type == "NetworkUnavailable" || type == "OutOfDisk") && (status == 'True' || status == 'Unknown')) + #for each condition in the configuration, check if the type is not false. 
If yes, update state to fail + if (failtypes.include?(type.downcase) && (status == 'True' || status == 'Unknown')) return "fail" elsif ((type == "DiskPressure" || type == "MemoryPressure" || type == "PIDPressure") && (status == 'True' || status == 'Unknown')) return "warn" @@ -280,11 +300,12 @@ def get_monitor_instance_id(monitor_id, args = []) def ensure_cpu_memory_capacity_set(log, cpu_capacity, memory_capacity, hostname) log.info "ensure_cpu_memory_capacity_set cpu_capacity #{cpu_capacity} memory_capacity #{memory_capacity}" - if cpu_capacity != 0.0 && memory_capacity != 0.0 + if cpu_capacity != 1.0 && memory_capacity != 1.0 log.info "CPU And Memory Capacity are already set" return [cpu_capacity, memory_capacity] end + log.info "CPU and Memory Capacity Not set" begin @@nodeInventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo("nodes").body) rescue Exception => e diff --git a/source/code/plugin/health/health_signal_reducer.rb b/source/code/plugin/health/health_signal_reducer.rb index 1d520da8d..f92f24ac3 100644 --- a/source/code/plugin/health/health_signal_reducer.rb +++ b/source/code/plugin/health/health_signal_reducer.rb @@ -20,7 +20,6 @@ def reduce_signals(health_monitor_records, health_k8s_inventory) if reduced_signals_map.key?(monitor_instance_id) record = reduced_signals_map[monitor_instance_id] if health_monitor_record.transition_date_time > record.transition_date_time # always take the latest record for a monitor instance id - puts 'Duplicate Daemon Set signal' reduced_signals_map[monitor_instance_id] = health_monitor_record end elsif HealthMonitorHelpers.is_node_monitor(monitor_id) diff --git a/source/code/plugin/health/parent_monitor_provider.rb b/source/code/plugin/health/parent_monitor_provider.rb index 4577abb99..4ab6e6297 100644 --- a/source/code/plugin/health/parent_monitor_provider.rb +++ b/source/code/plugin/health/parent_monitor_provider.rb @@ -8,6 +8,7 @@ def initialize(definition) @health_model_definition = definition 
@parent_monitor_mapping = {} #monitorId --> parent_monitor_id mapping @parent_monitor_instance_mapping = {} #child monitor id -- > parent monitor instance mapping. Used in instances when the node no longer exists and impossible to compute from kube api results + @log = HealthMonitorHelpers.get_log_handle end # gets the parent monitor id given the state transition. It requires the monitor id and labels to determine the parent id @@ -35,14 +36,13 @@ def get_parent_monitor_id(monitor) op = "#{condition['operator']}" right = "#{condition['value']}" cond = left.send(op.to_sym, right) - if cond @parent_monitor_mapping[monitor.monitor_instance_id] = condition['parent_id'] return condition['parent_id'] end } end - raise "Conditions were not met to determine the parent monitor id" if monitor_id != MonitorId::CLUSTER + return @health_model_definition[monitor_id]['default_parent_monitor_id'] end else raise "Invalid Monitor Id #{monitor_id} in get_parent_monitor_id" @@ -81,6 +81,7 @@ def get_parent_monitor_instance_id(monitor_instance_id, parent_monitor_id, paren end parent_monitor_instance_id = "#{parent_monitor_id}-#{values.join('-')}" @parent_monitor_instance_mapping[monitor_instance_id] = parent_monitor_instance_id + @log.info "parent_monitor_instance_id for #{monitor_instance_id} => #{parent_monitor_instance_id}" return parent_monitor_instance_id end end diff --git a/source/code/plugin/health/unit_monitor.rb b/source/code/plugin/health/unit_monitor.rb index 64262aa2e..9af599321 100644 --- a/source/code/plugin/health/unit_monitor.rb +++ b/source/code/plugin/health/unit_monitor.rb @@ -1,3 +1,4 @@ +require_relative 'health_model_constants' require 'json' module HealthModel diff --git a/source/code/plugin/in_cadvisor_perf.rb b/source/code/plugin/in_cadvisor_perf.rb index ce205322d..810fb512f 100644 --- a/source/code/plugin/in_cadvisor_perf.rb +++ b/source/code/plugin/in_cadvisor_perf.rb @@ -20,7 +20,7 @@ def initialize config_param :tag, :string, :default => 
"oms.api.cadvisorperf" config_param :mdmtag, :string, :default => "mdm.cadvisorperf" config_param :nodehealthtag, :string, :default => "kubehealth.DaemonSet.Node" - #config_param :containerhealthtag, :string, :default => "kubehealth.DaemonSet.Container" + config_param :containerhealthtag, :string, :default => "kubehealth.DaemonSet.Container" def configure(conf) super @@ -54,12 +54,11 @@ def enumerate() record["DataType"] = "LINUX_PERF_BLOB" record["IPName"] = "LogManagement" eventStream.add(time, record) if record - #router.emit(@tag, time, record) if record - end + end router.emit_stream(@tag, eventStream) if eventStream router.emit_stream(@mdmtag, eventStream) if eventStream - #router.emit_stream(@containerhealthtag, eventStream) if eventStream + router.emit_stream(@containerhealthtag, eventStream) if eventStream router.emit_stream(@nodehealthtag, eventStream) if eventStream @@istestvar = ENV["ISTEST"] diff --git a/source/code/plugin/in_kube_events.rb b/source/code/plugin/in_kube_events.rb index f177b62bf..5538ba4aa 100644 --- a/source/code/plugin/in_kube_events.rb +++ b/source/code/plugin/in_kube_events.rb @@ -84,7 +84,7 @@ def enumerate(eventList = nil) else record["Computer"] = (OMS::Common.get_hostname) end - record["ClusterName"] = KubernetesApiClient.getClusterName + record['ClusterName'] = KubernetesApiClient.getClusterName record["ClusterId"] = KubernetesApiClient.getClusterId wrapper = { "DataType" => "KUBE_EVENTS_BLOB", diff --git a/source/code/plugin/in_kube_health.rb b/source/code/plugin/in_kube_health.rb index 5d29eb035..9a1b8f9a9 100644 --- a/source/code/plugin/in_kube_health.rb +++ b/source/code/plugin/in_kube_health.rb @@ -26,7 +26,6 @@ def initialize @@cluster_id = KubernetesApiClient.getClusterId @resources = HealthKubernetesResources.instance @provider = HealthMonitorProvider.new(@@cluster_id, HealthMonitorUtils.get_cluster_labels, @resources, @health_monitor_config_path) - @@cluster_health_model_enabled = 
HealthMonitorUtils.is_cluster_health_model_enabled rescue => e ApplicationInsightsUtility.sendExceptionTelemetry(e, {"FeatureArea" => "Health"}) end @@ -55,9 +54,7 @@ def start @@clusterCpuCapacity = cluster_capacity[0] @@clusterMemoryCapacity = cluster_capacity[1] @@hmlog.info "Cluster CPU Capacity: #{@@clusterCpuCapacity} Memory Capacity: #{@@clusterMemoryCapacity}" - if @@cluster_health_model_enabled - ApplicationInsightsUtility.sendCustomEvent("in_kube_health Plugin Start", {}) - end + initialize_inventory end rescue => e ApplicationInsightsUtility.sendExceptionTelemetry(e, {"FeatureArea" => "Health"}) @@ -76,10 +73,6 @@ def shutdown def enumerate begin - if !@@cluster_health_model_enabled - @@hmlog.info "Cluster Health Model disabled in in_kube_health" - return - end currentTime = Time.now emitTime = currentTime.to_f @@ -97,7 +90,8 @@ def enumerate @resources.node_inventory = node_inventory @resources.pod_inventory = pod_inventory - @resources.deployment_inventory = deployment_inventory + @resources.set_deployment_inventory(deployment_inventory) + @resources.build_pod_uid_lookup if node_inventory_response.code.to_i != 200 record = process_kube_api_up_monitor("fail", node_inventory_response) @@ -117,12 +111,12 @@ def enumerate system_pods = pods_ready_hash.select{|k,v| v['namespace'] == 'kube-system'} workload_pods = pods_ready_hash.select{|k,v| v['namespace'] != 'kube-system'} - system_pods_ready_percentage_records = process_pods_ready_percentage(system_pods, HealthMonitorConstants::SYSTEM_WORKLOAD_PODS_READY_MONITOR_ID) + system_pods_ready_percentage_records = process_pods_ready_percentage(system_pods, MonitorId::SYSTEM_WORKLOAD_PODS_READY_MONITOR_ID) system_pods_ready_percentage_records.each do |record| health_monitor_records.push(record) if record end - workload_pods_ready_percentage_records = process_pods_ready_percentage(workload_pods, HealthMonitorConstants::USER_WORKLOAD_PODS_READY_MONITOR_ID) + workload_pods_ready_percentage_records = 
process_pods_ready_percentage(workload_pods, MonitorId::USER_WORKLOAD_PODS_READY_MONITOR_ID) workload_pods_ready_percentage_records.each do |record| health_monitor_records.push(record) if record end @@ -158,7 +152,7 @@ def process_cpu_oversubscribed_monitor(pod_inventory, node_inventory) state = subscription > @@clusterCpuCapacity ? "fail" : "pass" #CPU - monitor_id = HealthMonitorConstants::WORKLOAD_CPU_OVERSUBSCRIBED_MONITOR_ID + monitor_id = MonitorId::WORKLOAD_CPU_OVERSUBSCRIBED_MONITOR_ID health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"clusterCpuCapacity" => @@clusterCpuCapacity/1000000.to_f, "clusterCpuRequests" => subscription/1000000.to_f}} # @@hmlog.info health_monitor_record @@ -169,7 +163,7 @@ def process_cpu_oversubscribed_monitor(pod_inventory, node_inventory) health_record[HealthMonitorRecordFields::MONITOR_ID] = monitor_id health_record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] = monitor_instance_id health_record[HealthMonitorRecordFields::DETAILS] = health_monitor_record - health_record[HealthMonitorRecordFields::AGENT_COLLECTION_TIME] = time_now + health_record[HealthMonitorRecordFields::TIME_GENERATED] = time_now health_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_now health_record[HealthMonitorRecordFields::CLUSTER_ID] = @@cluster_id #@@hmlog.info "Successfully processed process_cpu_oversubscribed_monitor" @@ -185,7 +179,7 @@ def process_memory_oversubscribed_monitor(pod_inventory, node_inventory) #@@hmlog.debug "Memory Oversubscribed Monitor State : #{state}" #CPU - monitor_id = HealthMonitorConstants::WORKLOAD_MEMORY_OVERSUBSCRIBED_MONITOR_ID + monitor_id = MonitorId::WORKLOAD_MEMORY_OVERSUBSCRIBED_MONITOR_ID health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"clusterMemoryCapacity" => @@clusterMemoryCapacity.to_f, "clusterMemoryRequests" => subscription.to_f}} hmlog = HealthMonitorUtils.get_log_handle @@ -195,7 +189,7 @@ def 
process_memory_oversubscribed_monitor(pod_inventory, node_inventory) health_record[HealthMonitorRecordFields::MONITOR_ID] = monitor_id health_record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] = monitor_instance_id health_record[HealthMonitorRecordFields::DETAILS] = health_monitor_record - health_record[HealthMonitorRecordFields::AGENT_COLLECTION_TIME] = time_now + health_record[HealthMonitorRecordFields::TIME_GENERATED] = time_now health_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_now health_record[HealthMonitorRecordFields::CLUSTER_ID] = @@cluster_id #@@hmlog.info "Successfully processed process_memory_oversubscribed_monitor" @@ -205,21 +199,21 @@ def process_memory_oversubscribed_monitor(pod_inventory, node_inventory) def process_kube_api_up_monitor(state, response) timestamp = Time.now.utc.iso8601 - monitor_id = HealthMonitorConstants::KUBE_API_STATUS + monitor_id = MonitorId::KUBE_API_STATUS details = response.each_header.to_h details['ResponseCode'] = response.code health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => details} hmlog = HealthMonitorUtils.get_log_handle #hmlog.info health_monitor_record - monitor_instance_id = HealthMonitorConstants::KUBE_API_STATUS + monitor_instance_id = MonitorId::KUBE_API_STATUS #hmlog.info "Monitor Instance Id: #{monitor_instance_id}" health_record = {} time_now = Time.now.utc.iso8601 health_record[HealthMonitorRecordFields::MONITOR_ID] = monitor_id health_record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] = monitor_instance_id health_record[HealthMonitorRecordFields::DETAILS] = health_monitor_record - health_record[HealthMonitorRecordFields::AGENT_COLLECTION_TIME] = time_now + health_record[HealthMonitorRecordFields::TIME_GENERATED] = time_now health_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_now health_record[HealthMonitorRecordFields::CLUSTER_ID] = @@cluster_id #@@hmlog.info "Successfully processed process_kube_api_up_monitor" @@ -240,15 +234,15 @@ 
def process_pods_ready_percentage(pods_hash, config_monitor_id) percent = pods_ready / total_pods * 100 timestamp = Time.now.utc.iso8601 - state = HealthMonitorUtils.compute_percentage_state((100-percent), monitor_config) - health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"totalPods" => total_pods, "podsReady" => pods_ready, "workloadName" => workload_name, "namespace" => namespace, "workloadKind" => workload_kind}} + state = HealthMonitorUtils.compute_percentage_state(percent, monitor_config) + health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"totalPods" => total_pods, "podsReady" => pods_ready, "workload_name" => workload_name, "namespace" => namespace, "workload_kind" => workload_kind}} monitor_instance_id = HealthMonitorUtils.get_monitor_instance_id(config_monitor_id, [@@cluster_id, namespace, workload_name]) health_record = {} time_now = Time.now.utc.iso8601 health_record[HealthMonitorRecordFields::MONITOR_ID] = config_monitor_id health_record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] = monitor_instance_id health_record[HealthMonitorRecordFields::DETAILS] = health_monitor_record - health_record[HealthMonitorRecordFields::AGENT_COLLECTION_TIME] = time_now + health_record[HealthMonitorRecordFields::TIME_GENERATED] = time_now health_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_now health_record[HealthMonitorRecordFields::CLUSTER_ID] = @@cluster_id records.push(health_record) @@ -258,7 +252,7 @@ def process_pods_ready_percentage(pods_hash, config_monitor_id) end def process_node_condition_monitor(node_inventory) - monitor_id = HealthMonitorConstants::NODE_CONDITION_MONITOR_ID + monitor_id = MonitorId::NODE_CONDITION_MONITOR_ID timestamp = Time.now.utc.iso8601 monitor_config = @provider.get_config(monitor_id) node_condition_monitor_records = [] @@ -266,11 +260,12 @@ def process_node_condition_monitor(node_inventory) node_inventory['items'].each do |node| node_name = 
node['metadata']['name'] conditions = node['status']['conditions'] - state = HealthMonitorUtils.get_node_state_from_node_conditions(conditions) - #hmlog.debug "Node Name = #{node_name} State = #{state}" + state = HealthMonitorUtils.get_node_state_from_node_conditions(monitor_config, conditions) details = {} conditions.each do |condition| - details[condition['type']] = {"Reason" => condition['reason'], "Message" => condition['message']} + state = !(condition['status'].downcase == 'true' && condition['type'].downcase != 'ready') ? HealthMonitorStates::PASS : HealthMonitorStates::FAIL + details[condition['type']] = {"Reason" => condition['reason'], "Message" => condition['message'], "State" => state} + #@@hmlog.info "Node Condition details: #{JSON.pretty_generate(details)}" end health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => details} monitor_instance_id = HealthMonitorUtils.get_monitor_instance_id(monitor_id, [@@cluster_id, node_name]) @@ -279,7 +274,7 @@ def process_node_condition_monitor(node_inventory) health_record[HealthMonitorRecordFields::MONITOR_ID] = monitor_id health_record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] = monitor_instance_id health_record[HealthMonitorRecordFields::DETAILS] = health_monitor_record - health_record[HealthMonitorRecordFields::AGENT_COLLECTION_TIME] = time_now + health_record[HealthMonitorRecordFields::TIME_GENERATED] = time_now health_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_now health_record[HealthMonitorRecordFields::CLUSTER_ID] = @@cluster_id health_record[HealthMonitorRecordFields::NODE_NAME] = node_name @@ -290,6 +285,20 @@ def process_node_condition_monitor(node_inventory) return node_condition_monitor_records end + def initialize_inventory + #this is required because there are other components, like the container cpu memory aggregator, that depends on the mapping being initialized + node_inventory_response = KubernetesApiClient.getKubeResourceInfo("nodes") + 
node_inventory = JSON.parse(node_inventory_response.body) + pod_inventory_response = KubernetesApiClient.getKubeResourceInfo("pods") + pod_inventory = JSON.parse(pod_inventory_response.body) + deployment_inventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo("deployments", api_version: "extensions/v1beta1").body) + + @resources.node_inventory = node_inventory + @resources.pod_inventory = pod_inventory + @resources.set_deployment_inventory(deployment_inventory) + @resources.build_pod_uid_lookup + end + def run_periodic @mutex.lock done = @finished diff --git a/source/code/plugin/out_health_forward.rb b/source/code/plugin/out_health_forward.rb new file mode 100644 index 000000000..18664a22a --- /dev/null +++ b/source/code/plugin/out_health_forward.rb @@ -0,0 +1,677 @@ +# +# Fluentd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +require 'base64' +require 'socket' +require 'fileutils' + +require 'cool.io' + +require 'fluent/output' +require 'fluent/config/error' + +module Fluent + class ForwardOutputError < StandardError + end + + class ForwardOutputResponseError < ForwardOutputError + end + + class ForwardOutputConnectionClosedError < ForwardOutputError + end + + class ForwardOutputACKTimeoutError < ForwardOutputResponseError + end + + class HealthForwardOutput < ObjectBufferedOutput + Plugin.register_output('health_forward', self) + + def initialize + super + require 'fluent/plugin/socket_util' + @nodes = [] #=> [Node] + end + + desc 'The timeout time when sending event logs.' + config_param :send_timeout, :time, default: 60 + desc 'The transport protocol to use for heartbeats.(udp,tcp,none)' + config_param :heartbeat_type, default: :udp do |val| + case val.downcase + when 'tcp' + :tcp + when 'udp' + :udp + when 'none' + :none + else + raise ConfigError, "forward output heartbeat type should be 'tcp', 'udp', or 'none'" + end + end + desc 'The interval of the heartbeat packer.' + config_param :heartbeat_interval, :time, default: 1 + desc 'The wait time before accepting a server fault recovery.' + config_param :recover_wait, :time, default: 10 + desc 'The hard timeout used to detect server failure.' + config_param :hard_timeout, :time, default: 60 + desc 'Set TTL to expire DNS cache in seconds.' + config_param :expire_dns_cache, :time, default: nil # 0 means disable cache + desc 'The threshold parameter used to detect server faults.' + config_param :phi_threshold, :integer, default: 16 + desc 'Use the "Phi accrual failure detector" to detect server failure.' + config_param :phi_failure_detector, :bool, default: true + + # if any options added that requires extended forward api, fix @extend_internal_protocol + + desc 'Change the protocol to at-least-once.' 
+ config_param :require_ack_response, :bool, default: false # require in_forward to respond with ack + desc 'This option is used when require_ack_response is true.' + config_param :ack_response_timeout, :time, default: 190 # 0 means do not wait for ack responses + # Linux default tcp_syn_retries is 5 (in many environment) + # 3 + 6 + 12 + 24 + 48 + 96 -> 189 (sec) + desc 'Enable client-side DNS round robin.' + config_param :dns_round_robin, :bool, default: false # heartbeat_type 'udp' is not available for this + + attr_reader :nodes + + config_param :port, :integer, default: DEFAULT_LISTEN_PORT, deprecated: "User host xxx instead." + config_param :host, :string, default: nil, deprecated: "Use port xxx instead." + desc 'Skip network related error, e.g. DNS error, during plugin setup' + config_param :skip_network_error_at_init, :bool, :default => false + + + attr_accessor :extend_internal_protocol + + def configure(conf) + super + + # backward compatibility + if host = conf['host'] + port = conf['port'] + port = port ? port.to_i : DEFAULT_LISTEN_PORT + e = conf.add_element('server') + e['host'] = host + e['port'] = port.to_s + end + + recover_sample_size = @recover_wait / @heartbeat_interval + + # add options here if any options addes which uses extended protocol + @extend_internal_protocol = if @require_ack_response + true + else + false + end + + if @dns_round_robin + if @heartbeat_type == :udp + raise ConfigError, "forward output heartbeat type must be 'tcp' or 'none' to use dns_round_robin option" + end + end + + conf.elements.each {|e| + next if e.name != "server" + + host = e['host'] + port = e['port'] + port = port ? port.to_i : DEFAULT_LISTEN_PORT + + weight = e['weight'] + weight = weight ? 
weight.to_i : 60 + + standby = !!e['standby'] + + name = e['name'] + unless name + name = "#{host}:#{port}" + end + + failure = FailureDetector.new(@heartbeat_interval, @hard_timeout, Time.now.to_i.to_f) + + node_conf = NodeConfig2.new(name, host, port, weight, standby, failure, + @phi_threshold, recover_sample_size, @expire_dns_cache, @phi_failure_detector, @dns_round_robin, @skip_network_error_at_init) + + if @heartbeat_type == :none + @nodes << NoneHeartbeatNode.new(log, node_conf) + else + @nodes << Node.new(log, node_conf) + end + log.info "adding forwarding server '#{name}'", host: host, port: port, weight: weight, plugin_id: plugin_id + } + + if @nodes.empty? + raise ConfigError, "forward output plugin requires at least one is required" + end + end + + def start + super + + @rand_seed = Random.new.seed + rebuild_weight_array + @rr = 0 + + unless @heartbeat_type == :none + @loop = Coolio::Loop.new + + if @heartbeat_type == :udp + # assuming all hosts use udp + @usock = SocketUtil.create_udp_socket(@nodes.first.host) + @usock.fcntl(Fcntl::F_SETFL, Fcntl::O_NONBLOCK) + @hb = HeartbeatHandler.new(@usock, method(:on_heartbeat)) + @loop.attach(@hb) + end + + @timer = HeartbeatRequestTimer.new(@heartbeat_interval, method(:on_timer)) + @loop.attach(@timer) + + @thread = Thread.new(&method(:run)) + end + end + + def shutdown + @finished = true + if @loop + @loop.watchers.each {|w| w.detach } + @loop.stop + end + @thread.join if @thread + @usock.close if @usock + end + + def run + @loop.run if @loop + rescue + log.error "unexpected error", error: $!.to_s + log.error_backtrace + end + + def write_objects(tag, chunk) + return if chunk.empty? + + error = nil + + wlen = @weight_array.length + wlen.times do + @rr = (@rr + 1) % wlen + node = @weight_array[@rr] + + if node.available? + begin + send_data(node, tag, chunk) + return + rescue + # for load balancing during detecting crashed servers + error = $! 
# use the latest error + end + end + end + + if error + raise error + else + raise "no nodes are available" # TODO message + end + end + + private + + def rebuild_weight_array + standby_nodes, regular_nodes = @nodes.partition {|n| + n.standby? + } + + lost_weight = 0 + regular_nodes.each {|n| + unless n.available? + lost_weight += n.weight + end + } + log.debug "rebuilding weight array", lost_weight: lost_weight + + if lost_weight > 0 + standby_nodes.each {|n| + if n.available? + regular_nodes << n + log.warn "using standby node #{n.host}:#{n.port}", weight: n.weight + lost_weight -= n.weight + break if lost_weight <= 0 + end + } + end + + weight_array = [] + gcd = regular_nodes.map {|n| n.weight }.inject(0) {|r,w| r.gcd(w) } + regular_nodes.each {|n| + (n.weight / gcd).times { + weight_array << n + } + } + + # for load balancing during detecting crashed servers + coe = (regular_nodes.size * 6) / weight_array.size + weight_array *= coe if coe > 1 + + r = Random.new(@rand_seed) + weight_array.sort_by! 
{ r.rand } + + @weight_array = weight_array + end + + # MessagePack FixArray length = 3 (if @extend_internal_protocol) + # = 2 (else) + FORWARD_HEADER = [0x92].pack('C').freeze + FORWARD_HEADER_EXT = [0x93].pack('C').freeze + def forward_header + if @extend_internal_protocol + FORWARD_HEADER_EXT + else + FORWARD_HEADER + end + end + + #FORWARD_TCP_HEARTBEAT_DATA = FORWARD_HEADER + ''.to_msgpack + [].to_msgpack + def send_heartbeat_tcp(node) + sock = connect(node) + begin + opt = [1, @send_timeout.to_i].pack('I!I!') # { int l_onoff; int l_linger; } + sock.setsockopt(Socket::SOL_SOCKET, Socket::SO_LINGER, opt) + opt = [@send_timeout.to_i, 0].pack('L!L!') # struct timeval + # don't send any data to not cause a compatibility problem + #sock.setsockopt(Socket::SOL_SOCKET, Socket::SO_SNDTIMEO, opt) + #sock.write FORWARD_TCP_HEARTBEAT_DATA + node.heartbeat(true) + ensure + sock.close + end + end + + def send_data(node, tag, chunk) + sock = connect(node) + begin + opt = [1, @send_timeout.to_i].pack('I!I!') # { int l_onoff; int l_linger; } + sock.setsockopt(Socket::SOL_SOCKET, Socket::SO_LINGER, opt) + + opt = [@send_timeout.to_i, 0].pack('L!L!') # struct timeval + sock.setsockopt(Socket::SOL_SOCKET, Socket::SO_SNDTIMEO, opt) + + # beginArray(2) + sock.write forward_header + + # writeRaw(tag) + sock.write tag.to_msgpack # tag + + # beginRaw(size) + sz = chunk.size + #if sz < 32 + # # FixRaw + # sock.write [0xa0 | sz].pack('C') + #elsif sz < 65536 + # # raw 16 + # sock.write [0xda, sz].pack('Cn') + #else + # raw 32 + sock.write [0xdb, sz].pack('CN') + #end + + # writeRawBody(packed_es) + chunk.write_to(sock) + + if @extend_internal_protocol + option = {} + option['chunk'] = Base64.encode64(chunk.unique_id) if @require_ack_response + sock.write option.to_msgpack + + if @require_ack_response && @ack_response_timeout > 0 + # Waiting for a response here results in a decrease of throughput because a chunk queue is locked. 
+ # To avoid a decrease of troughput, it is necessary to prepare a list of chunks that wait for responses + # and process them asynchronously. + if IO.select([sock], nil, nil, @ack_response_timeout) + raw_data = sock.recv(1024) + + # When connection is closed by remote host, socket is ready to read and #recv returns an empty string that means EOF. + # If this happens we assume the data wasn't delivered and retry it. + if raw_data.empty? + @log.warn "node #{node.host}:#{node.port} closed the connection. regard it as unavailable." + node.disable! + raise ForwardOutputConnectionClosedError, "node #{node.host}:#{node.port} closed connection" + else + # Serialization type of the response is same as sent data. + res = MessagePack.unpack(raw_data) + + if res['ack'] != option['chunk'] + # Some errors may have occured when ack and chunk id is different, so send the chunk again. + raise ForwardOutputResponseError, "ack in response and chunk id in sent data are different" + end + end + + else + # IO.select returns nil on timeout. + # There are 2 types of cases when no response has been received: + # (1) the node does not support sending responses + # (2) the node does support sending response but responses have not arrived for some reasons. + @log.warn "no response from #{node.host}:#{node.port}. regard it as unavailable." + node.disable! + raise ForwardOutputACKTimeoutError, "node #{node.host}:#{node.port} does not return ACK" + end + end + end + + node.heartbeat(false) + return res # for test + ensure + sock.close + end + end + + def connect(node) + # TODO unix socket? + TCPSocket.new(node.resolved_host, node.port) + end + + class HeartbeatRequestTimer < Coolio::TimerWatcher + def initialize(interval, callback) + super(interval, true) + @callback = callback + end + + def on_timer + @callback.call + rescue + # TODO log? 
+ end + end + + def on_timer + return if @finished + @nodes.each {|n| + if n.tick + rebuild_weight_array + end + begin + #log.trace "sending heartbeat #{n.host}:#{n.port} on #{@heartbeat_type}" + if @heartbeat_type == :tcp + send_heartbeat_tcp(n) + else + @usock.send "\0", 0, Socket.pack_sockaddr_in(n.port, n.resolved_host) + end + rescue Errno::EAGAIN, Errno::EWOULDBLOCK, Errno::EINTR, Errno::ECONNREFUSED + # TODO log + log.debug "failed to send heartbeat packet to #{n.host}:#{n.port}", error: $!.to_s + end + } + end + + class HeartbeatHandler < Coolio::IO + def initialize(io, callback) + super(io) + @io = io + @callback = callback + end + + def on_readable + begin + msg, addr = @io.recvfrom(1024) + rescue Errno::EAGAIN, Errno::EWOULDBLOCK, Errno::EINTR + return + end + host = addr[3] + port = addr[1] + sockaddr = Socket.pack_sockaddr_in(port, host) + @callback.call(sockaddr, msg) + rescue + # TODO log? + end + end + + def on_heartbeat(sockaddr, msg) + port, host = Socket.unpack_sockaddr_in(sockaddr) + if node = @nodes.find {|n| n.sockaddr == sockaddr } + #log.trace "heartbeat from '#{node.name}'", :host=>node.host, :port=>node.port + if node.heartbeat + rebuild_weight_array + end + end + end + + NodeConfig2 = Struct.new("NodeConfig2", :name, :host, :port, :weight, :standby, :failure, + :phi_threshold, :recover_sample_size, :expire_dns_cache, :phi_failure_detector, :dns_round_robin, :skip_network_error) + + class Node + def initialize(log, conf) + @log = log + @conf = conf + @name = @conf.name + @host = @conf.host + @port = @conf.port + @weight = @conf.weight + @failure = @conf.failure + @available = true + + @resolved_host = nil + @resolved_time = 0 + begin + resolved_host # check dns + rescue => e + if @conf.skip_network_error + log.warn "#{@name} got network error during setup. 
Resolve host later", :error => e, :error_class => e.class + else + raise + end + end + end + + attr_reader :conf + attr_reader :name, :host, :port, :weight + attr_reader :sockaddr # used by on_heartbeat + attr_reader :failure, :available # for test + + def available? + @available + end + + def disable! + @available = false + end + + def standby? + @conf.standby + end + + def resolved_host + case @conf.expire_dns_cache + when 0 + # cache is disabled + return resolve_dns! + + when nil + # persistent cache + return @resolved_host ||= resolve_dns! + + else + now = Engine.now + rh = @resolved_host + if !rh || now - @resolved_time >= @conf.expire_dns_cache + rh = @resolved_host = resolve_dns! + @resolved_time = now + end + return rh + end + end + + def resolve_dns! + addrinfo_list = Socket.getaddrinfo(@host, @port, nil, Socket::SOCK_STREAM) + addrinfo = @conf.dns_round_robin ? addrinfo_list.sample : addrinfo_list.first + @sockaddr = Socket.pack_sockaddr_in(addrinfo[1], addrinfo[3]) # used by on_heartbeat + addrinfo[3] + end + private :resolve_dns! 
+ + def tick + now = Time.now.to_f + if !@available + if @failure.hard_timeout?(now) + @failure.clear + end + return nil + end + + if @failure.hard_timeout?(now) + @log.warn "detached forwarding server '#{@name}'", host: @host, port: @port, hard_timeout: true + @available = false + @resolved_host = nil # expire cached host + @failure.clear + return true + end + + if @conf.phi_failure_detector + phi = @failure.phi(now) + #$log.trace "phi '#{@name}'", :host=>@host, :port=>@port, :phi=>phi + if phi > @conf.phi_threshold + @log.warn "detached forwarding server '#{@name}'", host: @host, port: @port, phi: phi + @available = false + @resolved_host = nil # expire cached host + @failure.clear + return true + end + end + return false + end + + def heartbeat(detect=true) + now = Time.now.to_f + @failure.add(now) + #@log.trace "heartbeat from '#{@name}'", :host=>@host, :port=>@port, :available=>@available, :sample_size=>@failure.sample_size + if detect && !@available && @failure.sample_size > @conf.recover_sample_size + @available = true + @log.warn "recovered forwarding server '#{@name}'", host: @host, port: @port + return true + else + return nil + end + end + + def to_msgpack(out = '') + [@host, @port, @weight, @available].to_msgpack(out) + end + end + + # Override Node to disable heartbeat + class NoneHeartbeatNode < Node + def available? + true + end + + def tick + false + end + + def heartbeat(detect=true) + true + end + end + + class FailureDetector + PHI_FACTOR = 1.0 / Math.log(10.0) + SAMPLE_SIZE = 1000 + + def initialize(heartbeat_interval, hard_timeout, init_last) + @heartbeat_interval = heartbeat_interval + @last = init_last + @hard_timeout = hard_timeout + + # microsec + @init_gap = (heartbeat_interval * 1e6).to_i + @window = [@init_gap] + end + + def hard_timeout?(now) + now - @last > @hard_timeout + end + + def add(now) + if @window.empty? 
+ @window << @init_gap + @last = now + else + gap = now - @last + @window << (gap * 1e6).to_i + @window.shift if @window.length > SAMPLE_SIZE + @last = now + end + end + + def phi(now) + size = @window.size + return 0.0 if size == 0 + + # Calculate weighted moving average + mean_usec = 0 + fact = 0 + @window.each_with_index {|gap,i| + mean_usec += gap * (1+i) + fact += (1+i) + } + mean_usec = mean_usec / fact + + # Normalize arrive intervals into 1sec + mean = (mean_usec.to_f / 1e6) - @heartbeat_interval + 1 + + # Calculate phi of the phi accrual failure detector + t = now - @last - @heartbeat_interval + 1 + phi = PHI_FACTOR * t / mean + + return phi + end + + def sample_size + @window.size + end + + def clear + @window.clear + @last = 0 + end + end + + ## TODO + #class RPC + # def initialize(this) + # @this = this + # end + # + # def list_nodes + # @this.nodes + # end + # + # def list_fault_nodes + # list_nodes.select {|n| !n.available? } + # end + # + # def list_available_nodes + # list_nodes.select {|n| n.available? 
} + # end + # + # def add_node(name, host, port, weight) + # end + # + # def recover_node(host, port) + # end + # + # def remove_node(host, port) + # end + #end + end +end diff --git a/test/code/plugin/health/cadvisor_perf.json b/test/code/plugin/health/cadvisor_perf.json new file mode 100644 index 000000000..35eae32b6 --- /dev/null +++ b/test/code/plugin/health/cadvisor_perf.json @@ -0,0 +1,2540 @@ +[ + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:39Z", + "Host": "aks-nodepool1-19574989-1", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/952488f3-a1f2-11e9-8b08-d602e29755d5/sidecar", + "Collections": [ + { + "CounterName": "memoryWorkingSetBytes", + "Value": 14061568 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:44Z", + "Host": "aks-nodepool1-19574989-1", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/952488f3-a1f2-11e9-8b08-d602e29755d5/dnsmasq", + "Collections": [ + { + "CounterName": "memoryWorkingSetBytes", + "Value": 7249920 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:45Z", + "Host": "aks-nodepool1-19574989-1", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/952488f3-a1f2-11e9-8b08-d602e29755d5/kubedns", + "Collections": [ + { + "CounterName": "memoryWorkingSetBytes", + "Value": 14442496 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + 
"DataItems": [ + { + "Timestamp": "2019-08-23T22:13:49Z", + "Host": "aks-nodepool1-19574989-1", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/952488f3-a1f2-11e9-8b08-d602e29755d5/healthz", + "Collections": [ + { + "CounterName": "memoryWorkingSetBytes", + "Value": 5988352 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:43Z", + "Host": "aks-nodepool1-19574989-1", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/f65e6a62-c5c8-11e9-8736-86290fd7dd1f/kube-proxy", + "Collections": [ + { + "CounterName": "memoryWorkingSetBytes", + "Value": 40284160 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:41Z", + "Host": "aks-nodepool1-19574989-1", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/69e68b21-c5df-11e9-8736-86290fd7dd1f/omsagent", + "Collections": [ + { + "CounterName": "memoryWorkingSetBytes", + "Value": 101965824 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:37Z", + "Host": "aks-nodepool1-19574989-1", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/e690309f-a742-11e9-a38a-22d1c75c4357/redirector", + "Collections": [ + { + "CounterName": 
"memoryWorkingSetBytes", + "Value": 3203072 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:42Z", + "Host": "aks-nodepool1-19574989-1", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/e690309f-a742-11e9-a38a-22d1c75c4357/azureproxy", + "Collections": [ + { + "CounterName": "memoryWorkingSetBytes", + "Value": 9658368 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:42Z", + "Host": "aks-nodepool1-19574989-1", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/9543dbb7-a1f2-11e9-8b08-d602e29755d5/metrics-server", + "Collections": [ + { + "CounterName": "memoryWorkingSetBytes", + "Value": 21491712 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:50Z", + "Host": "aks-nodepool1-19574989-1", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/952488f3-a1f2-11e9-8b08-d602e29755d5/sidecar", + "Collections": [ + { + "CounterName": "restartTimeEpoch", + "Value": 1562639906 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:50Z", + "Host": "aks-nodepool1-19574989-1", + "ObjectName": "K8SContainer", + "InstanceName": 
"/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/952488f3-a1f2-11e9-8b08-d602e29755d5/dnsmasq", + "Collections": [ + { + "CounterName": "restartTimeEpoch", + "Value": 1562639899 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:50Z", + "Host": "aks-nodepool1-19574989-1", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/952488f3-a1f2-11e9-8b08-d602e29755d5/kubedns", + "Collections": [ + { + "CounterName": "restartTimeEpoch", + "Value": 1562639895 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:50Z", + "Host": "aks-nodepool1-19574989-1", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/952488f3-a1f2-11e9-8b08-d602e29755d5/healthz", + "Collections": [ + { + "CounterName": "restartTimeEpoch", + "Value": 1562639903 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:50Z", + "Host": "aks-nodepool1-19574989-1", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/f65e6a62-c5c8-11e9-8736-86290fd7dd1f/kube-proxy", + "Collections": [ + { + "CounterName": "restartTimeEpoch", + "Value": 1566580259 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": 
"2019-08-23T22:13:50Z", + "Host": "aks-nodepool1-19574989-1", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/69e68b21-c5df-11e9-8736-86290fd7dd1f/omsagent", + "Collections": [ + { + "CounterName": "restartTimeEpoch", + "Value": 1566589936 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:50Z", + "Host": "aks-nodepool1-19574989-1", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/e690309f-a742-11e9-a38a-22d1c75c4357/redirector", + "Collections": [ + { + "CounterName": "restartTimeEpoch", + "Value": 1563224142 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:50Z", + "Host": "aks-nodepool1-19574989-1", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/e690309f-a742-11e9-a38a-22d1c75c4357/azureproxy", + "Collections": [ + { + "CounterName": "restartTimeEpoch", + "Value": 1563224144 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:50Z", + "Host": "aks-nodepool1-19574989-1", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/9543dbb7-a1f2-11e9-8b08-d602e29755d5/metrics-server", + "Collections": [ + { + "CounterName": "restartTimeEpoch", + "Value": 1562639893 + } + ] + } + ], 
+ "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:39Z", + "Host": "aks-nodepool1-19574989-1", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/952488f3-a1f2-11e9-8b08-d602e29755d5/sidecar", + "Collections": [ + { + "CounterName": "cpuUsageNanoCores", + "Value": 349987 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:44Z", + "Host": "aks-nodepool1-19574989-1", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/952488f3-a1f2-11e9-8b08-d602e29755d5/dnsmasq", + "Collections": [ + { + "CounterName": "cpuUsageNanoCores", + "Value": 773186 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:45Z", + "Host": "aks-nodepool1-19574989-1", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/952488f3-a1f2-11e9-8b08-d602e29755d5/kubedns", + "Collections": [ + { + "CounterName": "cpuUsageNanoCores", + "Value": 2718196 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:49Z", + "Host": "aks-nodepool1-19574989-1", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/952488f3-a1f2-11e9-8b08-d602e29755d5/healthz", + 
"Collections": [ + { + "CounterName": "cpuUsageNanoCores", + "Value": 2007695 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:43Z", + "Host": "aks-nodepool1-19574989-1", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/f65e6a62-c5c8-11e9-8736-86290fd7dd1f/kube-proxy", + "Collections": [ + { + "CounterName": "cpuUsageNanoCores", + "Value": 674463 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:41Z", + "Host": "aks-nodepool1-19574989-1", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/69e68b21-c5df-11e9-8736-86290fd7dd1f/omsagent", + "Collections": [ + { + "CounterName": "cpuUsageNanoCores", + "Value": 2159553 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:37Z", + "Host": "aks-nodepool1-19574989-1", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/e690309f-a742-11e9-a38a-22d1c75c4357/redirector", + "Collections": [ + { + "CounterName": "cpuUsageNanoCores", + "Value": 3575667 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:42Z", + "Host": "aks-nodepool1-19574989-1", + "ObjectName": "K8SContainer", + "InstanceName": 
"/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/e690309f-a742-11e9-a38a-22d1c75c4357/azureproxy", + "Collections": [ + { + "CounterName": "cpuUsageNanoCores", + "Value": 0 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:42Z", + "Host": "aks-nodepool1-19574989-1", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/9543dbb7-a1f2-11e9-8b08-d602e29755d5/metrics-server", + "Collections": [ + { + "CounterName": "cpuUsageNanoCores", + "Value": 633968 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:39Z", + "Host": "aks-nodepool1-19574989-1", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/952488f3-a1f2-11e9-8b08-d602e29755d5/sidecar", + "Collections": [ + { + "CounterName": "memoryRssBytes", + "Value": 11546624 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:14:39Z", + "Host": "aks-nodepool1-19574989-1", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/952488f3-a1f2-11e9-8b08-d602e29755d5/sidecar", + "Collections": [ + { + "CounterName": "memoryRssBytes", + "Value": 11546624 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:44Z", + 
"Host": "aks-nodepool1-19574989-1", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/952488f3-a1f2-11e9-8b08-d602e29755d5/dnsmasq", + "Collections": [ + { + "CounterName": "memoryRssBytes", + "Value": 5652480 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:45Z", + "Host": "aks-nodepool1-19574989-1", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/952488f3-a1f2-11e9-8b08-d602e29755d5/kubedns", + "Collections": [ + { + "CounterName": "memoryRssBytes", + "Value": 10981376 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:49Z", + "Host": "aks-nodepool1-19574989-1", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/952488f3-a1f2-11e9-8b08-d602e29755d5/healthz", + "Collections": [ + { + "CounterName": "memoryRssBytes", + "Value": 2875392 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:43Z", + "Host": "aks-nodepool1-19574989-1", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/f65e6a62-c5c8-11e9-8736-86290fd7dd1f/kube-proxy", + "Collections": [ + { + "CounterName": "memoryRssBytes", + "Value": 20627456 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": 
"LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:41Z", + "Host": "aks-nodepool1-19574989-1", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/69e68b21-c5df-11e9-8736-86290fd7dd1f/omsagent", + "Collections": [ + { + "CounterName": "memoryRssBytes", + "Value": 69353472 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:37Z", + "Host": "aks-nodepool1-19574989-1", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/e690309f-a742-11e9-a38a-22d1c75c4357/redirector", + "Collections": [ + { + "CounterName": "memoryRssBytes", + "Value": 462848 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:42Z", + "Host": "aks-nodepool1-19574989-1", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/e690309f-a742-11e9-a38a-22d1c75c4357/azureproxy", + "Collections": [ + { + "CounterName": "memoryRssBytes", + "Value": 8212480 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:42Z", + "Host": "aks-nodepool1-19574989-1", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/9543dbb7-a1f2-11e9-8b08-d602e29755d5/metrics-server", + "Collections": [ + { + "CounterName": 
"memoryRssBytes", + "Value": 16543744 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:45Z", + "Host": "aks-nodepool1-19574989-1", + "ObjectName": "K8SNode", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/aks-nodepool1-19574989-1", + "Collections": [ + { + "CounterName": "memoryRssBytes", + "Value": 814518272 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:45Z", + "Host": "aks-nodepool1-19574989-1", + "ObjectName": "K8SNode", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/aks-nodepool1-19574989-1", + "Collections": [ + { + "CounterName": "cpuUsageNanoCores", + "Value": 82091339.40983607 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:45Z", + "Host": "aks-nodepool1-19574989-1", + "ObjectName": "K8SNode", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/aks-nodepool1-19574989-1", + "Collections": [ + { + "CounterName": "memoryWorkingSetBytes", + "Value": 2089115648 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:50Z", + "Host": "aks-nodepool1-19574989-1", + "ObjectName": "K8SNode", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/aks-nodepool1-19574989-1", + "Collections": [ + { + "CounterName": 
"restartTimeEpoch", + "Value": 1552408751.22 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:12:56Z", + "Host": "aks-nodepool1-19574989-0", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/b1e04e1c-c5df-11e9-8736-86290fd7dd1f/omsagent", + "Collections": [ + { + "CounterName": "memoryWorkingSetBytes", + "Value": 85528576 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:12:54Z", + "Host": "aks-nodepool1-19574989-0", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/49e373c8-c5c9-11e9-8736-86290fd7dd1f/kube-proxy", + "Collections": [ + { + "CounterName": "memoryWorkingSetBytes", + "Value": 25415680 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:12:53Z", + "Host": "aks-nodepool1-19574989-0", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/65a6f978-c5df-11e9-8736-86290fd7dd1f/omsagent", + "Collections": [ + { + "CounterName": "memoryWorkingSetBytes", + "Value": 111738880 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:12:55Z", + "Host": "aks-nodepool1-19574989-0", + "ObjectName": "K8SContainer", + "InstanceName": 
"/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/24ab7e32-c5c9-11e9-8736-86290fd7dd1f/heapster-nanny", + "Collections": [ + { + "CounterName": "memoryWorkingSetBytes", + "Value": 8417280 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:01Z", + "Host": "aks-nodepool1-19574989-0", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/24ab7e32-c5c9-11e9-8736-86290fd7dd1f/heapster", + "Collections": [ + { + "CounterName": "memoryWorkingSetBytes", + "Value": 19492864 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:12:57Z", + "Host": "aks-nodepool1-19574989-0", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/9583b2ab-a1f2-11e9-8b08-d602e29755d5/main", + "Collections": [ + { + "CounterName": "memoryWorkingSetBytes", + "Value": 12918784 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:12:46Z", + "Host": "aks-nodepool1-19574989-0", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/bb3d3ef2-a742-11e9-a38a-22d1c75c4357/redirector", + "Collections": [ + { + "CounterName": "memoryWorkingSetBytes", + "Value": 3379200 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": 
"2019-08-23T22:12:57Z", + "Host": "aks-nodepool1-19574989-0", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/bb3d3ef2-a742-11e9-a38a-22d1c75c4357/azureproxy", + "Collections": [ + { + "CounterName": "memoryWorkingSetBytes", + "Value": 9818112 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:03Z", + "Host": "aks-nodepool1-19574989-0", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/b1e04e1c-c5df-11e9-8736-86290fd7dd1f/omsagent", + "Collections": [ + { + "CounterName": "restartTimeEpoch", + "Value": 1566590024 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:03Z", + "Host": "aks-nodepool1-19574989-0", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/49e373c8-c5c9-11e9-8736-86290fd7dd1f/kube-proxy", + "Collections": [ + { + "CounterName": "restartTimeEpoch", + "Value": 1566580398 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:03Z", + "Host": "aks-nodepool1-19574989-0", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/65a6f978-c5df-11e9-8736-86290fd7dd1f/omsagent", + "Collections": [ + { + "CounterName": "restartTimeEpoch", + "Value": 1566589942 + } + ] + } + ], + 
"DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:03Z", + "Host": "aks-nodepool1-19574989-0", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/24ab7e32-c5c9-11e9-8736-86290fd7dd1f/heapster-nanny", + "Collections": [ + { + "CounterName": "restartTimeEpoch", + "Value": 1566580342 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:03Z", + "Host": "aks-nodepool1-19574989-0", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/24ab7e32-c5c9-11e9-8736-86290fd7dd1f/heapster", + "Collections": [ + { + "CounterName": "restartTimeEpoch", + "Value": 1566580337 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:03Z", + "Host": "aks-nodepool1-19574989-0", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/9583b2ab-a1f2-11e9-8b08-d602e29755d5/main", + "Collections": [ + { + "CounterName": "restartTimeEpoch", + "Value": 1562639936 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:03Z", + "Host": "aks-nodepool1-19574989-0", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/bb3d3ef2-a742-11e9-a38a-22d1c75c4357/redirector", 
+ "Collections": [ + { + "CounterName": "restartTimeEpoch", + "Value": 1563224072 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:03Z", + "Host": "aks-nodepool1-19574989-0", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/bb3d3ef2-a742-11e9-a38a-22d1c75c4357/azureproxy", + "Collections": [ + { + "CounterName": "restartTimeEpoch", + "Value": 1563224077 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:12:56Z", + "Host": "aks-nodepool1-19574989-0", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/b1e04e1c-c5df-11e9-8736-86290fd7dd1f/omsagent", + "Collections": [ + { + "CounterName": "cpuUsageNanoCores", + "Value": 4447595 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:12:54Z", + "Host": "aks-nodepool1-19574989-0", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/49e373c8-c5c9-11e9-8736-86290fd7dd1f/kube-proxy", + "Collections": [ + { + "CounterName": "cpuUsageNanoCores", + "Value": 2765529 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:12:53Z", + "Host": "aks-nodepool1-19574989-0", + "ObjectName": "K8SContainer", + "InstanceName": 
"/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/65a6f978-c5df-11e9-8736-86290fd7dd1f/omsagent", + "Collections": [ + { + "CounterName": "cpuUsageNanoCores", + "Value": 5565414 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:12:55Z", + "Host": "aks-nodepool1-19574989-0", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/24ab7e32-c5c9-11e9-8736-86290fd7dd1f/heapster-nanny", + "Collections": [ + { + "CounterName": "cpuUsageNanoCores", + "Value": 863810 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:01Z", + "Host": "aks-nodepool1-19574989-0", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/24ab7e32-c5c9-11e9-8736-86290fd7dd1f/heapster", + "Collections": [ + { + "CounterName": "cpuUsageNanoCores", + "Value": 886196 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:12:57Z", + "Host": "aks-nodepool1-19574989-0", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/9583b2ab-a1f2-11e9-8b08-d602e29755d5/main", + "Collections": [ + { + "CounterName": "cpuUsageNanoCores", + "Value": 855014 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:12:46Z", 
+ "Host": "aks-nodepool1-19574989-0", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/bb3d3ef2-a742-11e9-a38a-22d1c75c4357/redirector", + "Collections": [ + { + "CounterName": "cpuUsageNanoCores", + "Value": 1794634 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:12:57Z", + "Host": "aks-nodepool1-19574989-0", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/bb3d3ef2-a742-11e9-a38a-22d1c75c4357/azureproxy", + "Collections": [ + { + "CounterName": "cpuUsageNanoCores", + "Value": 0 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:12:56Z", + "Host": "aks-nodepool1-19574989-0", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/b1e04e1c-c5df-11e9-8736-86290fd7dd1f/omsagent", + "Collections": [ + { + "CounterName": "memoryRssBytes", + "Value": 76308480 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:12:54Z", + "Host": "aks-nodepool1-19574989-0", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/49e373c8-c5c9-11e9-8736-86290fd7dd1f/kube-proxy", + "Collections": [ + { + "CounterName": "memoryRssBytes", + "Value": 21319680 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": 
"LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:12:53Z", + "Host": "aks-nodepool1-19574989-0", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/65a6f978-c5df-11e9-8736-86290fd7dd1f/omsagent", + "Collections": [ + { + "CounterName": "memoryRssBytes", + "Value": 78180352 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:12:55Z", + "Host": "aks-nodepool1-19574989-0", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/24ab7e32-c5c9-11e9-8736-86290fd7dd1f/heapster-nanny", + "Collections": [ + { + "CounterName": "memoryRssBytes", + "Value": 7909376 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:01Z", + "Host": "aks-nodepool1-19574989-0", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/24ab7e32-c5c9-11e9-8736-86290fd7dd1f/heapster", + "Collections": [ + { + "CounterName": "memoryRssBytes", + "Value": 18968576 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:12:57Z", + "Host": "aks-nodepool1-19574989-0", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/9583b2ab-a1f2-11e9-8b08-d602e29755d5/main", + "Collections": [ + { + "CounterName": "memoryRssBytes", 
+ "Value": 9871360 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:12:46Z", + "Host": "aks-nodepool1-19574989-0", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/bb3d3ef2-a742-11e9-a38a-22d1c75c4357/redirector", + "Collections": [ + { + "CounterName": "memoryRssBytes", + "Value": 462848 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:12:57Z", + "Host": "aks-nodepool1-19574989-0", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/bb3d3ef2-a742-11e9-a38a-22d1c75c4357/azureproxy", + "Collections": [ + { + "CounterName": "memoryRssBytes", + "Value": 8212480 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:12:57Z", + "Host": "aks-nodepool1-19574989-0", + "ObjectName": "K8SNode", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/aks-nodepool1-19574989-0", + "Collections": [ + { + "CounterName": "memoryRssBytes", + "Value": 865943552 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:12:57Z", + "Host": "aks-nodepool1-19574989-0", + "ObjectName": "K8SNode", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/aks-nodepool1-19574989-0", + "Collections": [ + { + 
"CounterName": "cpuUsageNanoCores", + "Value": 95432166.25 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:12:57Z", + "Host": "aks-nodepool1-19574989-0", + "ObjectName": "K8SNode", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/aks-nodepool1-19574989-0", + "Collections": [ + { + "CounterName": "memoryWorkingSetBytes", + "Value": 2191216640 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:03Z", + "Host": "aks-nodepool1-19574989-0", + "ObjectName": "K8SNode", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/aks-nodepool1-19574989-0", + "Collections": [ + { + "CounterName": "restartTimeEpoch", + "Value": 1552408749.66 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:14:07Z", + "Host": "aks-nodepool1-19574989-3", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/b2a0e1b3-bd3f-11e9-b2a7-d61658c73830/tunnel-front", + "Collections": [ + { + "CounterName": "memoryWorkingSetBytes", + "Value": 17743872 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:14:12Z", + "Host": "aks-nodepool1-19574989-3", + "ObjectName": "K8SContainer", + "InstanceName": 
"/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/114f7246-c5c9-11e9-8736-86290fd7dd1f/kube-proxy", + "Collections": [ + { + "CounterName": "memoryWorkingSetBytes", + "Value": 24162304 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:14:07Z", + "Host": "aks-nodepool1-19574989-3", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/44a61692-b945-11e9-a1b6-127094e7fd94/azureproxy", + "Collections": [ + { + "CounterName": "memoryWorkingSetBytes", + "Value": 11472896 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:14:06Z", + "Host": "aks-nodepool1-19574989-3", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/44a61692-b945-11e9-a1b6-127094e7fd94/redirector", + "Collections": [ + { + "CounterName": "memoryWorkingSetBytes", + "Value": 3821568 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:14:15Z", + "Host": "aks-nodepool1-19574989-3", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/8dbd5e8b-c5df-11e9-8736-86290fd7dd1f/omsagent", + "Collections": [ + { + "CounterName": "memoryWorkingSetBytes", + "Value": 92057600 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + 
"Timestamp": "2019-08-23T22:14:15Z", + "Host": "aks-nodepool1-19574989-3", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/b2a0e1b3-bd3f-11e9-b2a7-d61658c73830/tunnel-front", + "Collections": [ + { + "CounterName": "restartTimeEpoch", + "Value": 1565641691 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:14:15Z", + "Host": "aks-nodepool1-19574989-3", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/114f7246-c5c9-11e9-8736-86290fd7dd1f/kube-proxy", + "Collections": [ + { + "CounterName": "restartTimeEpoch", + "Value": 1566580300 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:14:15Z", + "Host": "aks-nodepool1-19574989-3", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/44a61692-b945-11e9-a1b6-127094e7fd94/azureproxy", + "Collections": [ + { + "CounterName": "restartTimeEpoch", + "Value": 1565204288 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:14:15Z", + "Host": "aks-nodepool1-19574989-3", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/44a61692-b945-11e9-a1b6-127094e7fd94/redirector", + "Collections": [ + { + "CounterName": "restartTimeEpoch", + "Value": 1565204284 + } 
+ ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:14:15Z", + "Host": "aks-nodepool1-19574989-3", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/8dbd5e8b-c5df-11e9-8736-86290fd7dd1f/omsagent", + "Collections": [ + { + "CounterName": "restartTimeEpoch", + "Value": 1566589995 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:14:07Z", + "Host": "aks-nodepool1-19574989-3", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/b2a0e1b3-bd3f-11e9-b2a7-d61658c73830/tunnel-front", + "Collections": [ + { + "CounterName": "cpuUsageNanoCores", + "Value": 35140951 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:14:12Z", + "Host": "aks-nodepool1-19574989-3", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/114f7246-c5c9-11e9-8736-86290fd7dd1f/kube-proxy", + "Collections": [ + { + "CounterName": "cpuUsageNanoCores", + "Value": 983407 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:14:07Z", + "Host": "aks-nodepool1-19574989-3", + "ObjectName": "K8SContainer", + "InstanceName": 
"/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/44a61692-b945-11e9-a1b6-127094e7fd94/azureproxy", + "Collections": [ + { + "CounterName": "cpuUsageNanoCores", + "Value": 0 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:14:06Z", + "Host": "aks-nodepool1-19574989-3", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/44a61692-b945-11e9-a1b6-127094e7fd94/redirector", + "Collections": [ + { + "CounterName": "cpuUsageNanoCores", + "Value": 4221562 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:14:15Z", + "Host": "aks-nodepool1-19574989-3", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/8dbd5e8b-c5df-11e9-8736-86290fd7dd1f/omsagent", + "Collections": [ + { + "CounterName": "cpuUsageNanoCores", + "Value": 1881274 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:14:07Z", + "Host": "aks-nodepool1-19574989-3", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/b2a0e1b3-bd3f-11e9-b2a7-d61658c73830/tunnel-front", + "Collections": [ + { + "CounterName": "memoryRssBytes", + "Value": 4161536 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:14:12Z", 
+ "Host": "aks-nodepool1-19574989-3", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/114f7246-c5c9-11e9-8736-86290fd7dd1f/kube-proxy", + "Collections": [ + { + "CounterName": "memoryRssBytes", + "Value": 18952192 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:14:07Z", + "Host": "aks-nodepool1-19574989-3", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/44a61692-b945-11e9-a1b6-127094e7fd94/azureproxy", + "Collections": [ + { + "CounterName": "memoryRssBytes", + "Value": 8224768 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:14:06Z", + "Host": "aks-nodepool1-19574989-3", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/44a61692-b945-11e9-a1b6-127094e7fd94/redirector", + "Collections": [ + { + "CounterName": "memoryRssBytes", + "Value": 483328 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:14:15Z", + "Host": "aks-nodepool1-19574989-3", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/8dbd5e8b-c5df-11e9-8736-86290fd7dd1f/omsagent", + "Collections": [ + { + "CounterName": "memoryRssBytes", + "Value": 74915840 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": 
"LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:14:14Z", + "Host": "aks-nodepool1-19574989-3", + "ObjectName": "K8SNode", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/aks-nodepool1-19574989-3", + "Collections": [ + { + "CounterName": "memoryRssBytes", + "Value": 554704896 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:14:14Z", + "Host": "aks-nodepool1-19574989-3", + "ObjectName": "K8SNode", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/aks-nodepool1-19574989-3", + "Collections": [ + { + "CounterName": "cpuUsageNanoCores", + "Value": 88981130.86666666 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:14:14Z", + "Host": "aks-nodepool1-19574989-3", + "ObjectName": "K8SNode", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/aks-nodepool1-19574989-3", + "Collections": [ + { + "CounterName": "memoryWorkingSetBytes", + "Value": 1633976320 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:14:15Z", + "Host": "aks-nodepool1-19574989-3", + "ObjectName": "K8SNode", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/aks-nodepool1-19574989-3", + "Collections": [ + { + "CounterName": "restartTimeEpoch", + "Value": 1565204130.6 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + 
"IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:37Z", + "Host": "aks-nodepool1-19574989-2", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/be78d7f6-c5df-11e9-8736-86290fd7dd1f/omsagent", + "Collections": [ + { + "CounterName": "memoryWorkingSetBytes", + "Value": 92954624 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:33Z", + "Host": "aks-nodepool1-19574989-2", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/95046bc6-a1f2-11e9-8b08-d602e29755d5/dnsmasq", + "Collections": [ + { + "CounterName": "memoryWorkingSetBytes", + "Value": 7446528 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:22Z", + "Host": "aks-nodepool1-19574989-2", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/95046bc6-a1f2-11e9-8b08-d602e29755d5/sidecar", + "Collections": [ + { + "CounterName": "memoryWorkingSetBytes", + "Value": 14811136 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:31Z", + "Host": "aks-nodepool1-19574989-2", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/95046bc6-a1f2-11e9-8b08-d602e29755d5/kubedns", + "Collections": [ + { + 
"CounterName": "memoryWorkingSetBytes", + "Value": 15114240 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:35Z", + "Host": "aks-nodepool1-19574989-2", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/95046bc6-a1f2-11e9-8b08-d602e29755d5/healthz", + "Collections": [ + { + "CounterName": "memoryWorkingSetBytes", + "Value": 5406720 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:32Z", + "Host": "aks-nodepool1-19574989-2", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/94e52ab1-a1f2-11e9-8b08-d602e29755d5/autoscaler", + "Collections": [ + { + "CounterName": "memoryWorkingSetBytes", + "Value": 10043392 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:37Z", + "Host": "aks-nodepool1-19574989-2", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/2c3de48d-c5c9-11e9-8736-86290fd7dd1f/kube-proxy", + "Collections": [ + { + "CounterName": "memoryWorkingSetBytes", + "Value": 58052608 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:31Z", + "Host": "aks-nodepool1-19574989-2", + "ObjectName": "K8SContainer", + "InstanceName": 
"/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/06fef5f6-a743-11e9-a38a-22d1c75c4357/azureproxy", + "Collections": [ + { + "CounterName": "memoryWorkingSetBytes", + "Value": 9904128 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:31Z", + "Host": "aks-nodepool1-19574989-2", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/06fef5f6-a743-11e9-a38a-22d1c75c4357/redirector", + "Collections": [ + { + "CounterName": "memoryWorkingSetBytes", + "Value": 3645440 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:40Z", + "Host": "aks-nodepool1-19574989-2", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/be78d7f6-c5df-11e9-8736-86290fd7dd1f/omsagent", + "Collections": [ + { + "CounterName": "restartTimeEpoch", + "Value": 1566590079 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:40Z", + "Host": "aks-nodepool1-19574989-2", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/95046bc6-a1f2-11e9-8b08-d602e29755d5/dnsmasq", + "Collections": [ + { + "CounterName": "restartTimeEpoch", + "Value": 1562639920 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": 
"2019-08-23T22:13:40Z", + "Host": "aks-nodepool1-19574989-2", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/95046bc6-a1f2-11e9-8b08-d602e29755d5/sidecar", + "Collections": [ + { + "CounterName": "restartTimeEpoch", + "Value": 1562639940 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:40Z", + "Host": "aks-nodepool1-19574989-2", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/95046bc6-a1f2-11e9-8b08-d602e29755d5/kubedns", + "Collections": [ + { + "CounterName": "restartTimeEpoch", + "Value": 1562639904 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:40Z", + "Host": "aks-nodepool1-19574989-2", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/95046bc6-a1f2-11e9-8b08-d602e29755d5/healthz", + "Collections": [ + { + "CounterName": "restartTimeEpoch", + "Value": 1562639932 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:40Z", + "Host": "aks-nodepool1-19574989-2", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/94e52ab1-a1f2-11e9-8b08-d602e29755d5/autoscaler", + "Collections": [ + { + "CounterName": "restartTimeEpoch", + "Value": 1562639909 + } + ] + } + ], + 
"DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:40Z", + "Host": "aks-nodepool1-19574989-2", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/2c3de48d-c5c9-11e9-8736-86290fd7dd1f/kube-proxy", + "Collections": [ + { + "CounterName": "restartTimeEpoch", + "Value": 1566580349 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:40Z", + "Host": "aks-nodepool1-19574989-2", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/06fef5f6-a743-11e9-a38a-22d1c75c4357/azureproxy", + "Collections": [ + { + "CounterName": "restartTimeEpoch", + "Value": 1563224204 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:40Z", + "Host": "aks-nodepool1-19574989-2", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/06fef5f6-a743-11e9-a38a-22d1c75c4357/redirector", + "Collections": [ + { + "CounterName": "restartTimeEpoch", + "Value": 1563224199 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:37Z", + "Host": "aks-nodepool1-19574989-2", + "ObjectName": "K8SContainer", + "InstanceName": 
"/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/be78d7f6-c5df-11e9-8736-86290fd7dd1f/omsagent", + "Collections": [ + { + "CounterName": "cpuUsageNanoCores", + "Value": 3004849 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:33Z", + "Host": "aks-nodepool1-19574989-2", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/95046bc6-a1f2-11e9-8b08-d602e29755d5/dnsmasq", + "Collections": [ + { + "CounterName": "cpuUsageNanoCores", + "Value": 796842 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:22Z", + "Host": "aks-nodepool1-19574989-2", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/95046bc6-a1f2-11e9-8b08-d602e29755d5/sidecar", + "Collections": [ + { + "CounterName": "cpuUsageNanoCores", + "Value": 708906 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:31Z", + "Host": "aks-nodepool1-19574989-2", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/95046bc6-a1f2-11e9-8b08-d602e29755d5/kubedns", + "Collections": [ + { + "CounterName": "cpuUsageNanoCores", + "Value": 3451625 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:35Z", + 
"Host": "aks-nodepool1-19574989-2", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/95046bc6-a1f2-11e9-8b08-d602e29755d5/healthz", + "Collections": [ + { + "CounterName": "cpuUsageNanoCores", + "Value": 2572419 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:32Z", + "Host": "aks-nodepool1-19574989-2", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/94e52ab1-a1f2-11e9-8b08-d602e29755d5/autoscaler", + "Collections": [ + { + "CounterName": "cpuUsageNanoCores", + "Value": 548275 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:37Z", + "Host": "aks-nodepool1-19574989-2", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/2c3de48d-c5c9-11e9-8736-86290fd7dd1f/kube-proxy", + "Collections": [ + { + "CounterName": "cpuUsageNanoCores", + "Value": 1740316 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:31Z", + "Host": "aks-nodepool1-19574989-2", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/06fef5f6-a743-11e9-a38a-22d1c75c4357/azureproxy", + "Collections": [ + { + "CounterName": "cpuUsageNanoCores", + "Value": 0 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": 
"LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:31Z", + "Host": "aks-nodepool1-19574989-2", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/06fef5f6-a743-11e9-a38a-22d1c75c4357/redirector", + "Collections": [ + { + "CounterName": "cpuUsageNanoCores", + "Value": 3156661 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:37Z", + "Host": "aks-nodepool1-19574989-2", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/be78d7f6-c5df-11e9-8736-86290fd7dd1f/omsagent", + "Collections": [ + { + "CounterName": "memoryRssBytes", + "Value": 66428928 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:33Z", + "Host": "aks-nodepool1-19574989-2", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/95046bc6-a1f2-11e9-8b08-d602e29755d5/dnsmasq", + "Collections": [ + { + "CounterName": "memoryRssBytes", + "Value": 5611520 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:22Z", + "Host": "aks-nodepool1-19574989-2", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/95046bc6-a1f2-11e9-8b08-d602e29755d5/sidecar", + "Collections": [ + { + "CounterName": "memoryRssBytes", 
+ "Value": 11833344 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:31Z", + "Host": "aks-nodepool1-19574989-2", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/95046bc6-a1f2-11e9-8b08-d602e29755d5/kubedns", + "Collections": [ + { + "CounterName": "memoryRssBytes", + "Value": 11063296 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:35Z", + "Host": "aks-nodepool1-19574989-2", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/95046bc6-a1f2-11e9-8b08-d602e29755d5/healthz", + "Collections": [ + { + "CounterName": "memoryRssBytes", + "Value": 2551808 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:32Z", + "Host": "aks-nodepool1-19574989-2", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/94e52ab1-a1f2-11e9-8b08-d602e29755d5/autoscaler", + "Collections": [ + { + "CounterName": "memoryRssBytes", + "Value": 9244672 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:37Z", + "Host": "aks-nodepool1-19574989-2", + "ObjectName": "K8SContainer", + "InstanceName": 
"/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/2c3de48d-c5c9-11e9-8736-86290fd7dd1f/kube-proxy", + "Collections": [ + { + "CounterName": "memoryRssBytes", + "Value": 20402176 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:31Z", + "Host": "aks-nodepool1-19574989-2", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/06fef5f6-a743-11e9-a38a-22d1c75c4357/azureproxy", + "Collections": [ + { + "CounterName": "memoryRssBytes", + "Value": 8216576 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:31Z", + "Host": "aks-nodepool1-19574989-2", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/06fef5f6-a743-11e9-a38a-22d1c75c4357/redirector", + "Collections": [ + { + "CounterName": "memoryRssBytes", + "Value": 462848 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:30Z", + "Host": "aks-nodepool1-19574989-2", + "ObjectName": "K8SNode", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/aks-nodepool1-19574989-2", + "Collections": [ + { + "CounterName": "memoryRssBytes", + "Value": 853344256 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:30Z", + "Host": 
"aks-nodepool1-19574989-2", + "ObjectName": "K8SNode", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/aks-nodepool1-19574989-2", + "Collections": [ + { + "CounterName": "cpuUsageNanoCores", + "Value": 114265842.16 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:30Z", + "Host": "aks-nodepool1-19574989-2", + "ObjectName": "K8SNode", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/aks-nodepool1-19574989-2", + "Collections": [ + { + "CounterName": "memoryWorkingSetBytes", + "Value": 1892982784 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }, + { + "DataItems": [ + { + "Timestamp": "2019-08-23T22:13:40Z", + "Host": "aks-nodepool1-19574989-2", + "ObjectName": "K8SNode", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/aks-nodepool1-19574989-2", + "Collections": [ + { + "CounterName": "restartTimeEpoch", + "Value": 1561082409.36 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + } +] \ No newline at end of file diff --git a/test/code/plugin/health/deployments.json b/test/code/plugin/health/deployments.json new file mode 100644 index 000000000..75586db04 --- /dev/null +++ b/test/code/plugin/health/deployments.json @@ -0,0 +1,1385 @@ +{ + "apiVersion": "v1", + "items": [ + { + "apiVersion": "extensions/v1beta1", + "kind": "Deployment", + "metadata": { + "annotations": { + "deployment.kubernetes.io/revision": "2" + }, + "creationTimestamp": "2019-08-23T17:12:00Z", + "generation": 2, + "labels": { + "addonmanager.kubernetes.io/mode": 
"EnsureExists", + "k8s-app": "heapster", + "kubernetes.io/cluster-service": "true" + }, + "name": "heapster", + "namespace": "kube-system", + "resourceVersion": "19048928", + "selfLink": "/apis/extensions/v1beta1/namespaces/kube-system/deployments/heapster", + "uid": "1e98c3d1-c5c9-11e9-8736-86290fd7dd1f" + }, + "spec": { + "progressDeadlineSeconds": 2147483647, + "replicas": 1, + "revisionHistoryLimit": 10, + "selector": { + "matchLabels": { + "k8s-app": "heapster" + } + }, + "strategy": { + "rollingUpdate": { + "maxSurge": 1, + "maxUnavailable": 1 + }, + "type": "RollingUpdate" + }, + "template": { + "metadata": { + "creationTimestamp": null, + "labels": { + "k8s-app": "heapster" + } + }, + "spec": { + "affinity": { + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "kubernetes.azure.com/cluster", + "operator": "Exists" + } + ] + } + ] + } + } + }, + "containers": [ + { + "command": [ + "/heapster", + "--source=kubernetes.summary_api:\"\"" + ], + "image": "aksrepos.azurecr.io/mirror/heapster-amd64:v1.5.3", + "imagePullPolicy": "IfNotPresent", + "livenessProbe": { + "failureThreshold": 3, + "httpGet": { + "path": "/healthz", + "port": 8082, + "scheme": "HTTP" + }, + "initialDelaySeconds": 180, + "periodSeconds": 10, + "successThreshold": 1, + "timeoutSeconds": 5 + }, + "name": "heapster", + "resources": { + "limits": { + "cpu": "88m", + "memory": "204Mi" + }, + "requests": { + "cpu": "88m", + "memory": "204Mi" + } + }, + "terminationMessagePath": "/dev/termination-log", + "terminationMessagePolicy": "File" + }, + { + "command": [ + "/pod_nanny", + "--config-dir=/etc/config", + "--cpu=80m", + "--extra-cpu=0.5m", + "--memory=140Mi", + "--extra-memory=4Mi", + "--threshold=5", + "--deployment=heapster", + "--container=heapster", + "--poll-period=300000", + "--estimator=exponential" + ], + "env": [ + { + "name": "MY_POD_NAME", + "valueFrom": { + "fieldRef": { + "apiVersion": "v1", 
+ "fieldPath": "metadata.name" + } + } + }, + { + "name": "MY_POD_NAMESPACE", + "valueFrom": { + "fieldRef": { + "apiVersion": "v1", + "fieldPath": "metadata.namespace" + } + } + } + ], + "image": "aksrepos.azurecr.io/mirror/addon-resizer:1.8.1", + "imagePullPolicy": "IfNotPresent", + "name": "heapster-nanny", + "resources": { + "limits": { + "cpu": "50m", + "memory": "90Mi" + }, + "requests": { + "cpu": "50m", + "memory": "90Mi" + } + }, + "terminationMessagePath": "/dev/termination-log", + "terminationMessagePolicy": "File", + "volumeMounts": [ + { + "mountPath": "/etc/config", + "name": "heapster-config-volume" + } + ] + } + ], + "dnsPolicy": "ClusterFirst", + "imagePullSecrets": [ + { + "name": "emptyacrsecret" + } + ], + "nodeSelector": { + "beta.kubernetes.io/os": "linux" + }, + "priorityClassName": "system-node-critical", + "restartPolicy": "Always", + "schedulerName": "default-scheduler", + "securityContext": {}, + "serviceAccount": "heapster", + "serviceAccountName": "heapster", + "terminationGracePeriodSeconds": 30, + "tolerations": [ + { + "key": "CriticalAddonsOnly", + "operator": "Exists" + } + ], + "volumes": [ + { + "configMap": { + "defaultMode": 420, + "name": "heapster-config" + }, + "name": "heapster-config-volume" + } + ] + } + } + }, + "status": { + "availableReplicas": 1, + "conditions": [ + { + "lastTransitionTime": "2019-08-23T17:12:00Z", + "lastUpdateTime": "2019-08-23T17:12:00Z", + "message": "Deployment has minimum availability.", + "reason": "MinimumReplicasAvailable", + "status": "True", + "type": "Available" + } + ], + "observedGeneration": 2, + "readyReplicas": 1, + "replicas": 1, + "updatedReplicas": 1 + } + }, + { + "apiVersion": "extensions/v1beta1", + "kind": "Deployment", + "metadata": { + "annotations": { + "deployment.kubernetes.io/revision": "5", + "kubectl.kubernetes.io/last-applied-configuration": 
"{\"apiVersion\":\"extensions/v1beta1\",\"kind\":\"Deployment\",\"metadata\":{\"annotations\":{},\"labels\":{\"addonmanager.kubernetes.io/mode\":\"Reconcile\",\"k8s-app\":\"kube-dns-autoscaler\",\"kubernetes.io/cluster-service\":\"true\"},\"name\":\"kube-dns-autoscaler\",\"namespace\":\"kube-system\"},\"spec\":{\"selector\":{\"matchLabels\":{\"k8s-app\":\"kube-dns-autoscaler\"}},\"template\":{\"metadata\":{\"annotations\":{\"scheduler.alpha.kubernetes.io/critical-pod\":\"\",\"seccomp.security.alpha.kubernetes.io/pod\":\"docker/default\"},\"labels\":{\"k8s-app\":\"kube-dns-autoscaler\"}},\"spec\":{\"containers\":[{\"command\":[\"/cluster-proportional-autoscaler\",\"--namespace=kube-system\",\"--configmap=kube-dns-autoscaler\",\"--target=deployment/kube-dns-v20\",\"--default-params={\\\"ladder\\\":{\\\"coresToReplicas\\\":[[1,2],[512,3],[1024,4],[2048,5]],\\\"nodesToReplicas\\\":[[1,2],[8,3],[16,4],[32,5]]}}\",\"--logtostderr=true\",\"--v=2\"],\"image\":\"aksrepos.azurecr.io/mirror/cluster-proportional-autoscaler-amd64:1.1.2-r2\",\"name\":\"autoscaler\",\"resources\":{\"requests\":{\"cpu\":\"20m\",\"memory\":\"10Mi\"}}}],\"dnsPolicy\":\"Default\",\"imagePullSecrets\":[{\"name\":\"emptyacrsecret\"}],\"priorityClassName\":\"system-node-critical\",\"serviceAccountName\":\"kube-dns-autoscaler\",\"tolerations\":[{\"key\":\"CriticalAddonsOnly\",\"operator\":\"Exists\"}]}}}}\n" + }, + "creationTimestamp": "2019-03-12T16:38:30Z", + "generation": 5, + "labels": { + "addonmanager.kubernetes.io/mode": "Reconcile", + "k8s-app": "kube-dns-autoscaler", + "kubernetes.io/cluster-service": "true" + }, + "name": "kube-dns-autoscaler", + "namespace": "kube-system", + "resourceVersion": "15144046", + "selfLink": "/apis/extensions/v1beta1/namespaces/kube-system/deployments/kube-dns-autoscaler", + "uid": "4509acaf-44e5-11e9-9920-423525a6b683" + }, + "spec": { + "progressDeadlineSeconds": 2147483647, + "replicas": 1, + "revisionHistoryLimit": 10, + "selector": { + "matchLabels": { + 
"k8s-app": "kube-dns-autoscaler" + } + }, + "strategy": { + "rollingUpdate": { + "maxSurge": 1, + "maxUnavailable": 1 + }, + "type": "RollingUpdate" + }, + "template": { + "metadata": { + "annotations": { + "scheduler.alpha.kubernetes.io/critical-pod": "", + "seccomp.security.alpha.kubernetes.io/pod": "docker/default" + }, + "creationTimestamp": null, + "labels": { + "k8s-app": "kube-dns-autoscaler" + } + }, + "spec": { + "containers": [ + { + "command": [ + "/cluster-proportional-autoscaler", + "--namespace=kube-system", + "--configmap=kube-dns-autoscaler", + "--target=deployment/kube-dns-v20", + "--default-params={\"ladder\":{\"coresToReplicas\":[[1,2],[512,3],[1024,4],[2048,5]],\"nodesToReplicas\":[[1,2],[8,3],[16,4],[32,5]]}}", + "--logtostderr=true", + "--v=2" + ], + "image": "aksrepos.azurecr.io/mirror/cluster-proportional-autoscaler-amd64:1.1.2-r2", + "imagePullPolicy": "IfNotPresent", + "name": "autoscaler", + "resources": { + "requests": { + "cpu": "20m", + "memory": "10Mi" + } + }, + "terminationMessagePath": "/dev/termination-log", + "terminationMessagePolicy": "File" + } + ], + "dnsPolicy": "Default", + "imagePullSecrets": [ + { + "name": "emptyacrsecret" + } + ], + "priorityClassName": "system-node-critical", + "restartPolicy": "Always", + "schedulerName": "default-scheduler", + "securityContext": {}, + "serviceAccount": "kube-dns-autoscaler", + "serviceAccountName": "kube-dns-autoscaler", + "terminationGracePeriodSeconds": 30, + "tolerations": [ + { + "key": "CriticalAddonsOnly", + "operator": "Exists" + } + ] + } + } + }, + "status": { + "availableReplicas": 1, + "conditions": [ + { + "lastTransitionTime": "2019-03-12T16:38:30Z", + "lastUpdateTime": "2019-03-12T16:38:30Z", + "message": "Deployment has minimum availability.", + "reason": "MinimumReplicasAvailable", + "status": "True", + "type": "Available" + } + ], + "observedGeneration": 5, + "readyReplicas": 1, + "replicas": 1, + "updatedReplicas": 1 + } + }, + { + "apiVersion": 
"extensions/v1beta1", + "kind": "Deployment", + "metadata": { + "annotations": { + "deployment.kubernetes.io/revision": "6", + "kubectl.kubernetes.io/last-applied-configuration": "{\"apiVersion\":\"extensions/v1beta1\",\"kind\":\"Deployment\",\"metadata\":{\"annotations\":{},\"labels\":{\"addonmanager.kubernetes.io/mode\":\"Reconcile\",\"k8s-app\":\"kube-dns\",\"kubernetes.io/cluster-service\":\"true\",\"version\":\"v20\"},\"name\":\"kube-dns-v20\",\"namespace\":\"kube-system\"},\"spec\":{\"selector\":{\"matchLabels\":{\"k8s-app\":\"kube-dns\",\"version\":\"v20\"}},\"template\":{\"metadata\":{\"annotations\":{\"prometheus.io/port\":\"10055\",\"prometheus.io/scrape\":\"true\"},\"labels\":{\"k8s-app\":\"kube-dns\",\"kubernetes.io/cluster-service\":\"true\",\"version\":\"v20\"}},\"spec\":{\"affinity\":{\"nodeAffinity\":{\"requiredDuringSchedulingIgnoredDuringExecution\":{\"nodeSelectorTerms\":[{\"labelSelector\":null,\"matchExpressions\":[{\"key\":\"kubernetes.azure.com/cluster\",\"operator\":\"Exists\"}]}]}},\"podAntiAffinity\":{\"preferredDuringSchedulingIgnoredDuringExecution\":[{\"podAffinityTerm\":{\"labelSelector\":{\"matchExpressions\":[{\"key\":\"k8s-app\",\"operator\":\"In\",\"values\":[\"kube-dns\"]}]},\"topologyKey\":\"kubernetes.io/hostname\"},\"weight\":100}]}},\"containers\":[{\"args\":[\"--kubecfg-file=/config/kubeconfig\",\"--config-dir=/kube-dns-config\",\"--domain=cluster.local.\",\"--dns-port=10053\",\"--v=2\"],\"env\":[{\"name\":\"PROMETHEUS_PORT\",\"value\":\"10055\"}],\"image\":\"aksrepos.azurecr.io/mirror/k8s-dns-kube-dns-amd64:1.14.13\",\"livenessProbe\":{\"failureThreshold\":5,\"httpGet\":{\"path\":\"/healthcheck/kubedns\",\"port\":10054,\"scheme\":\"HTTP\"},\"initialDelaySeconds\":60,\"successThreshold\":1,\"timeoutSeconds\":5},\"name\":\"kubedns\",\"ports\":[{\"containerPort\":10053,\"name\":\"dns-local\",\"protocol\":\"UDP\"},{\"containerPort\":10053,\"name\":\"dns-tcp-local\",\"protocol\":\"TCP\"},{\"containerPort\":10055,\"name\":\"metrics
\",\"protocol\":\"TCP\"}],\"readinessProbe\":{\"httpGet\":{\"path\":\"/readiness\",\"port\":8081,\"scheme\":\"HTTP\"},\"initialDelaySeconds\":30,\"timeoutSeconds\":5},\"resources\":{\"limits\":{\"memory\":\"170Mi\"},\"requests\":{\"cpu\":\"100m\",\"memory\":\"70Mi\"}},\"volumeMounts\":[{\"mountPath\":\"/kube-dns-config\",\"name\":\"kube-dns-config\"},{\"mountPath\":\"/config\",\"name\":\"kubedns-kubecfg\",\"readOnly\":true}]},{\"args\":[\"-v=2\",\"-logtostderr\",\"-configDir=/kube-dns-config\",\"-restartDnsmasq=true\",\"--\",\"-k\",\"--cache-size=1000\",\"--no-negcache\",\"--no-resolv\",\"--server=127.0.0.1#10053\",\"--server=/cluster.local/127.0.0.1#10053\",\"--server=/in-addr.arpa/127.0.0.1#10053\",\"--server=/ip6.arpa/127.0.0.1#10053\",\"--log-facility=-\"],\"image\":\"aksrepos.azurecr.io/mirror/k8s-dns-dnsmasq-nanny-amd64:1.14.10\",\"name\":\"dnsmasq\",\"ports\":[{\"containerPort\":53,\"name\":\"dns\",\"protocol\":\"UDP\"},{\"containerPort\":53,\"name\":\"dns-tcp\",\"protocol\":\"TCP\"}],\"volumeMounts\":[{\"mountPath\":\"/kube-dns-config\",\"name\":\"kube-dns-config\"}]},{\"args\":[\"--cmd=for d in $PROBE_DOMAINS; do nslookup $d 127.0.0.1 \\u003e/dev/null || exit 1; done\",\"--url=/healthz-dnsmasq\",\"--cmd=for d in $PROBE_DOMAINS; do nslookup $d 127.0.0.1:10053 \\u003e/dev/null || exit 1; done\",\"--url=/healthz-kubedns\",\"--port=8080\",\"--quiet\"],\"env\":[{\"name\":\"PROBE_DOMAINS\",\"value\":\"bing.com 
kubernetes.default.svc.cluster.local\"}],\"image\":\"aksrepos.azurecr.io/mirror/exechealthz-amd64:1.2\",\"livenessProbe\":{\"failureThreshold\":5,\"httpGet\":{\"path\":\"/healthz-dnsmasq\",\"port\":8080,\"scheme\":\"HTTP\"},\"initialDelaySeconds\":60,\"successThreshold\":1,\"timeoutSeconds\":5},\"name\":\"healthz\",\"ports\":[{\"containerPort\":8080,\"protocol\":\"TCP\"}],\"resources\":{\"limits\":{\"memory\":\"50Mi\"},\"requests\":{\"cpu\":\"10m\",\"memory\":\"50Mi\"}}},{\"args\":[\"--v=2\",\"--logtostderr\",\"--probe=kubedns,127.0.0.1:10053,kubernetes.default.svc.cluster.local,5,SRV\",\"--probe=dnsmasq,127.0.0.1:53,kubernetes.default.svc.cluster.local,5,SRV\"],\"image\":\"aksrepos.azurecr.io/mirror/k8s-dns-sidecar-amd64:1.14.10\",\"livenessProbe\":{\"httpGet\":{\"path\":\"/metrics\",\"port\":10054,\"scheme\":\"HTTP\"},\"initialDelaySeconds\":60,\"successThreshold\":1,\"timeoutSeconds\":5},\"name\":\"sidecar\",\"ports\":[{\"containerPort\":10054,\"name\":\"metrics\",\"protocol\":\"TCP\"}],\"resources\":{\"requests\":{\"cpu\":\"10m\",\"memory\":\"20Mi\"}}}],\"dnsPolicy\":\"Default\",\"imagePullSecrets\":[{\"name\":\"emptyacrsecret\"}],\"nodeSelector\":{\"beta.kubernetes.io/os\":\"linux\"},\"priorityClassName\":\"system-node-critical\",\"serviceAccountName\":\"kube-dns\",\"tolerations\":[{\"key\":\"CriticalAddonsOnly\",\"operator\":\"Exists\"}],\"volumes\":[{\"configMap\":{\"name\":\"kube-dns\",\"optional\":true},\"name\":\"kube-dns-config\"},{\"configMap\":{\"name\":\"kubedns-kubecfg\"},\"name\":\"kubedns-kubecfg\"}]}}}}\n" + }, + "creationTimestamp": "2019-03-12T16:38:30Z", + "generation": 7, + "labels": { + "addonmanager.kubernetes.io/mode": "Reconcile", + "k8s-app": "kube-dns", + "kubernetes.io/cluster-service": "true", + "version": "v20" + }, + "name": "kube-dns-v20", + "namespace": "kube-system", + "resourceVersion": "15144054", + "selfLink": "/apis/extensions/v1beta1/namespaces/kube-system/deployments/kube-dns-v20", + "uid": 
"4523fcd7-44e5-11e9-9920-423525a6b683" + }, + "spec": { + "progressDeadlineSeconds": 2147483647, + "replicas": 2, + "revisionHistoryLimit": 10, + "selector": { + "matchLabels": { + "k8s-app": "kube-dns", + "version": "v20" + } + }, + "strategy": { + "rollingUpdate": { + "maxSurge": 1, + "maxUnavailable": 1 + }, + "type": "RollingUpdate" + }, + "template": { + "metadata": { + "annotations": { + "prometheus.io/port": "10055", + "prometheus.io/scrape": "true" + }, + "creationTimestamp": null, + "labels": { + "k8s-app": "kube-dns", + "kubernetes.io/cluster-service": "true", + "version": "v20" + } + }, + "spec": { + "affinity": { + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "kubernetes.azure.com/cluster", + "operator": "Exists" + } + ] + } + ] + } + }, + "podAntiAffinity": { + "preferredDuringSchedulingIgnoredDuringExecution": [ + { + "podAffinityTerm": { + "labelSelector": { + "matchExpressions": [ + { + "key": "k8s-app", + "operator": "In", + "values": [ + "kube-dns" + ] + } + ] + }, + "topologyKey": "kubernetes.io/hostname" + }, + "weight": 100 + } + ] + } + }, + "containers": [ + { + "args": [ + "--kubecfg-file=/config/kubeconfig", + "--config-dir=/kube-dns-config", + "--domain=cluster.local.", + "--dns-port=10053", + "--v=2" + ], + "env": [ + { + "name": "PROMETHEUS_PORT", + "value": "10055" + } + ], + "image": "aksrepos.azurecr.io/mirror/k8s-dns-kube-dns-amd64:1.14.13", + "imagePullPolicy": "IfNotPresent", + "livenessProbe": { + "failureThreshold": 5, + "httpGet": { + "path": "/healthcheck/kubedns", + "port": 10054, + "scheme": "HTTP" + }, + "initialDelaySeconds": 60, + "periodSeconds": 10, + "successThreshold": 1, + "timeoutSeconds": 5 + }, + "name": "kubedns", + "ports": [ + { + "containerPort": 10053, + "name": "dns-local", + "protocol": "UDP" + }, + { + "containerPort": 10053, + "name": "dns-tcp-local", + "protocol": "TCP" + }, + { + "containerPort": 10055, + "name": 
"metrics", + "protocol": "TCP" + } + ], + "readinessProbe": { + "failureThreshold": 3, + "httpGet": { + "path": "/readiness", + "port": 8081, + "scheme": "HTTP" + }, + "initialDelaySeconds": 30, + "periodSeconds": 10, + "successThreshold": 1, + "timeoutSeconds": 5 + }, + "resources": { + "limits": { + "memory": "170Mi" + }, + "requests": { + "cpu": "100m", + "memory": "70Mi" + } + }, + "terminationMessagePath": "/dev/termination-log", + "terminationMessagePolicy": "File", + "volumeMounts": [ + { + "mountPath": "/kube-dns-config", + "name": "kube-dns-config" + }, + { + "mountPath": "/config", + "name": "kubedns-kubecfg", + "readOnly": true + } + ] + }, + { + "args": [ + "-v=2", + "-logtostderr", + "-configDir=/kube-dns-config", + "-restartDnsmasq=true", + "--", + "-k", + "--cache-size=1000", + "--no-negcache", + "--no-resolv", + "--server=127.0.0.1#10053", + "--server=/cluster.local/127.0.0.1#10053", + "--server=/in-addr.arpa/127.0.0.1#10053", + "--server=/ip6.arpa/127.0.0.1#10053", + "--log-facility=-" + ], + "image": "aksrepos.azurecr.io/mirror/k8s-dns-dnsmasq-nanny-amd64:1.14.10", + "imagePullPolicy": "IfNotPresent", + "name": "dnsmasq", + "ports": [ + { + "containerPort": 53, + "name": "dns", + "protocol": "UDP" + }, + { + "containerPort": 53, + "name": "dns-tcp", + "protocol": "TCP" + } + ], + "resources": {}, + "terminationMessagePath": "/dev/termination-log", + "terminationMessagePolicy": "File", + "volumeMounts": [ + { + "mountPath": "/kube-dns-config", + "name": "kube-dns-config" + } + ] + }, + { + "args": [ + "--cmd=for d in $PROBE_DOMAINS; do nslookup $d 127.0.0.1 \u003e/dev/null || exit 1; done", + "--url=/healthz-dnsmasq", + "--cmd=for d in $PROBE_DOMAINS; do nslookup $d 127.0.0.1:10053 \u003e/dev/null || exit 1; done", + "--url=/healthz-kubedns", + "--port=8080", + "--quiet" + ], + "env": [ + { + "name": "PROBE_DOMAINS", + "value": "bing.com kubernetes.default.svc.cluster.local" + } + ], + "image": "aksrepos.azurecr.io/mirror/exechealthz-amd64:1.2", + 
"imagePullPolicy": "IfNotPresent", + "livenessProbe": { + "failureThreshold": 5, + "httpGet": { + "path": "/healthz-dnsmasq", + "port": 8080, + "scheme": "HTTP" + }, + "initialDelaySeconds": 60, + "periodSeconds": 10, + "successThreshold": 1, + "timeoutSeconds": 5 + }, + "name": "healthz", + "ports": [ + { + "containerPort": 8080, + "protocol": "TCP" + } + ], + "resources": { + "limits": { + "memory": "50Mi" + }, + "requests": { + "cpu": "10m", + "memory": "50Mi" + } + }, + "terminationMessagePath": "/dev/termination-log", + "terminationMessagePolicy": "File" + }, + { + "args": [ + "--v=2", + "--logtostderr", + "--probe=kubedns,127.0.0.1:10053,kubernetes.default.svc.cluster.local,5,SRV", + "--probe=dnsmasq,127.0.0.1:53,kubernetes.default.svc.cluster.local,5,SRV" + ], + "image": "aksrepos.azurecr.io/mirror/k8s-dns-sidecar-amd64:1.14.10", + "imagePullPolicy": "IfNotPresent", + "livenessProbe": { + "failureThreshold": 3, + "httpGet": { + "path": "/metrics", + "port": 10054, + "scheme": "HTTP" + }, + "initialDelaySeconds": 60, + "periodSeconds": 10, + "successThreshold": 1, + "timeoutSeconds": 5 + }, + "name": "sidecar", + "ports": [ + { + "containerPort": 10054, + "name": "metrics", + "protocol": "TCP" + } + ], + "resources": { + "requests": { + "cpu": "10m", + "memory": "20Mi" + } + }, + "terminationMessagePath": "/dev/termination-log", + "terminationMessagePolicy": "File" + } + ], + "dnsPolicy": "Default", + "imagePullSecrets": [ + { + "name": "emptyacrsecret" + } + ], + "nodeSelector": { + "beta.kubernetes.io/os": "linux" + }, + "priorityClassName": "system-node-critical", + "restartPolicy": "Always", + "schedulerName": "default-scheduler", + "securityContext": {}, + "serviceAccount": "kube-dns", + "serviceAccountName": "kube-dns", + "terminationGracePeriodSeconds": 30, + "tolerations": [ + { + "key": "CriticalAddonsOnly", + "operator": "Exists" + } + ], + "volumes": [ + { + "configMap": { + "defaultMode": 420, + "name": "kube-dns", + "optional": true + }, + 
"name": "kube-dns-config" + }, + { + "configMap": { + "defaultMode": 420, + "name": "kubedns-kubecfg" + }, + "name": "kubedns-kubecfg" + } + ] + } + } + }, + "status": { + "availableReplicas": 2, + "conditions": [ + { + "lastTransitionTime": "2019-07-23T14:46:03Z", + "lastUpdateTime": "2019-07-23T14:46:03Z", + "message": "Deployment has minimum availability.", + "reason": "MinimumReplicasAvailable", + "status": "True", + "type": "Available" + } + ], + "observedGeneration": 7, + "readyReplicas": 2, + "replicas": 2, + "updatedReplicas": 2 + } + }, + { + "apiVersion": "extensions/v1beta1", + "kind": "Deployment", + "metadata": { + "annotations": { + "deployment.kubernetes.io/revision": "6", + "kubectl.kubernetes.io/last-applied-configuration": "{\"apiVersion\":\"extensions/v1beta1\",\"kind\":\"Deployment\",\"metadata\":{\"annotations\":{},\"labels\":{\"addonmanager.kubernetes.io/mode\":\"Reconcile\",\"k8s-app\":\"kubernetes-dashboard\",\"kubernetes.io/cluster-service\":\"true\"},\"name\":\"kubernetes-dashboard\",\"namespace\":\"kube-system\"},\"spec\":{\"replicas\":1,\"strategy\":{\"rollingUpdate\":{\"maxSurge\":0,\"maxUnavailable\":1},\"type\":\"RollingUpdate\"},\"template\":{\"metadata\":{\"labels\":{\"k8s-app\":\"kubernetes-dashboard\",\"kubernetes.io/cluster-service\":\"true\"}},\"spec\":{\"affinity\":{\"nodeAffinity\":{\"requiredDuringSchedulingIgnoredDuringExecution\":{\"nodeSelectorTerms\":[{\"labelSelector\":null,\"matchExpressions\":[{\"key\":\"kubernetes.azure.com/cluster\",\"operator\":\"Exists\"}]}]}}},\"containers\":[{\"image\":\"aksrepos.azurecr.io/mirror/kubernetes-dashboard-amd64:v1.10.1\",\"livenessProbe\":{\"failureThreshold\":3,\"httpGet\":{\"path\":\"/\",\"port\":9090,\"scheme\":\"HTTP\"},\"initialDelaySeconds\":30,\"periodSeconds\":10,\"successThreshold\":1,\"timeoutSeconds\":30},\"name\":\"main\",\"ports\":[{\"containerPort\":9090,\"name\":\"http\",\"protocol\":\"TCP\"}],\"resources\":{\"limits\":{\"cpu\":\"100m\",\"memory\":\"500Mi\"},\"requests\
":{\"cpu\":\"100m\",\"memory\":\"50Mi\"}}}],\"imagePullSecrets\":[{\"name\":\"emptyacrsecret\"}],\"nodeSelector\":{\"beta.kubernetes.io/os\":\"linux\"},\"priorityClassName\":\"system-node-critical\",\"serviceAccountName\":\"kubernetes-dashboard\",\"tolerations\":[{\"key\":\"CriticalAddonsOnly\",\"operator\":\"Exists\"}]}}}}\n" + }, + "creationTimestamp": "2019-03-12T16:38:31Z", + "generation": 6, + "labels": { + "addonmanager.kubernetes.io/mode": "Reconcile", + "k8s-app": "kubernetes-dashboard", + "kubernetes.io/cluster-service": "true" + }, + "name": "kubernetes-dashboard", + "namespace": "kube-system", + "resourceVersion": "15831521", + "selfLink": "/apis/extensions/v1beta1/namespaces/kube-system/deployments/kubernetes-dashboard", + "uid": "45b9cc8d-44e5-11e9-9920-423525a6b683" + }, + "spec": { + "progressDeadlineSeconds": 2147483647, + "replicas": 1, + "revisionHistoryLimit": 10, + "selector": { + "matchLabels": { + "k8s-app": "kubernetes-dashboard", + "kubernetes.io/cluster-service": "true" + } + }, + "strategy": { + "rollingUpdate": { + "maxSurge": 0, + "maxUnavailable": 1 + }, + "type": "RollingUpdate" + }, + "template": { + "metadata": { + "creationTimestamp": null, + "labels": { + "k8s-app": "kubernetes-dashboard", + "kubernetes.io/cluster-service": "true" + } + }, + "spec": { + "affinity": { + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "kubernetes.azure.com/cluster", + "operator": "Exists" + } + ] + } + ] + } + } + }, + "containers": [ + { + "image": "aksrepos.azurecr.io/mirror/kubernetes-dashboard-amd64:v1.10.1", + "imagePullPolicy": "IfNotPresent", + "livenessProbe": { + "failureThreshold": 3, + "httpGet": { + "path": "/", + "port": 9090, + "scheme": "HTTP" + }, + "initialDelaySeconds": 30, + "periodSeconds": 10, + "successThreshold": 1, + "timeoutSeconds": 30 + }, + "name": "main", + "ports": [ + { + "containerPort": 9090, + "name": "http", + "protocol": "TCP" 
+ } + ], + "resources": { + "limits": { + "cpu": "100m", + "memory": "500Mi" + }, + "requests": { + "cpu": "100m", + "memory": "50Mi" + } + }, + "terminationMessagePath": "/dev/termination-log", + "terminationMessagePolicy": "File" + } + ], + "dnsPolicy": "ClusterFirst", + "imagePullSecrets": [ + { + "name": "emptyacrsecret" + } + ], + "nodeSelector": { + "beta.kubernetes.io/os": "linux" + }, + "priorityClassName": "system-node-critical", + "restartPolicy": "Always", + "schedulerName": "default-scheduler", + "securityContext": {}, + "serviceAccount": "kubernetes-dashboard", + "serviceAccountName": "kubernetes-dashboard", + "terminationGracePeriodSeconds": 30, + "tolerations": [ + { + "key": "CriticalAddonsOnly", + "operator": "Exists" + } + ] + } + } + }, + "status": { + "availableReplicas": 1, + "conditions": [ + { + "lastTransitionTime": "2019-03-12T16:38:32Z", + "lastUpdateTime": "2019-03-12T16:38:32Z", + "message": "Deployment has minimum availability.", + "reason": "MinimumReplicasAvailable", + "status": "True", + "type": "Available" + } + ], + "observedGeneration": 6, + "readyReplicas": 1, + "replicas": 1, + "updatedReplicas": 1 + } + }, + { + "apiVersion": "extensions/v1beta1", + "kind": "Deployment", + "metadata": { + "annotations": { + "deployment.kubernetes.io/revision": "5", + "kubectl.kubernetes.io/last-applied-configuration": 
"{\"apiVersion\":\"extensions/v1beta1\",\"kind\":\"Deployment\",\"metadata\":{\"annotations\":{},\"labels\":{\"addonmanager.kubernetes.io/mode\":\"Reconcile\",\"k8s-app\":\"metrics-server\",\"kubernetes.io/cluster-service\":\"true\"},\"name\":\"metrics-server\",\"namespace\":\"kube-system\"},\"spec\":{\"selector\":{\"matchLabels\":{\"k8s-app\":\"metrics-server\"}},\"template\":{\"metadata\":{\"labels\":{\"k8s-app\":\"metrics-server\"},\"name\":\"metrics-server\"},\"spec\":{\"affinity\":{\"nodeAffinity\":{\"requiredDuringSchedulingIgnoredDuringExecution\":{\"nodeSelectorTerms\":[{\"labelSelector\":null,\"matchExpressions\":[{\"key\":\"kubernetes.azure.com/cluster\",\"operator\":\"Exists\"}]}]}}},\"containers\":[{\"command\":[\"/metrics-server\",\"--source=kubernetes.summary_api:''\"],\"image\":\"aksrepos.azurecr.io/mirror/metrics-server-amd64:v0.2.1\",\"imagePullPolicy\":\"IfNotPresent\",\"name\":\"metrics-server\"}],\"imagePullSecrets\":[{\"name\":\"emptyacrsecret\"}],\"nodeSelector\":{\"beta.kubernetes.io/os\":\"linux\"},\"priorityClassName\":\"system-node-critical\",\"serviceAccountName\":\"metrics-server\",\"tolerations\":[{\"key\":\"CriticalAddonsOnly\",\"operator\":\"Exists\"}]}}}}\n" + }, + "creationTimestamp": "2019-03-12T16:38:31Z", + "generation": 5, + "labels": { + "addonmanager.kubernetes.io/mode": "Reconcile", + "k8s-app": "metrics-server", + "kubernetes.io/cluster-service": "true" + }, + "name": "metrics-server", + "namespace": "kube-system", + "resourceVersion": "15144043", + "selfLink": "/apis/extensions/v1beta1/namespaces/kube-system/deployments/metrics-server", + "uid": "45556857-44e5-11e9-9920-423525a6b683" + }, + "spec": { + "progressDeadlineSeconds": 2147483647, + "replicas": 1, + "revisionHistoryLimit": 10, + "selector": { + "matchLabels": { + "k8s-app": "metrics-server" + } + }, + "strategy": { + "rollingUpdate": { + "maxSurge": 1, + "maxUnavailable": 1 + }, + "type": "RollingUpdate" + }, + "template": { + "metadata": { + "creationTimestamp": 
null, + "labels": { + "k8s-app": "metrics-server" + }, + "name": "metrics-server" + }, + "spec": { + "affinity": { + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "kubernetes.azure.com/cluster", + "operator": "Exists" + } + ] + } + ] + } + } + }, + "containers": [ + { + "command": [ + "/metrics-server", + "--source=kubernetes.summary_api:''" + ], + "image": "aksrepos.azurecr.io/mirror/metrics-server-amd64:v0.2.1", + "imagePullPolicy": "IfNotPresent", + "name": "metrics-server", + "resources": {}, + "terminationMessagePath": "/dev/termination-log", + "terminationMessagePolicy": "File" + } + ], + "dnsPolicy": "ClusterFirst", + "imagePullSecrets": [ + { + "name": "emptyacrsecret" + } + ], + "nodeSelector": { + "beta.kubernetes.io/os": "linux" + }, + "priorityClassName": "system-node-critical", + "restartPolicy": "Always", + "schedulerName": "default-scheduler", + "securityContext": {}, + "serviceAccount": "metrics-server", + "serviceAccountName": "metrics-server", + "terminationGracePeriodSeconds": 30, + "tolerations": [ + { + "key": "CriticalAddonsOnly", + "operator": "Exists" + } + ] + } + } + }, + "status": { + "availableReplicas": 1, + "conditions": [ + { + "lastTransitionTime": "2019-03-12T16:38:31Z", + "lastUpdateTime": "2019-03-12T16:38:31Z", + "message": "Deployment has minimum availability.", + "reason": "MinimumReplicasAvailable", + "status": "True", + "type": "Available" + } + ], + "observedGeneration": 5, + "readyReplicas": 1, + "replicas": 1, + "updatedReplicas": 1 + } + }, + { + "apiVersion": "extensions/v1beta1", + "kind": "Deployment", + "metadata": { + "annotations": { + "deployment.kubernetes.io/revision": "7", + "kubectl.kubernetes.io/last-applied-configuration": 
"{\"apiVersion\":\"extensions/v1beta1\",\"kind\":\"Deployment\",\"metadata\":{\"annotations\":{},\"name\":\"omsagent-rs\",\"namespace\":\"kube-system\"},\"spec\":{\"replicas\":1,\"selector\":{\"matchLabels\":{\"rsName\":\"omsagent-rs\"}},\"strategy\":{\"type\":\"RollingUpdate\"},\"template\":{\"metadata\":{\"annotations\":{\"agentVersion\":\"1.10.0.1\",\"dockerProviderVersion\":\"6.0.0-0\",\"schema-versions\":\"v1\"},\"labels\":{\"rsName\":\"omsagent-rs\"}},\"spec\":{\"containers\":[{\"env\":[{\"name\":\"AKS_RESOURCE_ID\",\"value\":\"/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test\"},{\"name\":\"AKS_REGION\",\"value\":\"eastus\"},{\"name\":\"CONTROLLER_TYPE\",\"value\":\"ReplicaSet\"},{\"name\":\"NODE_IP\",\"valueFrom\":{\"fieldRef\":{\"fieldPath\":\"status.hostIP\"}}}],\"image\":\"mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod08222019\",\"imagePullPolicy\":\"IfNotPresent\",\"livenessProbe\":{\"exec\":{\"command\":[\"/bin/bash\",\"-c\",\"/opt/livenessprobe.sh\"]},\"initialDelaySeconds\":60,\"periodSeconds\":60},\"name\":\"omsagent\",\"ports\":[{\"containerPort\":25225,\"protocol\":\"TCP\"},{\"containerPort\":25224,\"protocol\":\"UDP\"},{\"containerPort\":25227,\"name\":\"in-rs-tcp\",\"protocol\":\"TCP\"}],\"resources\":{\"limits\":{\"cpu\":\"150m\",\"memory\":\"500Mi\"},\"requests\":{\"cpu\":\"110m\",\"memory\":\"250Mi\"}},\"securityContext\":{\"privileged\":true},\"volumeMounts\":[{\"mountPath\":\"/var/run/host\",\"name\":\"docker-sock\"},{\"mountPath\":\"/var/log\",\"name\":\"host-log\"},{\"mountPath\":\"/var/lib/docker/containers\",\"name\":\"containerlog-path\"},{\"mountPath\":\"/etc/kubernetes/host\",\"name\":\"azure-json-path\"},{\"mountPath\":\"/etc/omsagent-secret\",\"name\":\"omsagent-secret\",\"readOnly\":true},{\"mountPath\":\"/etc/config\",\"name\":\"omsagent-rs-config\"},{\"mountPath\":\"/etc/config/settings\",\"name\":\"settings
-vol-config\",\"readOnly\":true}]}],\"nodeSelector\":{\"beta.kubernetes.io/os\":\"linux\",\"kubernetes.io/role\":\"agent\"},\"serviceAccountName\":\"omsagent\",\"volumes\":[{\"hostPath\":{\"path\":\"/var/run\"},\"name\":\"docker-sock\"},{\"hostPath\":{\"path\":\"/etc/hostname\"},\"name\":\"container-hostname\"},{\"hostPath\":{\"path\":\"/var/log\"},\"name\":\"host-log\"},{\"hostPath\":{\"path\":\"/var/lib/docker/containers\"},\"name\":\"containerlog-path\"},{\"hostPath\":{\"path\":\"/etc/kubernetes\"},\"name\":\"azure-json-path\"},{\"name\":\"omsagent-secret\",\"secret\":{\"secretName\":\"omsagent-secret\"}},{\"configMap\":{\"name\":\"omsagent-rs-config\"},\"name\":\"omsagent-rs-config\"},{\"configMap\":{\"name\":\"container-azm-ms-agentconfig\",\"optional\":true},\"name\":\"settings-vol-config\"}]}}}}\n" + }, + "creationTimestamp": "2019-08-19T22:44:22Z", + "generation": 7, + "labels": { + "rsName": "omsagent-rs" + }, + "name": "omsagent-rs", + "namespace": "kube-system", + "resourceVersion": "19063500", + "selfLink": "/apis/extensions/v1beta1/namespaces/kube-system/deployments/omsagent-rs", + "uid": "e32d7e82-c2d2-11e9-8736-86290fd7dd1f" + }, + "spec": { + "progressDeadlineSeconds": 2147483647, + "replicas": 1, + "revisionHistoryLimit": 10, + "selector": { + "matchLabels": { + "rsName": "omsagent-rs" + } + }, + "strategy": { + "rollingUpdate": { + "maxSurge": 1, + "maxUnavailable": 1 + }, + "type": "RollingUpdate" + }, + "template": { + "metadata": { + "annotations": { + "agentVersion": "1.10.0.1", + "dockerProviderVersion": "6.0.0-0", + "schema-versions": "v1" + }, + "creationTimestamp": null, + "labels": { + "rsName": "omsagent-rs" + } + }, + "spec": { + "containers": [ + { + "env": [ + { + "name": "AKS_RESOURCE_ID", + "value": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test" + }, + { + "name": "AKS_REGION", + "value": "eastus" + }, + { + "name": 
"CONTROLLER_TYPE", + "value": "ReplicaSet" + }, + { + "name": "NODE_IP", + "valueFrom": { + "fieldRef": { + "apiVersion": "v1", + "fieldPath": "status.hostIP" + } + } + } + ], + "image": "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod08222019", + "imagePullPolicy": "IfNotPresent", + "livenessProbe": { + "exec": { + "command": [ + "/bin/bash", + "-c", + "/opt/livenessprobe.sh" + ] + }, + "failureThreshold": 3, + "initialDelaySeconds": 60, + "periodSeconds": 60, + "successThreshold": 1, + "timeoutSeconds": 1 + }, + "name": "omsagent", + "ports": [ + { + "containerPort": 25225, + "protocol": "TCP" + }, + { + "containerPort": 25224, + "protocol": "UDP" + }, + { + "containerPort": 25227, + "name": "in-rs-tcp", + "protocol": "TCP" + } + ], + "resources": { + "limits": { + "cpu": "150m", + "memory": "500Mi" + }, + "requests": { + "cpu": "110m", + "memory": "250Mi" + } + }, + "securityContext": { + "privileged": true + }, + "terminationMessagePath": "/dev/termination-log", + "terminationMessagePolicy": "File", + "volumeMounts": [ + { + "mountPath": "/var/run/host", + "name": "docker-sock" + }, + { + "mountPath": "/var/log", + "name": "host-log" + }, + { + "mountPath": "/var/lib/docker/containers", + "name": "containerlog-path" + }, + { + "mountPath": "/etc/kubernetes/host", + "name": "azure-json-path" + }, + { + "mountPath": "/etc/omsagent-secret", + "name": "omsagent-secret", + "readOnly": true + }, + { + "mountPath": "/etc/config", + "name": "omsagent-rs-config" + }, + { + "mountPath": "/etc/config/settings", + "name": "settings-vol-config", + "readOnly": true + } + ] + } + ], + "dnsPolicy": "ClusterFirst", + "nodeSelector": { + "beta.kubernetes.io/os": "linux", + "kubernetes.io/role": "agent" + }, + "restartPolicy": "Always", + "schedulerName": "default-scheduler", + "securityContext": {}, + "serviceAccount": "omsagent", + "serviceAccountName": "omsagent", + "terminationGracePeriodSeconds": 30, + "volumes": [ + { + "hostPath": { + "path": "/var/run", + 
"type": "" + }, + "name": "docker-sock" + }, + { + "hostPath": { + "path": "/etc/hostname", + "type": "" + }, + "name": "container-hostname" + }, + { + "hostPath": { + "path": "/var/log", + "type": "" + }, + "name": "host-log" + }, + { + "hostPath": { + "path": "/var/lib/docker/containers", + "type": "" + }, + "name": "containerlog-path" + }, + { + "hostPath": { + "path": "/etc/kubernetes", + "type": "" + }, + "name": "azure-json-path" + }, + { + "name": "omsagent-secret", + "secret": { + "defaultMode": 420, + "secretName": "omsagent-secret" + } + }, + { + "configMap": { + "defaultMode": 420, + "name": "omsagent-rs-config" + }, + "name": "omsagent-rs-config" + }, + { + "configMap": { + "defaultMode": 420, + "name": "container-azm-ms-agentconfig", + "optional": true + }, + "name": "settings-vol-config" + } + ] + } + } + }, + "status": { + "availableReplicas": 1, + "conditions": [ + { + "lastTransitionTime": "2019-08-19T22:44:22Z", + "lastUpdateTime": "2019-08-19T22:44:22Z", + "message": "Deployment has minimum availability.", + "reason": "MinimumReplicasAvailable", + "status": "True", + "type": "Available" + } + ], + "observedGeneration": 7, + "readyReplicas": 1, + "replicas": 1, + "updatedReplicas": 1 + } + }, + { + "apiVersion": "extensions/v1beta1", + "kind": "Deployment", + "metadata": { + "annotations": { + "deployment.kubernetes.io/revision": "9", + "kubectl.kubernetes.io/last-applied-configuration": 
"{\"apiVersion\":\"extensions/v1beta1\",\"kind\":\"Deployment\",\"metadata\":{\"annotations\":{},\"labels\":{\"addonmanager.kubernetes.io/mode\":\"Reconcile\",\"component\":\"tunnel\",\"kubernetes.io/cluster-service\":\"true\",\"tier\":\"node\"},\"name\":\"tunnelfront\",\"namespace\":\"kube-system\"},\"spec\":{\"replicas\":1,\"selector\":{\"matchLabels\":{\"component\":\"tunnel\"}},\"template\":{\"metadata\":{\"labels\":{\"component\":\"tunnel\"}},\"spec\":{\"affinity\":{\"nodeAffinity\":{\"requiredDuringSchedulingIgnoredDuringExecution\":{\"nodeSelectorTerms\":[{\"labelSelector\":null,\"matchExpressions\":[{\"key\":\"kubernetes.azure.com/cluster\",\"operator\":\"Exists\"}]}]}}},\"containers\":[{\"env\":[{\"name\":\"OVERRIDE_TUNNEL_SERVER_NAME\",\"value\":\"t_dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io\"},{\"name\":\"TUNNEL_CLUSTERUSER_NAME\",\"value\":\"28957308\"},{\"name\":\"TUNNELGATEWAY_SERVER_NAME\",\"value\":\"dilipr-hea-dilipr-health-te-72c8e8-0b16acad.tun.eastus.azmk8s.io\"},{\"name\":\"TUNNELGATEWAY_SSH_PORT\",\"value\":\"22\"},{\"name\":\"TUNNELGATEWAY_TLS_PORT\",\"value\":\"443\"},{\"name\":\"KUBE_CONFIG\",\"value\":\"/etc/kubernetes/kubeconfig/kubeconfig\"}],\"image\":\"aksrepos.azurecr.io/prod/hcp-tunnel-front:v1.9.2-v4.0.7\",\"imagePullPolicy\":\"IfNotPresent\",\"livenessProbe\":{\"exec\":{\"command\":[\"/lib/tunnel-front/check-tunnel-connection.sh\"]},\"failureThreshold\":12,\"initialDelaySeconds\":10,\"periodSeconds\":60},\"name\":\"tunnel-front\",\"resources\":{\"requests\":{\"cpu\":\"10m\",\"memory\":\"64Mi\"}},\"securityContext\":{\"privileged\":true},\"volumeMounts\":[{\"mountPath\":\"/etc/kubernetes/kubeconfig\",\"name\":\"kubeconfig\",\"readOnly\":true},{\"mountPath\":\"/etc/kubernetes/certs\",\"name\":\"certificates\",\"readOnly\":true}]}],\"dnsPolicy\":\"Default\",\"imagePullSecrets\":[{\"name\":\"emptyacrsecret\"}],\"nodeSelector\":{\"beta.kubernetes.io/os\":\"linux\"},\"priorityClassName\":\"system-node-critical\",\"se
rviceAccountName\":\"tunnelfront\",\"tolerations\":[{\"key\":\"CriticalAddonsOnly\",\"operator\":\"Exists\"}],\"volumes\":[{\"configMap\":{\"name\":\"tunnelfront-kubecfg\",\"optional\":true},\"name\":\"kubeconfig\"},{\"hostPath\":{\"path\":\"/etc/kubernetes/certs\"},\"name\":\"certificates\"}]}}}}\n" + }, + "creationTimestamp": "2019-03-12T16:38:32Z", + "generation": 9, + "labels": { + "addonmanager.kubernetes.io/mode": "Reconcile", + "component": "tunnel", + "kubernetes.io/cluster-service": "true", + "tier": "node" + }, + "name": "tunnelfront", + "namespace": "kube-system", + "resourceVersion": "17628811", + "selfLink": "/apis/extensions/v1beta1/namespaces/kube-system/deployments/tunnelfront", + "uid": "45e524e6-44e5-11e9-9920-423525a6b683" + }, + "spec": { + "progressDeadlineSeconds": 2147483647, + "replicas": 1, + "revisionHistoryLimit": 10, + "selector": { + "matchLabels": { + "component": "tunnel" + } + }, + "strategy": { + "rollingUpdate": { + "maxSurge": 1, + "maxUnavailable": 1 + }, + "type": "RollingUpdate" + }, + "template": { + "metadata": { + "creationTimestamp": null, + "labels": { + "component": "tunnel" + } + }, + "spec": { + "affinity": { + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "kubernetes.azure.com/cluster", + "operator": "Exists" + } + ] + } + ] + } + } + }, + "containers": [ + { + "env": [ + { + "name": "OVERRIDE_TUNNEL_SERVER_NAME", + "value": "t_dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + }, + { + "name": "TUNNEL_CLUSTERUSER_NAME", + "value": "28957308" + }, + { + "name": "TUNNELGATEWAY_SERVER_NAME", + "value": "dilipr-hea-dilipr-health-te-72c8e8-0b16acad.tun.eastus.azmk8s.io" + }, + { + "name": "TUNNELGATEWAY_SSH_PORT", + "value": "22" + }, + { + "name": "TUNNELGATEWAY_TLS_PORT", + "value": "443" + }, + { + "name": "KUBE_CONFIG", + "value": "/etc/kubernetes/kubeconfig/kubeconfig" + } + ], + "image": 
"aksrepos.azurecr.io/prod/hcp-tunnel-front:v1.9.2-v4.0.7", + "imagePullPolicy": "IfNotPresent", + "livenessProbe": { + "exec": { + "command": [ + "/lib/tunnel-front/check-tunnel-connection.sh" + ] + }, + "failureThreshold": 12, + "initialDelaySeconds": 10, + "periodSeconds": 60, + "successThreshold": 1, + "timeoutSeconds": 1 + }, + "name": "tunnel-front", + "resources": { + "requests": { + "cpu": "10m", + "memory": "64Mi" + } + }, + "securityContext": { + "privileged": true + }, + "terminationMessagePath": "/dev/termination-log", + "terminationMessagePolicy": "File", + "volumeMounts": [ + { + "mountPath": "/etc/kubernetes/kubeconfig", + "name": "kubeconfig", + "readOnly": true + }, + { + "mountPath": "/etc/kubernetes/certs", + "name": "certificates", + "readOnly": true + } + ] + } + ], + "dnsPolicy": "Default", + "imagePullSecrets": [ + { + "name": "emptyacrsecret" + } + ], + "nodeSelector": { + "beta.kubernetes.io/os": "linux" + }, + "priorityClassName": "system-node-critical", + "restartPolicy": "Always", + "schedulerName": "default-scheduler", + "securityContext": {}, + "serviceAccount": "tunnelfront", + "serviceAccountName": "tunnelfront", + "terminationGracePeriodSeconds": 30, + "tolerations": [ + { + "key": "CriticalAddonsOnly", + "operator": "Exists" + } + ], + "volumes": [ + { + "configMap": { + "defaultMode": 420, + "name": "tunnelfront-kubecfg", + "optional": true + }, + "name": "kubeconfig" + }, + { + "hostPath": { + "path": "/etc/kubernetes/certs", + "type": "" + }, + "name": "certificates" + } + ] + } + } + }, + "status": { + "availableReplicas": 1, + "conditions": [ + { + "lastTransitionTime": "2019-03-12T16:38:32Z", + "lastUpdateTime": "2019-03-12T16:38:32Z", + "message": "Deployment has minimum availability.", + "reason": "MinimumReplicasAvailable", + "status": "True", + "type": "Available" + } + ], + "observedGeneration": 9, + "readyReplicas": 1, + "replicas": 1, + "updatedReplicas": 1 + } + } + ], + "kind": "List", + "metadata": { + 
"resourceVersion": "", + "selfLink": "" + } +} diff --git a/test/code/plugin/health/health_container_cpu_memory_aggregator_spec.rb b/test/code/plugin/health/health_container_cpu_memory_aggregator_spec.rb new file mode 100644 index 000000000..074878fe2 --- /dev/null +++ b/test/code/plugin/health/health_container_cpu_memory_aggregator_spec.rb @@ -0,0 +1,190 @@ +require_relative '../test_helpers' +Dir[File.join(File.expand_path(File.dirname(__FILE__)), "../../../../source/code/plugin/health/*.rb")].reject{|f| f.include?('health_monitor_utils')}.each { |file| require file } +include HealthModel + +describe 'HealthContainerCpuMemoryAggregator spec' do + + it 'dedupes and drops older records' do + formatted_records = JSON.parse'[{ + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/952488f3-a1f2-11e9-8b08-d602e29755d5/sidecar", + "CounterName": "memoryRssBytes", + "CounterValue": 14061568, + "Timestamp": "2019-08-23T23:13:39Z" + }, + { + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/952488f3-a1f2-11e9-8b08-d602e29755d5/sidecar", + "CounterName": "memoryRssBytes", + "CounterValue": 14061568, + "Timestamp": "2019-08-23T22:13:39Z" + }]' + + resources = HealthKubernetesResources.instance + nodes = JSON.parse(File.read(File.join(File.expand_path(File.dirname(__FILE__)),'nodes.json'))) + pods = JSON.parse(File.read(File.join(File.expand_path(File.dirname(__FILE__)),'pods.json'))) + deployments = JSON.parse(File.read(File.join(File.expand_path(File.dirname(__FILE__)),'deployments.json'))) + + resources.pod_inventory = pods + resources.node_inventory = nodes + resources.set_deployment_inventory(deployments) + resources.build_pod_uid_lookup #call this in in_kube_health every min + + cluster_labels = { + 
'container.azm.ms/cluster-region' => 'eastus', + 'container.azm.ms/cluster-subscription-id' => '72c8e8ca-dc16-47dc-b65c-6b5875eb600a', + 'container.azm.ms/cluster-resource-group' => 'dilipr-health-test', + 'container.azm.ms/cluster-name' => 'dilipr-health-test' + } + cluster_id = 'fake_cluster_id' + provider = HealthMonitorProvider.new(cluster_id, cluster_labels, resources, File.join(__dir__, "../../../../installer/conf/healthmonitorconfig.json")) + aggregator = HealthContainerCpuMemoryAggregator.new(resources, provider) + deduped_records = aggregator.dedupe_records(formatted_records) + deduped_records.size.must_equal 1 + deduped_records[0]["Timestamp"].must_equal "2019-08-23T23:13:39Z" + end + + it 'aggregates based on container name' do + file = File.read(File.join(File.expand_path(File.dirname(__FILE__)),'cadvisor_perf.json')) + records = JSON.parse(file) + records = records.select{|record| record['DataItems'][0]['ObjectName'] == 'K8SContainer'} + formatted_records = [] + formatter = HealthContainerCpuMemoryRecordFormatter.new + records.each{|record| + formatted_record = formatter.get_record_from_cadvisor_record(record) + formatted_records.push(formatted_record) + } + + resources = HealthKubernetesResources.instance + nodes = JSON.parse(File.read(File.join(File.expand_path(File.dirname(__FILE__)),'nodes.json'))) + pods = JSON.parse(File.read(File.join(File.expand_path(File.dirname(__FILE__)),'pods.json'))) + deployments = JSON.parse(File.read(File.join(File.expand_path(File.dirname(__FILE__)),'deployments.json'))) + + resources.pod_inventory = pods + resources.node_inventory = nodes + resources.set_deployment_inventory(deployments) + resources.build_pod_uid_lookup #call this in in_kube_health every min + + cluster_labels = { + 'container.azm.ms/cluster-region' => 'eastus', + 'container.azm.ms/cluster-subscription-id' => '72c8e8ca-dc16-47dc-b65c-6b5875eb600a', + 'container.azm.ms/cluster-resource-group' => 'dilipr-health-test', + 'container.azm.ms/cluster-name' 
=> 'dilipr-health-test' + } + + cluster_id = 'fake_cluster_id' + + provider = HealthMonitorProvider.new(cluster_id, cluster_labels, resources, File.join(__dir__, "../../../../installer/conf/healthmonitorconfig.json")) + + aggregator = HealthContainerCpuMemoryAggregator.new(resources, provider) + deduped_records = aggregator.dedupe_records(formatted_records) + aggregator.aggregate(deduped_records) + aggregator.compute_state + records = aggregator.get_records + records.size.must_equal 30 + #records have all the required details + records.each{|record| + record["Details"]["details"]["container"].wont_be_nil + record["Details"]["details"]["workload_name"].wont_be_nil + record["Details"]["details"]["workload_kind"].wont_be_nil + record["Details"]["details"]["namespace"].wont_be_nil + record["Details"]["timestamp"].wont_be_nil + record["Details"]["state"].wont_be_nil + record["MonitorTypeId"].wont_be_nil + record["MonitorInstanceId"].wont_be_nil + record["TimeFirstObserved"].wont_be_nil + record["TimeGenerated"].wont_be_nil + } + end + + it "calculates the state correctly" do + file = File.read(File.join(File.expand_path(File.dirname(__FILE__)),'cadvisor_perf.json')) + records = JSON.parse(file) + records = records.select{|record| record['DataItems'][0]['ObjectName'] == 'K8SContainer'} + formatted_records = [] + formatter = HealthContainerCpuMemoryRecordFormatter.new + records.each{|record| + formatted_record = formatter.get_record_from_cadvisor_record(record) + formatted_records.push(formatted_record) + } + + resources = HealthKubernetesResources.instance + nodes = JSON.parse(File.read(File.join(File.expand_path(File.dirname(__FILE__)),'nodes.json'))) + pods = JSON.parse(File.read(File.join(File.expand_path(File.dirname(__FILE__)),'pods.json'))) + deployments = JSON.parse(File.read(File.join(File.expand_path(File.dirname(__FILE__)),'deployments.json'))) + + resources.pod_inventory = pods + resources.node_inventory = nodes + 
resources.set_deployment_inventory(deployments) + resources.build_pod_uid_lookup #call this in in_kube_health every min + + cluster_labels = { + 'container.azm.ms/cluster-region' => 'eastus', + 'container.azm.ms/cluster-subscription-id' => '72c8e8ca-dc16-47dc-b65c-6b5875eb600a', + 'container.azm.ms/cluster-resource-group' => 'dilipr-health-test', + 'container.azm.ms/cluster-name' => 'dilipr-health-test' + } + + cluster_id = 'fake_cluster_id' + + provider = HealthMonitorProvider.new(cluster_id, cluster_labels, resources, File.join(__dir__, "../../../../installer/conf/healthmonitorconfig.json")) + + aggregator = HealthContainerCpuMemoryAggregator.new(resources, provider) + deduped_records = aggregator.dedupe_records(formatted_records) + aggregator.aggregate(deduped_records) + aggregator.compute_state + records = aggregator.get_records + + #omsagent has limit set. So its state should be set to pass. + #sidecar has no limit set. its state should be set to warning + omsagent_record = records.select{|r| r["MonitorTypeId"] == MonitorId::CONTAINER_CPU_MONITOR_ID && r["Details"]["details"]["container"] == "omsagent"}[0] + sidecar_record = records.select{|r| r["MonitorTypeId"] == MonitorId::CONTAINER_CPU_MONITOR_ID && r["Details"]["details"]["container"] == "sidecar"}[0] + omsagent_record['Details']['state'].must_equal HealthMonitorStates::PASS #limit is set + sidecar_record['Details']['state'].must_equal HealthMonitorStates::PASS + end + + + it "calculates the state as unknown when signals are missing" do + file = File.read(File.join(File.expand_path(File.dirname(__FILE__)),'cadvisor_perf.json')) + records = JSON.parse(file) + records = records.select{|record| record['DataItems'][0]['ObjectName'] == 'K8SContainer'} + formatted_records = [] + formatter = HealthContainerCpuMemoryRecordFormatter.new + records.each{|record| + formatted_record = formatter.get_record_from_cadvisor_record(record) + formatted_records.push(formatted_record) + } + + formatted_records = 
formatted_records.reject{|r| r["InstanceName"] == "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/69e68b21-c5df-11e9-8736-86290fd7dd1f/omsagent" && r["CounterName"] == "cpuUsageNanoCores"} + formatted_records = formatted_records.reject{|r| r["InstanceName"] == "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/b1e04e1c-c5df-11e9-8736-86290fd7dd1f/omsagent" && r["CounterName"] == "cpuUsageNanoCores"} + + resources = HealthKubernetesResources.instance + nodes = JSON.parse(File.read(File.join(File.expand_path(File.dirname(__FILE__)),'nodes.json'))) + pods = JSON.parse(File.read(File.join(File.expand_path(File.dirname(__FILE__)),'pods.json'))) + deployments = JSON.parse(File.read(File.join(File.expand_path(File.dirname(__FILE__)),'deployments.json'))) + + resources.pod_inventory = pods + resources.node_inventory = nodes + resources.set_deployment_inventory(deployments) + resources.build_pod_uid_lookup #call this in in_kube_health every min + + cluster_labels = { + 'container.azm.ms/cluster-region' => 'eastus', + 'container.azm.ms/cluster-subscription-id' => '72c8e8ca-dc16-47dc-b65c-6b5875eb600a', + 'container.azm.ms/cluster-resource-group' => 'dilipr-health-test', + 'container.azm.ms/cluster-name' => 'dilipr-health-test' + } + + cluster_id = 'fake_cluster_id' + + provider = HealthMonitorProvider.new(cluster_id, cluster_labels, resources, File.join(__dir__, "../../../../installer/conf/healthmonitorconfig.json")) + + aggregator = HealthContainerCpuMemoryAggregator.new(resources, provider) + deduped_records = aggregator.dedupe_records(formatted_records) + aggregator.aggregate(deduped_records) + aggregator.compute_state + records = aggregator.get_records + + #removed(missed) omsagent records should result in state being unknown + omsagent_record = 
records.select{|r| r["MonitorTypeId"] == MonitorId::CONTAINER_CPU_MONITOR_ID && r["Details"]["details"]["container"] == "omsagent" && !r["Details"]["details"]["workload_name"].include?("omsagent-rs") }[0] + omsagent_record['Details']['state'].must_equal HealthMonitorStates::UNKNOWN #limit is set + end +end \ No newline at end of file diff --git a/test/code/plugin/health/health_container_cpu_memory_record_formatter_spec.rb b/test/code/plugin/health/health_container_cpu_memory_record_formatter_spec.rb new file mode 100644 index 000000000..d01922bce --- /dev/null +++ b/test/code/plugin/health/health_container_cpu_memory_record_formatter_spec.rb @@ -0,0 +1,58 @@ +require_relative '../test_helpers' +Dir[File.join(File.expand_path(File.dirname(__FILE__)), "../../../../source/code/plugin/health/*.rb")].reject{|f| f.include?('health_monitor_utils')}.each { |file| require file } +include HealthModel +include Minitest + +describe "HealthContainerCpuMemoryRecordFormatter spec" do + it 'returns the record in expected format when cadvisor record is well formed' do + formatter = HealthContainerCpuMemoryRecordFormatter.new + cadvisor_record = JSON.parse('{ + "DataItems": [ + { + "Timestamp": "2019-08-01T23:19:19Z", + "Host": "aks-nodepool1-19574989-2", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourceGroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/6708e4ac-b49a-11e9-8a49-52a94e80d897/omsagent", + "Collections": [ + { + "CounterName": "memoryWorkingSetBytes", + "Value": 85143552 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }') + record = formatter.get_record_from_cadvisor_record(cadvisor_record) + record.wont_equal nil + record["InstanceName"].must_equal 
"/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourceGroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/6708e4ac-b49a-11e9-8a49-52a94e80d897/omsagent" + record["CounterName"].must_equal "memoryWorkingSetBytes" + record["CounterValue"].must_equal 85143552 + record["Timestamp"].must_equal "2019-08-01T23:19:19Z" + end + + it 'returns nil for invalid cadvisor record' do + formatter = HealthContainerCpuMemoryRecordFormatter.new + cadvisor_record = JSON.parse('{ + "DataItms": [ + { + "Timestamp": "2019-08-01T23:19:19Z", + "Host": "aks-nodepool1-19574989-2", + "ObjectName": "K8SContainer", + "InstanceName": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourceGroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test/6708e4ac-b49a-11e9-8a49-52a94e80d897/omsagent", + "Collections": [ + { + "CounterName": "memoryWorkingSetBytes", + "Value": 85143552 + } + ] + } + ], + "DataType": "LINUX_PERF_BLOB", + "IPName": "LogManagement" + }') + record = formatter.get_record_from_cadvisor_record(cadvisor_record) + record.must_be_nil + end +end \ No newline at end of file diff --git a/test/code/plugin/health/health_kubernetes_resource_spec.rb b/test/code/plugin/health/health_kubernetes_resource_spec.rb index c27d969ec..dbeec4858 100644 --- a/test/code/plugin/health/health_kubernetes_resource_spec.rb +++ b/test/code/plugin/health/health_kubernetes_resource_spec.rb @@ -207,7 +207,7 @@ resources = HealthKubernetesResources.instance resources.node_inventory = nodes resources.pod_inventory = pods - resources.deployment_inventory = deployments + resources.set_deployment_inventory(deployments) #act parsed_nodes = resources.get_nodes parsed_workloads = resources.get_workload_names @@ -217,6 +217,28 @@ assert_equal parsed_workloads.size, 3 assert_equal parsed_nodes, ['aks-nodepool1-19574989-0', 'aks-nodepool1-19574989-1'] - assert_equal parsed_workloads, ['default~~diliprdeploymentnodeapps', 
'default~~rss-site', 'kube-system~~kube-proxy'] + parsed_workloads.sort.must_equal ['default~~diliprdeploymentnodeapps', 'default~~rss-site', 'kube-system~~kube-proxy'].sort end + + # it 'builds the pod_uid lookup correctly' do + # #arrange + # f = File.read('C:/Users/dilipr/desktop/health/container_cpu_memory/nodes.json') + # nodes = JSON.parse(f) + # f = File.read('C:/Users/dilipr/desktop/health/container_cpu_memory/pods.json') + # pods = JSON.parse(f) + # f = File.read('C:/Users/dilipr/desktop/health/container_cpu_memory/deployments.json') + # deployments = JSON.parse(f) + + # resources = HealthKubernetesResources.instance + + # resources.node_inventory = nodes + # resources.pod_inventory = pods + # resources.set_deployment_inventory(deployments) #resets deployment_lookup -- this was causing Unit test failures + + # resources.build_pod_uid_lookup + + # resources.pod_uid_lookup + # resources.workload_container_count + + # end end \ No newline at end of file diff --git a/test/code/plugin/health/health_model_builder_test.rb b/test/code/plugin/health/health_model_builder_test.rb index df921049c..a7c5e0927 100644 --- a/test/code/plugin/health/health_model_builder_test.rb +++ b/test/code/plugin/health/health_model_builder_test.rb @@ -64,10 +64,10 @@ def test_event_stream resources = HealthKubernetesResources.instance resources.node_inventory = node_inventory resources.pod_inventory = pod_inventory - resources.deployment_inventory = deployment_inventory + resources.set_deployment_inventory(deployment_inventory) workload_names = resources.get_workload_names - provider = HealthMonitorProvider.new(cluster_id, cluster_labels, resources, File.join(__dir__, "../../../..//installer/conf/healthmonitorconfig.json")) + provider = HealthMonitorProvider.new(cluster_id, cluster_labels, resources, File.join(__dir__, "../../../../installer/conf/healthmonitorconfig.json")) health_monitor_records = [] records.each do |record| @@ -334,4 +334,162 @@ def test_event_stream_aks_engine 
after_state.initialize_state(deserialized_state) end end + + def test_container_memory_cpu_with_model + health_definition_path = File.join(__dir__, '../../../../installer/conf/health_model_definition.json') + health_model_definition = ParentMonitorProvider.new(HealthModelDefinitionParser.new(health_definition_path).parse_file) + monitor_factory = MonitorFactory.new + hierarchy_builder = HealthHierarchyBuilder.new(health_model_definition, monitor_factory) + # TODO: Figure out if we need to add NodeMonitorHierarchyReducer to the list of finalizers. For now, dont compress/optimize, since it becomes impossible to construct the model on the UX side + state_finalizers = [AggregateMonitorStateFinalizer.new] + monitor_set = MonitorSet.new + model_builder = HealthModelBuilder.new(hierarchy_builder, state_finalizers, monitor_set) + + nodes_file_map = { + "first" => "C:/Users/dilipr/desktop/health/container_cpu_memory/nodes.json", + "second" => "C:/Users/dilipr/desktop/health/container_cpu_memory/nodes.json", + "third" => "C:/Users/dilipr/desktop/health/container_cpu_memory/nodes.json", + } + + pods_file_map = { + "first" => "C:/Users/dilipr/desktop/health/container_cpu_memory/pods.json", + "second" => "C:/Users/dilipr/desktop/health/container_cpu_memory/pods.json", + "third" => "C:/Users/dilipr/desktop/health/container_cpu_memory/pods.json", + } + + cluster_labels = { + 'container.azm.ms/cluster-region' => 'eastus', + 'container.azm.ms/cluster-subscription-id' => '72c8e8ca-dc16-47dc-b65c-6b5875eb600a', + 'container.azm.ms/cluster-resource-group' => 'dilipr-health-test', + 'container.azm.ms/cluster-name' => 'dilipr-health-test' + } + + cluster_id = 'fake_cluster_id' + + #test + state = HealthMonitorState.new() + generator = HealthMissingSignalGenerator.new + + mock_data_path = "C:/Users/dilipr/desktop/health/container_cpu_memory/daemonset.json" + file = File.read(mock_data_path) + records = JSON.parse(file) + + node_inventory = 
JSON.parse(File.read("C:/Users/dilipr/desktop/health/container_cpu_memory/nodes.json")) + pod_inventory = JSON.parse(File.read("C:/Users/dilipr/desktop/health/container_cpu_memory/pods.json")) + deployment_inventory = JSON.parse(File.read("C:/Users/dilipr/desktop/health/container_cpu_memory/deployments.json")) + resources = HealthKubernetesResources.instance + resources.node_inventory = node_inventory + resources.pod_inventory = pod_inventory + resources.set_deployment_inventory(deployment_inventory) + + workload_names = resources.get_workload_names + provider = HealthMonitorProvider.new(cluster_id, cluster_labels, resources, File.join(__dir__, "../../../../installer/conf/healthmonitorconfig.json")) + + + #container memory cpu records + file = File.read('C:/Users/dilipr/desktop/health/container_cpu_memory/cadvisor_perf.json') + cadvisor_records = JSON.parse(file) + cadvisor_records = cadvisor_records.select{|record| record['DataItems'][0]['ObjectName'] == 'K8SContainer'} + formatted_records = [] + formatter = HealthContainerCpuMemoryRecordFormatter.new + cadvisor_records.each{|record| + formatted_record = formatter.get_record_from_cadvisor_record(record) + formatted_records.push(formatted_record) + } + + resources.build_pod_uid_lookup #call this in in_kube_health every min + + cluster_labels = { + 'container.azm.ms/cluster-region' => 'eastus', + 'container.azm.ms/cluster-subscription-id' => '72c8e8ca-dc16-47dc-b65c-6b5875eb600a', + 'container.azm.ms/cluster-resource-group' => 'dilipr-health-test', + 'container.azm.ms/cluster-name' => 'dilipr-health-test' + } + + cluster_id = 'fake_cluster_id' + + aggregator = HealthContainerCpuMemoryAggregator.new(resources, provider) + deduped_records = aggregator.dedupe_records(formatted_records) + aggregator.aggregate(deduped_records) + aggregator.compute_state + container_cpu_memory_records = aggregator.get_records + + records.concat(container_cpu_memory_records) + + health_monitor_records = [] + records.each do |record| + 
monitor_instance_id = record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] + monitor_id = record[HealthMonitorRecordFields::MONITOR_ID] + health_monitor_record = HealthMonitorRecord.new( + record[HealthMonitorRecordFields::MONITOR_ID], + record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID], + record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED], + record[HealthMonitorRecordFields::DETAILS]["state"], + provider.get_labels(record), + provider.get_config(monitor_id), + record[HealthMonitorRecordFields::DETAILS] + ) + + state.update_state(health_monitor_record, + provider.get_config(health_monitor_record.monitor_id) + ) + + # get the health state based on the monitor's operational state + # update state calls updates the state of the monitor based on configuration and history of the the monitor records + health_monitor_record.state = state.get_state(monitor_instance_id).new_state + health_monitor_records.push(health_monitor_record) + #puts "#{monitor_instance_id} #{instance_state.new_state} #{instance_state.old_state} #{instance_state.should_send}" + end + + #handle kube api down + kube_api_down_handler = HealthKubeApiDownHandler.new + health_monitor_records = kube_api_down_handler.handle_kube_api_down(health_monitor_records) + + # Dedupe daemonset signals + # Remove unit monitor signals for “gone” objects + reducer = HealthSignalReducer.new() + reduced_records = reducer.reduce_signals(health_monitor_records, resources) + + cluster_id = 'fake_cluster_id' + + #get the list of 'none' and 'unknown' signals + missing_signals = generator.get_missing_signals(cluster_id, reduced_records, resources, provider) + #update state for missing signals + missing_signals.each{|signal| + state.update_state(signal, + provider.get_config(signal.monitor_id) + ) + } + generator.update_last_received_records(reduced_records) + reduced_records.push(*missing_signals) + + # build the health model + all_records = reduced_records + model_builder.process_records(all_records) + all_monitors = 
model_builder.finalize_model + + # update the state for aggregate monitors (unit monitors are updated above) + all_monitors.each{|monitor_instance_id, monitor| + if monitor.is_aggregate_monitor + state.update_state(monitor, + provider.get_config(monitor.monitor_id) + ) + end + + instance_state = state.get_state(monitor_instance_id) + #puts "#{monitor_instance_id} #{instance_state.new_state} #{instance_state.old_state} #{instance_state.should_send}" + should_send = instance_state.should_send + + # always send cluster monitor as a heartbeat + if !should_send && monitor_instance_id != MonitorId::CLUSTER + all_monitors.delete(monitor_instance_id) + end + } + + records_to_send = [] + all_monitors.keys.each{|key| + record = provider.get_record(all_monitors[key], state) + #puts "#{record["MonitorInstanceId"]} #{record["OldState"]} #{record["NewState"]}" + } + end end \ No newline at end of file diff --git a/test/code/plugin/health/nodes.json b/test/code/plugin/health/nodes.json new file mode 100644 index 000000000..f1721e076 --- /dev/null +++ b/test/code/plugin/health/nodes.json @@ -0,0 +1,1966 @@ +{ + "apiVersion": "v1", + "items": [ + { + "apiVersion": "v1", + "kind": "Node", + "metadata": { + "annotations": { + "node.alpha.kubernetes.io/ttl": "0", + "volumes.kubernetes.io/controller-managed-attach-detach": "true" + }, + "creationTimestamp": "2019-03-12T16:40:36Z", + "labels": { + "agentpool": "nodepool1", + "beta.kubernetes.io/arch": "amd64", + "beta.kubernetes.io/instance-type": "Standard_DS1_v2", + "beta.kubernetes.io/os": "linux", + "failure-domain.beta.kubernetes.io/region": "eastus", + "failure-domain.beta.kubernetes.io/zone": "0", + "kubernetes.azure.com/cluster": "MC_dilipr-health-test_dilipr-health-test_eastus", + "kubernetes.io/hostname": "aks-nodepool1-19574989-0", + "kubernetes.io/role": "agent", + "node-role.kubernetes.io/agent": "", + "storageprofile": "managed", + "storagetier": "Premium_LRS" + }, + "name": "aks-nodepool1-19574989-0", + "resourceVersion": 
"19068106", + "selfLink": "/api/v1/nodes/aks-nodepool1-19574989-0", + "uid": "9012b16c-44e5-11e9-9920-423525a6b683" + }, + "spec": { + "podCIDR": "10.244.1.0/24", + "providerID": "azure:///subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourceGroups/MC_dilipr-health-test_dilipr-health-test_eastus/providers/Microsoft.Compute/virtualMachines/aks-nodepool1-19574989-0" + }, + "status": { + "addresses": [ + { + "address": "aks-nodepool1-19574989-0", + "type": "Hostname" + }, + { + "address": "10.240.0.4", + "type": "InternalIP" + } + ], + "allocatable": { + "cpu": "940m", + "ephemeral-storage": "28043041951", + "hugepages-1Gi": "0", + "hugepages-2Mi": "0", + "memory": "2504708Ki", + "pods": "110" + }, + "capacity": { + "cpu": "1", + "ephemeral-storage": "30428648Ki", + "hugepages-1Gi": "0", + "hugepages-2Mi": "0", + "memory": "3524612Ki", + "pods": "110" + }, + "conditions": [ + { + "lastHeartbeatTime": "2019-03-12T16:42:18Z", + "lastTransitionTime": "2019-03-12T16:42:18Z", + "message": "RouteController created a route", + "reason": "RouteCreated", + "status": "False", + "type": "NetworkUnavailable" + }, + { + "lastHeartbeatTime": "2019-08-23T20:43:22Z", + "lastTransitionTime": "2019-07-29T08:16:01Z", + "message": "kubelet has sufficient disk space available", + "reason": "KubeletHasSufficientDisk", + "status": "False", + "type": "OutOfDisk" + }, + { + "lastHeartbeatTime": "2019-08-23T20:43:22Z", + "lastTransitionTime": "2019-07-29T08:16:01Z", + "message": "kubelet has sufficient memory available", + "reason": "KubeletHasSufficientMemory", + "status": "False", + "type": "MemoryPressure" + }, + { + "lastHeartbeatTime": "2019-08-23T20:43:22Z", + "lastTransitionTime": "2019-07-29T08:16:01Z", + "message": "kubelet has no disk pressure", + "reason": "KubeletHasNoDiskPressure", + "status": "False", + "type": "DiskPressure" + }, + { + "lastHeartbeatTime": "2019-08-23T20:43:22Z", + "lastTransitionTime": "2019-03-12T16:40:36Z", + "message": "kubelet has sufficient PID 
available", + "reason": "KubeletHasSufficientPID", + "status": "False", + "type": "PIDPressure" + }, + { + "lastHeartbeatTime": "2019-08-23T20:43:22Z", + "lastTransitionTime": "2019-07-29T08:16:01Z", + "message": "kubelet is posting ready status. AppArmor enabled", + "reason": "KubeletReady", + "status": "True", + "type": "Ready" + } + ], + "daemonEndpoints": { + "kubeletEndpoint": { + "Port": 10250 + } + }, + "images": [ + { + "names": [ + "nickchase/rss-php-nginx@sha256:48da56a77fe4ecff4917121365d8e0ce615ebbdfe31f48a996255f5592894e2b", + "nickchase/rss-php-nginx:v1" + ], + "sizeBytes": 677038498 + }, + { + "names": [ + "rdilip83/logeverysecond@sha256:6fe5624808609c507178a77f94384fb9794a4d6b7d102ed8016a4baf608164a1", + "rdilip83/logeverysecond:v2" + ], + "sizeBytes": 674931590 + }, + { + "names": [ + "aksrepos.azurecr.io/mirror/hyperkube-amd64@sha256:1447d5b491fcee503c9f8fb712e1593dc3772c7e661251f54c297477cc716913", + "k8s.gcr.io/hyperkube-amd64@sha256:1447d5b491fcee503c9f8fb712e1593dc3772c7e661251f54c297477cc716913", + "aksrepos.azurecr.io/mirror/hyperkube-amd64:v1.11.8", + "k8s.gcr.io/hyperkube-amd64:v1.11.8" + ], + "sizeBytes": 615263658 + }, + { + "names": [ + "microsoft/oms@sha256:de83d1df24cb86a3a3110bd03abbd5704d7a7345565b1996f49ff001a3665385", + "microsoft/oms:healthpreview04262019" + ], + "sizeBytes": 514907213 + }, + { + "names": [ + "rdilip83/fixrubyerror@sha256:6b7f36cf6258b311015493ab025f06577d758c45bc5010d022ac160b9f40ea5d", + "rdilip83/fixrubyerror:latest" + ], + "sizeBytes": 494068028 + }, + { + "names": [ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod@sha256:69b420bdb4081293c37e2d0f8ad2e4054bd516f5c08c7512d6b695660a36eccf", + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod08222019" + ], + "sizeBytes": 494067935 + }, + { + "names": [ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod@sha256:fb2b90ce9bf7186fd9dfae97f5f72f9b9c80c8a0493af3cff74179cd4ff847c0", + 
"mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod08212019" + ], + "sizeBytes": 494067572 + }, + { + "names": [ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod@sha256:c646e180483d295ffac114fb9df513db02553af7879681814d5910764653dd2d", + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod08202019" + ], + "sizeBytes": 494067210 + }, + { + "names": [ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod@sha256:c21b596a22a1338ed293d01681f327acc871ee502ed779ec1109d6a93375bb3b", + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod08192019" + ], + "sizeBytes": 494055088 + }, + { + "names": [ + "rdilip83/cifeatureprod08192019@sha256:7815bba9a805e4e8df33356fd532671de45525ce9c6e936e14f9b126e2097ecd", + "rdilip83/cifeatureprod08192019:v1" + ], + "sizeBytes": 494055088 + }, + { + "names": [ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod@sha256:59e34aab9f6e16a87e880b1ee1c9dd5434ee40dd29502e74aceefabf51443717", + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:internaltesthealth08192019" + ], + "sizeBytes": 494053562 + }, + { + "names": [ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod@sha256:6387d0dedf4de0bab430f681ef61361f63a20e1c4c287a9b60ea5460283ac6cf", + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ci_feature_prod_health08192019" + ], + "sizeBytes": 494053562 + }, + { + "names": [ + "rdilip83/hc08192019@sha256:014d936771508d499ac4c15043e23b16bce8de0019fb2048b99540cbe9084895", + "rdilip83/hc08192019:1" + ], + "sizeBytes": 494053562 + }, + { + "names": [ + "rdilip83/health-rc@sha256:8ad12bce5ffd27b301bc6fe4355c8affa6fce080ae7e2291dec3a0ed11bb9483", + "rdilip83/health-rc:3" + ], + "sizeBytes": 494052863 + }, + { + "names": [ + "rdilip83/health_ci_feature_image@sha256:1a574d25884483083e8cbaacbf0cb7c4e442dc736d480615c65f5c71f8969b13", + "rdilip83/health_ci_feature_image:v1" + ], + "sizeBytes": 494052147 + }, + { + "names": [ + 
"rdilip83/healthrc@sha256:816c8cef09822daf050a0fca6f92e7ac19147ff4bf1a823d43fe70f73470cc0c", + "rdilip83/healthrc:v3" + ], + "sizeBytes": 494052138 + }, + { + "names": [ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod@sha256:d35aac044d1adc3d02269fde78f8dfd923db94b81288447cf6fdd482970a333b", + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:healthmerge08142019" + ], + "sizeBytes": 494052135 + }, + { + "names": [ + "rdilip83/healthrc@sha256:a130780e56ac0edb3ca29477e12edd5e9b5d08b5732dbd59ede9beb58e21eca7", + "rdilip83/healthrc:v2" + ], + "sizeBytes": 494051682 + }, + { + "names": [ + "rdilip83/healthmerge@sha256:24d270b0f59fb484c283922474736c3cba50f8aad0270bc0a3acd14284694eea", + "rdilip83/healthmerge:v8" + ], + "sizeBytes": 494010139 + }, + { + "names": [ + "rdilip83/health-rc@sha256:b1d24728eb808d301da426b76b7f7b79606204c4c2b695a24ac670be8276d55d", + "rdilip83/health-rc:1" + ], + "sizeBytes": 494000891 + }, + { + "names": [ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod@sha256:a0666957cccbfdf5784accd1133408bf017c28a6e694d9a2ae74da94eef2d285", + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:healthpreview08012019" + ], + "sizeBytes": 493994261 + }, + { + "names": [ + "rdilip83/mergehealth@sha256:32c9b35a6809c54d5296e2ca2b122b35a4ad8c852622174cc5a9f92cc27e56e4", + "rdilip83/mergehealth:v3" + ], + "sizeBytes": 493988815 + }, + { + "names": [ + "rdilip83/mergehealth@sha256:a3521e8f36e007b3cb949e0356a75394ac61fd2024ca1ec4827b8d54fb068534", + "rdilip83/mergehealth:v1" + ], + "sizeBytes": 493981585 + }, + { + "names": [ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod@sha256:0438e4690e042b195917e160b8949aeb339520ee19c898a8bb9452f36d1f84f1", + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:healthpreview07182019" + ], + "sizeBytes": 493977357 + }, + { + "names": [ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod@sha256:9ebc410a36856176921dba81b5bd43132469209b315f52be346690435419b9bb" + ], + "sizeBytes": 
493946790 + }, + { + "names": [ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod@sha256:4e51195a9c77bd166fc90ee5f6143a4604b502ab7ef0f06431dec10c341b10f3", + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:healthpreview06272019" + ], + "sizeBytes": 493893635 + }, + { + "names": [ + "rdilip83/healthpreview06272019@sha256:d888ba5ff5e5810113a32f9c9812a5e28088cc81b902e95a185fe465a514029c", + "rdilip83/healthpreview06272019:latest" + ], + "sizeBytes": 493893633 + }, + { + "names": [ + "rdilip83/healthpreview06252019-1@sha256:1561876cffe94433a569f29f5231548e039193ebaa7ec640d22439675179e43f", + "rdilip83/healthpreview06252019-1:latest" + ], + "sizeBytes": 493887387 + }, + { + "names": [ + "rdilip83/healthpreview06252019@sha256:6597ff599a78ac452a4138dedb9e08c0ccd3e8b01594b033fd78ba9dbb41fe9e", + "rdilip83/healthpreview06252019:latest" + ], + "sizeBytes": 493887384 + }, + { + "names": [ + "rdilip83/healthpreview06242019@sha256:c4f565d92086d1ee56e6016178fed5c668352dc0ca0047f02910bdcb87a482c4", + "rdilip83/healthpreview06242019:latest" + ], + "sizeBytes": 493850850 + }, + { + "names": [ + "rdilip83/healthpreview06212019-1@sha256:937ce5801a0097a1cbc4eff5399c1973b4c6223ece9279b35207368b99f82b96", + "rdilip83/healthpreview06212019-1:latest" + ], + "sizeBytes": 493850674 + }, + { + "names": [ + "rdilip83/healthpreview06192019@sha256:f92cb5283814d446f0acde6a489648ea197496d5f85b27ca959ec97bce742d8a", + "rdilip83/healthpreview06192019:latest" + ], + "sizeBytes": 493799437 + }, + { + "names": [ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod@sha256:0f798cb7d56931b231f71e38e7fa5bf898b69e611247a566701f70a5f29a9799", + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod07092019" + ], + "sizeBytes": 467692116 + }, + { + "names": [ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod@sha256:3734a084fa9681c7e930eb90cad45a8f282c24af63065a720a2327b1683f3ba4", + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod06142019" + ], + 
"sizeBytes": 466882569 + }, + { + "names": [ + "rdilip83/mergehealth@sha256:16402c34e2d7de72c2ebc18ec8e9f7933fa25f6a7f83bceb84483ba95e3902f7", + "rdilip83/mergehealth:v2" + ], + "sizeBytes": 448931997 + }, + { + "names": [ + "rdilip83/healthpreview06212019@sha256:5860c9caaf544f2e7c46edad5cdfb69e22398e20dc87cb8a4cd630b5b7000074", + "rdilip83/healthpreview06212019:latest" + ], + "sizeBytes": 448366491 + }, + { + "names": [ + "aksrepos.azurecr.io/mirror/hcp-tunnel-front@sha256:68878ee3ea1781b322ea3952c3370e31dd89be8bb0864e2bf27bdba6dc904c41", + "aksrepos.azurecr.io/prod/hcp-tunnel-front@sha256:68878ee3ea1781b322ea3952c3370e31dd89be8bb0864e2bf27bdba6dc904c41", + "aksrepos.azurecr.io/mirror/hcp-tunnel-front:v1.9.2-v4.0.7", + "aksrepos.azurecr.io/prod/hcp-tunnel-front:v1.9.2-v4.0.7" + ], + "sizeBytes": 383483267 + }, + { + "names": [ + "aksrepos.azurecr.io/mirror/kubernetes-dashboard-amd64@sha256:0ae6b69432e78069c5ce2bcde0fe409c5c4d6f0f4d9cd50a17974fea38898747", + "k8s.gcr.io/kubernetes-dashboard-amd64@sha256:0ae6b69432e78069c5ce2bcde0fe409c5c4d6f0f4d9cd50a17974fea38898747", + "aksrepos.azurecr.io/mirror/kubernetes-dashboard-amd64:v1.10.1", + "k8s.gcr.io/kubernetes-dashboard-amd64:v1.10.1" + ], + "sizeBytes": 121711221 + }, + { + "names": [ + "nginx@sha256:23b4dcdf0d34d4a129755fc6f52e1c6e23bb34ea011b315d87e193033bcd1b68" + ], + "sizeBytes": 109331233 + }, + { + "names": [ + "nginx@sha256:bdbf36b7f1f77ffe7bd2a32e59235dff6ecf131e3b6b5b96061c652f30685f3a", + "nginx:latest" + ], + "sizeBytes": 109258867 + }, + { + "names": [ + "nginx@sha256:b73f527d86e3461fd652f62cf47e7b375196063bbbd503e853af5be16597cb2e", + "nginx:1.15.5" + ], + "sizeBytes": 109083698 + }, + { + "names": [ + "debian@sha256:118cf8f3557e1ea766c02f36f05f6ac3e63628427ea8965fb861be904ec35a6f", + "debian:latest" + ], + "sizeBytes": 100594230 + }, + { + "names": [ + "nginx@sha256:e3456c851a152494c3e4ff5fcc26f240206abac0c9d794affb40e0714846c451", + "nginx:1.7.9" + ], + "sizeBytes": 91664166 + }, + { + "names": [ + 
"aksrepos.azurecr.io/mirror/kube-svc-redirect@sha256:a448687b78d24dae388bd3d54591c179c891fa078404752bc9c9dfdaecdc02ef", + "aksrepos.azurecr.io/prod/kube-svc-redirect@sha256:a448687b78d24dae388bd3d54591c179c891fa078404752bc9c9dfdaecdc02ef", + "deis/kube-svc-redirect@sha256:a448687b78d24dae388bd3d54591c179c891fa078404752bc9c9dfdaecdc02ef", + "aksrepos.azurecr.io/mirror/kube-svc-redirect:v1.0.2", + "aksrepos.azurecr.io/prod/kube-svc-redirect:v1.0.2" + ], + "sizeBytes": 82897218 + }, + { + "names": [ + "aksrepos.azurecr.io/mirror/heapster-amd64@sha256:fc33c690a3a446de5abc24b048b88050810a58b9e4477fa763a43d7df029301a", + "k8s.gcr.io/heapster-amd64@sha256:fc33c690a3a446de5abc24b048b88050810a58b9e4477fa763a43d7df029301a", + "aksrepos.azurecr.io/mirror/heapster-amd64:v1.5.3", + "k8s.gcr.io/heapster-amd64:v1.5.3" + ], + "sizeBytes": 75318342 + }, + { + "names": [ + "vishiy/hello@sha256:99d60766e39df52d28fe8db9c659633d96ba1d84fd672298dce047d8a86c478a", + "vishiy/hello:err100eps" + ], + "sizeBytes": 54649865 + }, + { + "names": [ + "k8s.gcr.io/k8s-dns-kube-dns-amd64@sha256:618a82fa66cf0c75e4753369a6999032372be7308866fc9afb381789b1e5ad52", + "k8s.gcr.io/k8s-dns-kube-dns-amd64:1.14.13" + ], + "sizeBytes": 51157394 + }, + { + "names": [ + "k8s.gcr.io/metrics-server-amd64@sha256:49a9f12f7067d11f42c803dbe61ed2c1299959ad85cb315b25ff7eef8e6b8892", + "k8s.gcr.io/metrics-server-amd64:v0.2.1" + ], + "sizeBytes": 42541759 + }, + { + "names": [ + "k8s.gcr.io/k8s-dns-sidecar-amd64@sha256:4f1ab957f87b94a5ec1edc26fae50da2175461f00afecf68940c4aa079bd08a4", + "k8s.gcr.io/k8s-dns-sidecar-amd64:1.14.10" + ], + "sizeBytes": 41635309 + }, + { + "names": [ + "k8s.gcr.io/k8s-dns-dnsmasq-nanny-amd64@sha256:bbb2a290a568125b3b996028958eb773f33b5b87a6b37bf38a28f8b62dddb3c8", + "k8s.gcr.io/k8s-dns-dnsmasq-nanny-amd64:1.14.10" + ], + "sizeBytes": 40372149 + } + ], + "nodeInfo": { + "architecture": "amd64", + "bootID": "d8f6c00f-a085-450e-bf5c-12e651a0fcfc", + "containerRuntimeVersion": "docker://3.0.4", + 
"kernelVersion": "4.15.0-1037-azure", + "kubeProxyVersion": "v1.11.8", + "kubeletVersion": "v1.11.8", + "machineID": "cc9ed99e383540a4b0379995bb779221", + "operatingSystem": "linux", + "osImage": "Ubuntu 16.04.5 LTS", + "systemUUID": "301B3B88-C7BD-3D45-A3CB-3CD66A42EB6F" + } + } + }, + { + "apiVersion": "v1", + "kind": "Node", + "metadata": { + "annotations": { + "node.alpha.kubernetes.io/ttl": "0", + "volumes.kubernetes.io/controller-managed-attach-detach": "true" + }, + "creationTimestamp": "2019-03-12T16:40:33Z", + "labels": { + "agentpool": "nodepool1", + "beta.kubernetes.io/arch": "amd64", + "beta.kubernetes.io/instance-type": "Standard_DS1_v2", + "beta.kubernetes.io/os": "linux", + "failure-domain.beta.kubernetes.io/region": "eastus", + "failure-domain.beta.kubernetes.io/zone": "1", + "kubernetes.azure.com/cluster": "MC_dilipr-health-test_dilipr-health-test_eastus", + "kubernetes.io/hostname": "aks-nodepool1-19574989-1", + "kubernetes.io/role": "agent", + "node-role.kubernetes.io/agent": "", + "storageprofile": "managed", + "storagetier": "Premium_LRS" + }, + "name": "aks-nodepool1-19574989-1", + "resourceVersion": "19068104", + "selfLink": "/api/v1/nodes/aks-nodepool1-19574989-1", + "uid": "8e1b5c77-44e5-11e9-9920-423525a6b683" + }, + "spec": { + "podCIDR": "10.244.0.0/24", + "providerID": "azure:///subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourceGroups/MC_dilipr-health-test_dilipr-health-test_eastus/providers/Microsoft.Compute/virtualMachines/aks-nodepool1-19574989-1" + }, + "status": { + "addresses": [ + { + "address": "aks-nodepool1-19574989-1", + "type": "Hostname" + }, + { + "address": "10.240.0.5", + "type": "InternalIP" + } + ], + "allocatable": { + "cpu": "940m", + "ephemeral-storage": "28043041951", + "hugepages-1Gi": "0", + "hugepages-2Mi": "0", + "memory": "2504708Ki", + "pods": "110" + }, + "capacity": { + "cpu": "1", + "ephemeral-storage": "30428648Ki", + "hugepages-1Gi": "0", + "hugepages-2Mi": "0", + "memory": "3524612Ki", + 
"pods": "110" + }, + "conditions": [ + { + "lastHeartbeatTime": "2019-03-12T16:42:30Z", + "lastTransitionTime": "2019-03-12T16:42:30Z", + "message": "RouteController created a route", + "reason": "RouteCreated", + "status": "False", + "type": "NetworkUnavailable" + }, + { + "lastHeartbeatTime": "2019-08-23T20:43:21Z", + "lastTransitionTime": "2019-07-23T14:46:10Z", + "message": "kubelet has sufficient disk space available", + "reason": "KubeletHasSufficientDisk", + "status": "False", + "type": "OutOfDisk" + }, + { + "lastHeartbeatTime": "2019-08-23T20:43:21Z", + "lastTransitionTime": "2019-07-23T14:46:10Z", + "message": "kubelet has sufficient memory available", + "reason": "KubeletHasSufficientMemory", + "status": "False", + "type": "MemoryPressure" + }, + { + "lastHeartbeatTime": "2019-08-23T20:43:21Z", + "lastTransitionTime": "2019-07-23T14:46:10Z", + "message": "kubelet has no disk pressure", + "reason": "KubeletHasNoDiskPressure", + "status": "False", + "type": "DiskPressure" + }, + { + "lastHeartbeatTime": "2019-08-23T20:43:21Z", + "lastTransitionTime": "2019-03-12T16:40:33Z", + "message": "kubelet has sufficient PID available", + "reason": "KubeletHasSufficientPID", + "status": "False", + "type": "PIDPressure" + }, + { + "lastHeartbeatTime": "2019-08-23T20:43:21Z", + "lastTransitionTime": "2019-07-23T14:46:10Z", + "message": "kubelet is posting ready status. 
AppArmor enabled", + "reason": "KubeletReady", + "status": "True", + "type": "Ready" + } + ], + "daemonEndpoints": { + "kubeletEndpoint": { + "Port": 10250 + } + }, + "images": [ + { + "names": [ + "perl@sha256:268e7af9853bcc6d2100e2ad76e928c2ca861518217c269b8a762849a8617c12", + "perl:latest" + ], + "sizeBytes": 890592834 + }, + { + "names": [ + "nickchase/rss-php-nginx@sha256:48da56a77fe4ecff4917121365d8e0ce615ebbdfe31f48a996255f5592894e2b", + "nickchase/rss-php-nginx:v1" + ], + "sizeBytes": 677038498 + }, + { + "names": [ + "rdilip83/jsonlogger@sha256:82b67ca5e0650cd5e47f5b51659d61cee035e5d8dcd8a79c50358cd2beb3b5a8", + "rdilip83/jsonlogger:v12" + ], + "sizeBytes": 676594134 + }, + { + "names": [ + "aksrepos.azurecr.io/mirror/hyperkube-amd64@sha256:1447d5b491fcee503c9f8fb712e1593dc3772c7e661251f54c297477cc716913", + "k8s.gcr.io/hyperkube-amd64@sha256:1447d5b491fcee503c9f8fb712e1593dc3772c7e661251f54c297477cc716913", + "aksrepos.azurecr.io/mirror/hyperkube-amd64:v1.11.8", + "k8s.gcr.io/hyperkube-amd64:v1.11.8" + ], + "sizeBytes": 615263658 + }, + { + "names": [ + "rdilip83/fixrubyerror@sha256:6b7f36cf6258b311015493ab025f06577d758c45bc5010d022ac160b9f40ea5d", + "rdilip83/fixrubyerror:latest" + ], + "sizeBytes": 494068028 + }, + { + "names": [ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod@sha256:69b420bdb4081293c37e2d0f8ad2e4054bd516f5c08c7512d6b695660a36eccf", + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod08222019" + ], + "sizeBytes": 494067935 + }, + { + "names": [ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod@sha256:fb2b90ce9bf7186fd9dfae97f5f72f9b9c80c8a0493af3cff74179cd4ff847c0", + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod08212019" + ], + "sizeBytes": 494067572 + }, + { + "names": [ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod@sha256:c646e180483d295ffac114fb9df513db02553af7879681814d5910764653dd2d", + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod08202019" + ], + 
"sizeBytes": 494067210 + }, + { + "names": [ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod@sha256:c21b596a22a1338ed293d01681f327acc871ee502ed779ec1109d6a93375bb3b", + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod08192019" + ], + "sizeBytes": 494055088 + }, + { + "names": [ + "rdilip83/cifeatureprod08192019@sha256:7815bba9a805e4e8df33356fd532671de45525ce9c6e936e14f9b126e2097ecd", + "rdilip83/cifeatureprod08192019:v1" + ], + "sizeBytes": 494055088 + }, + { + "names": [ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod@sha256:59e34aab9f6e16a87e880b1ee1c9dd5434ee40dd29502e74aceefabf51443717", + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:internaltesthealth08192019" + ], + "sizeBytes": 494053562 + }, + { + "names": [ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod@sha256:6387d0dedf4de0bab430f681ef61361f63a20e1c4c287a9b60ea5460283ac6cf", + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ci_feature_prod_health08192019" + ], + "sizeBytes": 494053562 + }, + { + "names": [ + "rdilip83/hc08192019@sha256:014d936771508d499ac4c15043e23b16bce8de0019fb2048b99540cbe9084895", + "rdilip83/hc08192019:1" + ], + "sizeBytes": 494053562 + }, + { + "names": [ + "rdilip83/health-rc@sha256:8ad12bce5ffd27b301bc6fe4355c8affa6fce080ae7e2291dec3a0ed11bb9483", + "rdilip83/health-rc:3" + ], + "sizeBytes": 494052863 + }, + { + "names": [ + "rdilip83/health_ci_feature_image@sha256:1a574d25884483083e8cbaacbf0cb7c4e442dc736d480615c65f5c71f8969b13", + "rdilip83/health_ci_feature_image:v1" + ], + "sizeBytes": 494052147 + }, + { + "names": [ + "rdilip83/healthrc@sha256:816c8cef09822daf050a0fca6f92e7ac19147ff4bf1a823d43fe70f73470cc0c", + "rdilip83/healthrc:v3" + ], + "sizeBytes": 494052138 + }, + { + "names": [ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod@sha256:d35aac044d1adc3d02269fde78f8dfd923db94b81288447cf6fdd482970a333b", + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:healthmerge08142019" + ], + 
"sizeBytes": 494052135 + }, + { + "names": [ + "rdilip83/healthrc@sha256:a130780e56ac0edb3ca29477e12edd5e9b5d08b5732dbd59ede9beb58e21eca7", + "rdilip83/healthrc:v2" + ], + "sizeBytes": 494051682 + }, + { + "names": [ + "rdilip83/healthmerge@sha256:24d270b0f59fb484c283922474736c3cba50f8aad0270bc0a3acd14284694eea", + "rdilip83/healthmerge:v8" + ], + "sizeBytes": 494010139 + }, + { + "names": [ + "rdilip83/health-rc@sha256:b1d24728eb808d301da426b76b7f7b79606204c4c2b695a24ac670be8276d55d", + "rdilip83/health-rc:1" + ], + "sizeBytes": 494000891 + }, + { + "names": [ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod@sha256:a0666957cccbfdf5784accd1133408bf017c28a6e694d9a2ae74da94eef2d285", + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:healthpreview08012019" + ], + "sizeBytes": 493994261 + }, + { + "names": [ + "rdilip83/mergehealth@sha256:32c9b35a6809c54d5296e2ca2b122b35a4ad8c852622174cc5a9f92cc27e56e4", + "rdilip83/mergehealth:v3" + ], + "sizeBytes": 493988815 + }, + { + "names": [ + "rdilip83/mergehealth@sha256:a3521e8f36e007b3cb949e0356a75394ac61fd2024ca1ec4827b8d54fb068534", + "rdilip83/mergehealth:v1" + ], + "sizeBytes": 493981585 + }, + { + "names": [ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod@sha256:0438e4690e042b195917e160b8949aeb339520ee19c898a8bb9452f36d1f84f1", + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:healthpreview07182019" + ], + "sizeBytes": 493977357 + }, + { + "names": [ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod@sha256:9ebc410a36856176921dba81b5bd43132469209b315f52be346690435419b9bb" + ], + "sizeBytes": 493946790 + }, + { + "names": [ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod@sha256:4e51195a9c77bd166fc90ee5f6143a4604b502ab7ef0f06431dec10c341b10f3", + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:healthpreview06272019" + ], + "sizeBytes": 493893635 + }, + { + "names": [ + 
"rdilip83/healthpreview06272019@sha256:d888ba5ff5e5810113a32f9c9812a5e28088cc81b902e95a185fe465a514029c", + "rdilip83/healthpreview06272019:latest" + ], + "sizeBytes": 493893633 + }, + { + "names": [ + "rdilip83/healthpreview06252019-1@sha256:1561876cffe94433a569f29f5231548e039193ebaa7ec640d22439675179e43f", + "rdilip83/healthpreview06252019-1:latest" + ], + "sizeBytes": 493887387 + }, + { + "names": [ + "rdilip83/healthpreview06252019@sha256:6597ff599a78ac452a4138dedb9e08c0ccd3e8b01594b033fd78ba9dbb41fe9e", + "rdilip83/healthpreview06252019:latest" + ], + "sizeBytes": 493887384 + }, + { + "names": [ + "rdilip83/healthpreview06242019@sha256:c4f565d92086d1ee56e6016178fed5c668352dc0ca0047f02910bdcb87a482c4", + "rdilip83/healthpreview06242019:latest" + ], + "sizeBytes": 493850850 + }, + { + "names": [ + "rdilip83/healthpreview06212019-1@sha256:937ce5801a0097a1cbc4eff5399c1973b4c6223ece9279b35207368b99f82b96", + "rdilip83/healthpreview06212019-1:latest" + ], + "sizeBytes": 493850674 + }, + { + "names": [ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod@sha256:0f798cb7d56931b231f71e38e7fa5bf898b69e611247a566701f70a5f29a9799", + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod07092019" + ], + "sizeBytes": 467692116 + }, + { + "names": [ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod@sha256:3734a084fa9681c7e930eb90cad45a8f282c24af63065a720a2327b1683f3ba4", + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod06142019" + ], + "sizeBytes": 466882569 + }, + { + "names": [ + "rdilip83/mergehealth@sha256:16402c34e2d7de72c2ebc18ec8e9f7933fa25f6a7f83bceb84483ba95e3902f7", + "rdilip83/mergehealth:v2" + ], + "sizeBytes": 448931997 + }, + { + "names": [ + "deis/hcp-tunnel-front@sha256:68878ee3ea1781b322ea3952c3370e31dd89be8bb0864e2bf27bdba6dc904c41", + "deis/hcp-tunnel-front:v1.9.2-v4.0.7" + ], + "sizeBytes": 383483267 + }, + { + "names": [ + "nginx@sha256:23b4dcdf0d34d4a129755fc6f52e1c6e23bb34ea011b315d87e193033bcd1b68" + ], + 
"sizeBytes": 109331233 + }, + { + "names": [ + "nginx@sha256:bdbf36b7f1f77ffe7bd2a32e59235dff6ecf131e3b6b5b96061c652f30685f3a", + "nginx:latest" + ], + "sizeBytes": 109258867 + }, + { + "names": [ + "debian@sha256:118cf8f3557e1ea766c02f36f05f6ac3e63628427ea8965fb861be904ec35a6f", + "debian:latest" + ], + "sizeBytes": 100594230 + }, + { + "names": [ + "nginx@sha256:e3456c851a152494c3e4ff5fcc26f240206abac0c9d794affb40e0714846c451", + "nginx:1.7.9" + ], + "sizeBytes": 91664166 + }, + { + "names": [ + "aksrepos.azurecr.io/mirror/kube-svc-redirect@sha256:a448687b78d24dae388bd3d54591c179c891fa078404752bc9c9dfdaecdc02ef", + "aksrepos.azurecr.io/prod/kube-svc-redirect@sha256:a448687b78d24dae388bd3d54591c179c891fa078404752bc9c9dfdaecdc02ef", + "deis/kube-svc-redirect@sha256:a448687b78d24dae388bd3d54591c179c891fa078404752bc9c9dfdaecdc02ef", + "aksrepos.azurecr.io/mirror/kube-svc-redirect:v1.0.2", + "aksrepos.azurecr.io/prod/kube-svc-redirect:v1.0.2" + ], + "sizeBytes": 82897218 + }, + { + "names": [ + "aksrepos.azurecr.io/mirror/heapster-amd64@sha256:fc33c690a3a446de5abc24b048b88050810a58b9e4477fa763a43d7df029301a", + "k8s.gcr.io/heapster-amd64@sha256:fc33c690a3a446de5abc24b048b88050810a58b9e4477fa763a43d7df029301a", + "aksrepos.azurecr.io/mirror/heapster-amd64:v1.5.3", + "k8s.gcr.io/heapster-amd64:v1.5.3" + ], + "sizeBytes": 75318342 + }, + { + "names": [ + "vishiy/hello@sha256:99d60766e39df52d28fe8db9c659633d96ba1d84fd672298dce047d8a86c478a", + "vishiy/hello:err100eps" + ], + "sizeBytes": 54649865 + }, + { + "names": [ + "aksrepos.azurecr.io/mirror/k8s-dns-kube-dns-amd64@sha256:618a82fa66cf0c75e4753369a6999032372be7308866fc9afb381789b1e5ad52", + "k8s.gcr.io/k8s-dns-kube-dns-amd64@sha256:618a82fa66cf0c75e4753369a6999032372be7308866fc9afb381789b1e5ad52", + "aksrepos.azurecr.io/mirror/k8s-dns-kube-dns-amd64:1.14.13", + "k8s.gcr.io/k8s-dns-kube-dns-amd64:1.14.13" + ], + "sizeBytes": 51157394 + }, + { + "names": [ + 
"k8s.gcr.io/cluster-proportional-autoscaler-amd64@sha256:003f98d9f411ddfa6ff6d539196355e03ddd69fa4ed38c7ffb8fec6f729afe2d", + "k8s.gcr.io/cluster-proportional-autoscaler-amd64:1.1.2-r2" + ], + "sizeBytes": 49648481 + }, + { + "names": [ + "aksrepos.azurecr.io/mirror/metrics-server-amd64@sha256:220c0ed3451cb95e4b2f72dd5dc8d9d39d9f529722e5b29d8286373ce27b117e", + "k8s.gcr.io/metrics-server-amd64@sha256:49a9f12f7067d11f42c803dbe61ed2c1299959ad85cb315b25ff7eef8e6b8892", + "aksrepos.azurecr.io/mirror/metrics-server-amd64:v0.2.1", + "k8s.gcr.io/metrics-server-amd64:v0.2.1" + ], + "sizeBytes": 42541759 + }, + { + "names": [ + "aksrepos.azurecr.io/mirror/k8s-dns-sidecar-amd64@sha256:4f1ab957f87b94a5ec1edc26fae50da2175461f00afecf68940c4aa079bd08a4", + "k8s.gcr.io/k8s-dns-sidecar-amd64@sha256:4f1ab957f87b94a5ec1edc26fae50da2175461f00afecf68940c4aa079bd08a4", + "aksrepos.azurecr.io/mirror/k8s-dns-sidecar-amd64:1.14.10", + "k8s.gcr.io/k8s-dns-sidecar-amd64:1.14.10" + ], + "sizeBytes": 41635309 + }, + { + "names": [ + "aksrepos.azurecr.io/mirror/k8s-dns-dnsmasq-nanny-amd64@sha256:bbb2a290a568125b3b996028958eb773f33b5b87a6b37bf38a28f8b62dddb3c8", + "k8s.gcr.io/k8s-dns-dnsmasq-nanny-amd64@sha256:bbb2a290a568125b3b996028958eb773f33b5b87a6b37bf38a28f8b62dddb3c8", + "aksrepos.azurecr.io/mirror/k8s-dns-dnsmasq-nanny-amd64:1.14.10", + "k8s.gcr.io/k8s-dns-dnsmasq-nanny-amd64:1.14.10" + ], + "sizeBytes": 40372149 + }, + { + "names": [ + "aksrepos.azurecr.io/mirror/addon-resizer@sha256:8ac3ffa4232046feb297cefc40734641fa2954c16308f9e0d70ec152f22231ca", + "k8s.gcr.io/addon-resizer@sha256:507aa9845ecce1fdde4d61f530c802f4dc2974c700ce0db7730866e442db958d", + "aksrepos.azurecr.io/mirror/addon-resizer:1.8.1", + "k8s.gcr.io/addon-resizer:1.8.1" + ], + "sizeBytes": 32968591 + }, + { + "names": [ + "aksrepos.azurecr.io/mirror/nginx@sha256:91d22184f3f9b1be658c2cc2c12d324de7ff12c8b9c9a597905457b4d93b069d", + "nginx@sha256:9d46fd628d54ebe1633ee3cf0fe2acfcc419cfae541c63056530e39cd5620366", + 
"aksrepos.azurecr.io/mirror/nginx:1.13.12-alpine", + "nginx:1.13.12-alpine" + ], + "sizeBytes": 18002931 + }, + { + "names": [ + "aksrepos.azurecr.io/mirror/exechealthz-amd64@sha256:34722333f0cd0b891b61c9e0efa31913f22157e341a3aabb79967305d4e78260", + "k8s.gcr.io/exechealthz-amd64@sha256:503e158c3f65ed7399f54010571c7c977ade7fe59010695f48d9650d83488c0a", + "aksrepos.azurecr.io/mirror/exechealthz-amd64:1.2", + "k8s.gcr.io/exechealthz-amd64:1.2" + ], + "sizeBytes": 8374840 + } + ], + "nodeInfo": { + "architecture": "amd64", + "bootID": "4c822e6d-c2e5-4697-9a01-467e04804fc1", + "containerRuntimeVersion": "docker://3.0.4", + "kernelVersion": "4.15.0-1037-azure", + "kubeProxyVersion": "v1.11.8", + "kubeletVersion": "v1.11.8", + "machineID": "1954026de5e6436788f214eb0dfd6a13", + "operatingSystem": "linux", + "osImage": "Ubuntu 16.04.5 LTS", + "systemUUID": "17A6A78E-D3E2-2A4F-852B-C91D933C8D5B" + } + } + }, + { + "apiVersion": "v1", + "kind": "Node", + "metadata": { + "annotations": { + "node.alpha.kubernetes.io/ttl": "0", + "volumes.kubernetes.io/controller-managed-attach-detach": "true" + }, + "creationTimestamp": "2019-06-21T02:01:53Z", + "labels": { + "agentpool": "nodepool1", + "beta.kubernetes.io/arch": "amd64", + "beta.kubernetes.io/instance-type": "Standard_DS1_v2", + "beta.kubernetes.io/os": "linux", + "failure-domain.beta.kubernetes.io/region": "eastus", + "failure-domain.beta.kubernetes.io/zone": "0", + "kubernetes.azure.com/cluster": "MC_dilipr-health-test_dilipr-health-test_eastus", + "kubernetes.io/hostname": "aks-nodepool1-19574989-2", + "kubernetes.io/role": "agent", + "node-role.kubernetes.io/agent": "", + "storageprofile": "managed", + "storagetier": "Premium_LRS" + }, + "name": "aks-nodepool1-19574989-2", + "resourceVersion": "19068101", + "selfLink": "/api/v1/nodes/aks-nodepool1-19574989-2", + "uid": "8a62e1bc-93c8-11e9-854d-ee76584a3c00" + }, + "spec": { + "podCIDR": "10.244.12.0/24", + "providerID": 
"azure:///subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourceGroups/MC_dilipr-health-test_dilipr-health-test_eastus/providers/Microsoft.Compute/virtualMachines/aks-nodepool1-19574989-2" + }, + "status": { + "addresses": [ + { + "address": "aks-nodepool1-19574989-2", + "type": "Hostname" + }, + { + "address": "10.240.0.7", + "type": "InternalIP" + } + ], + "allocatable": { + "cpu": "940m", + "ephemeral-storage": "28043041951", + "hugepages-1Gi": "0", + "hugepages-2Mi": "0", + "memory": "2480548Ki", + "pods": "110" + }, + "capacity": { + "cpu": "1", + "ephemeral-storage": "30428648Ki", + "hugepages-1Gi": "0", + "hugepages-2Mi": "0", + "memory": "3500452Ki", + "pods": "110" + }, + "conditions": [ + { + "lastHeartbeatTime": "2019-06-21T02:02:24Z", + "lastTransitionTime": "2019-06-21T02:02:24Z", + "message": "RouteController created a route", + "reason": "RouteCreated", + "status": "False", + "type": "NetworkUnavailable" + }, + { + "lastHeartbeatTime": "2019-08-23T20:43:20Z", + "lastTransitionTime": "2019-07-23T14:46:10Z", + "message": "kubelet has sufficient disk space available", + "reason": "KubeletHasSufficientDisk", + "status": "False", + "type": "OutOfDisk" + }, + { + "lastHeartbeatTime": "2019-08-23T20:43:20Z", + "lastTransitionTime": "2019-07-23T14:46:10Z", + "message": "kubelet has sufficient memory available", + "reason": "KubeletHasSufficientMemory", + "status": "False", + "type": "MemoryPressure" + }, + { + "lastHeartbeatTime": "2019-08-23T20:43:20Z", + "lastTransitionTime": "2019-07-23T14:46:10Z", + "message": "kubelet has no disk pressure", + "reason": "KubeletHasNoDiskPressure", + "status": "False", + "type": "DiskPressure" + }, + { + "lastHeartbeatTime": "2019-08-23T20:43:20Z", + "lastTransitionTime": "2019-06-21T02:01:53Z", + "message": "kubelet has sufficient PID available", + "reason": "KubeletHasSufficientPID", + "status": "False", + "type": "PIDPressure" + }, + { + "lastHeartbeatTime": "2019-08-23T20:43:20Z", + "lastTransitionTime": 
"2019-07-23T14:46:10Z", + "message": "kubelet is posting ready status. AppArmor enabled", + "reason": "KubeletReady", + "status": "True", + "type": "Ready" + } + ], + "daemonEndpoints": { + "kubeletEndpoint": { + "Port": 10250 + } + }, + "images": [ + { + "names": [ + "nickchase/rss-php-nginx@sha256:48da56a77fe4ecff4917121365d8e0ce615ebbdfe31f48a996255f5592894e2b", + "nickchase/rss-php-nginx:v1" + ], + "sizeBytes": 677038498 + }, + { + "names": [ + "rdilip83/jsonlogger@sha256:82b67ca5e0650cd5e47f5b51659d61cee035e5d8dcd8a79c50358cd2beb3b5a8", + "rdilip83/jsonlogger:v12" + ], + "sizeBytes": 676594134 + }, + { + "names": [ + "aksrepos.azurecr.io/mirror/hyperkube-amd64@sha256:1447d5b491fcee503c9f8fb712e1593dc3772c7e661251f54c297477cc716913", + "k8s.gcr.io/hyperkube-amd64@sha256:1447d5b491fcee503c9f8fb712e1593dc3772c7e661251f54c297477cc716913", + "aksrepos.azurecr.io/mirror/hyperkube-amd64:v1.11.8", + "k8s.gcr.io/hyperkube-amd64:v1.11.8" + ], + "sizeBytes": 615263658 + }, + { + "names": [ + "rdilip83/fixrubyerror@sha256:6b7f36cf6258b311015493ab025f06577d758c45bc5010d022ac160b9f40ea5d", + "rdilip83/fixrubyerror:latest" + ], + "sizeBytes": 494068028 + }, + { + "names": [ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod@sha256:69b420bdb4081293c37e2d0f8ad2e4054bd516f5c08c7512d6b695660a36eccf", + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod08222019" + ], + "sizeBytes": 494067935 + }, + { + "names": [ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod@sha256:fb2b90ce9bf7186fd9dfae97f5f72f9b9c80c8a0493af3cff74179cd4ff847c0", + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod08212019" + ], + "sizeBytes": 494067572 + }, + { + "names": [ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod@sha256:c646e180483d295ffac114fb9df513db02553af7879681814d5910764653dd2d", + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod08202019" + ], + "sizeBytes": 494067210 + }, + { + "names": [ + 
"mcr.microsoft.com/azuremonitor/containerinsights/ciprod@sha256:c21b596a22a1338ed293d01681f327acc871ee502ed779ec1109d6a93375bb3b", + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod08192019" + ], + "sizeBytes": 494055088 + }, + { + "names": [ + "rdilip83/cifeatureprod08192019@sha256:7815bba9a805e4e8df33356fd532671de45525ce9c6e936e14f9b126e2097ecd", + "rdilip83/cifeatureprod08192019:v1" + ], + "sizeBytes": 494055088 + }, + { + "names": [ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod@sha256:6387d0dedf4de0bab430f681ef61361f63a20e1c4c287a9b60ea5460283ac6cf", + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ci_feature_prod_health08192019" + ], + "sizeBytes": 494053562 + }, + { + "names": [ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod@sha256:59e34aab9f6e16a87e880b1ee1c9dd5434ee40dd29502e74aceefabf51443717", + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:internaltesthealth08192019" + ], + "sizeBytes": 494053562 + }, + { + "names": [ + "rdilip83/hc08192019@sha256:014d936771508d499ac4c15043e23b16bce8de0019fb2048b99540cbe9084895", + "rdilip83/hc08192019:1" + ], + "sizeBytes": 494053562 + }, + { + "names": [ + "rdilip83/health-rc@sha256:8ad12bce5ffd27b301bc6fe4355c8affa6fce080ae7e2291dec3a0ed11bb9483", + "rdilip83/health-rc:3" + ], + "sizeBytes": 494052863 + }, + { + "names": [ + "rdilip83/health_ci_feature_image@sha256:1a574d25884483083e8cbaacbf0cb7c4e442dc736d480615c65f5c71f8969b13", + "rdilip83/health_ci_feature_image:v1" + ], + "sizeBytes": 494052147 + }, + { + "names": [ + "rdilip83/healthrc@sha256:816c8cef09822daf050a0fca6f92e7ac19147ff4bf1a823d43fe70f73470cc0c", + "rdilip83/healthrc:v3" + ], + "sizeBytes": 494052138 + }, + { + "names": [ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod@sha256:d35aac044d1adc3d02269fde78f8dfd923db94b81288447cf6fdd482970a333b", + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:healthmerge08142019" + ], + "sizeBytes": 494052135 + }, + { + "names": [ + 
"rdilip83/healthrc@sha256:a130780e56ac0edb3ca29477e12edd5e9b5d08b5732dbd59ede9beb58e21eca7", + "rdilip83/healthrc:v2" + ], + "sizeBytes": 494051682 + }, + { + "names": [ + "rdilip83/healthmerge@sha256:24d270b0f59fb484c283922474736c3cba50f8aad0270bc0a3acd14284694eea", + "rdilip83/healthmerge:v8" + ], + "sizeBytes": 494010139 + }, + { + "names": [ + "rdilip83/health-rc@sha256:b1d24728eb808d301da426b76b7f7b79606204c4c2b695a24ac670be8276d55d", + "rdilip83/health-rc:1" + ], + "sizeBytes": 494000891 + }, + { + "names": [ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod@sha256:a0666957cccbfdf5784accd1133408bf017c28a6e694d9a2ae74da94eef2d285", + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:healthpreview08012019" + ], + "sizeBytes": 493994261 + }, + { + "names": [ + "rdilip83/mergehealth@sha256:32c9b35a6809c54d5296e2ca2b122b35a4ad8c852622174cc5a9f92cc27e56e4", + "rdilip83/mergehealth:v3" + ], + "sizeBytes": 493988815 + }, + { + "names": [ + "rdilip83/mergehealth@sha256:a3521e8f36e007b3cb949e0356a75394ac61fd2024ca1ec4827b8d54fb068534", + "rdilip83/mergehealth:v1" + ], + "sizeBytes": 493981585 + }, + { + "names": [ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod@sha256:0438e4690e042b195917e160b8949aeb339520ee19c898a8bb9452f36d1f84f1", + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:healthpreview07182019" + ], + "sizeBytes": 493977357 + }, + { + "names": [ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod@sha256:9ebc410a36856176921dba81b5bd43132469209b315f52be346690435419b9bb" + ], + "sizeBytes": 493946790 + }, + { + "names": [ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod@sha256:4e51195a9c77bd166fc90ee5f6143a4604b502ab7ef0f06431dec10c341b10f3", + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:healthpreview06272019" + ], + "sizeBytes": 493893635 + }, + { + "names": [ + "rdilip83/healthpreview06272019@sha256:d888ba5ff5e5810113a32f9c9812a5e28088cc81b902e95a185fe465a514029c", + 
"rdilip83/healthpreview06272019:latest" + ], + "sizeBytes": 493893633 + }, + { + "names": [ + "rdilip83/healthpreview06252019-1@sha256:1561876cffe94433a569f29f5231548e039193ebaa7ec640d22439675179e43f", + "rdilip83/healthpreview06252019-1:latest" + ], + "sizeBytes": 493887387 + }, + { + "names": [ + "rdilip83/healthpreview06252019@sha256:6597ff599a78ac452a4138dedb9e08c0ccd3e8b01594b033fd78ba9dbb41fe9e", + "rdilip83/healthpreview06252019:latest" + ], + "sizeBytes": 493887384 + }, + { + "names": [ + "rdilip83/healthpreview06242019@sha256:c4f565d92086d1ee56e6016178fed5c668352dc0ca0047f02910bdcb87a482c4", + "rdilip83/healthpreview06242019:latest" + ], + "sizeBytes": 493850850 + }, + { + "names": [ + "rdilip83/healthpreview06212019-1@sha256:937ce5801a0097a1cbc4eff5399c1973b4c6223ece9279b35207368b99f82b96", + "rdilip83/healthpreview06212019-1:latest" + ], + "sizeBytes": 493850674 + }, + { + "names": [ + "rdilip83/healthpreview06192019@sha256:f92cb5283814d446f0acde6a489648ea197496d5f85b27ca959ec97bce742d8a", + "rdilip83/healthpreview06192019:latest" + ], + "sizeBytes": 493799437 + }, + { + "names": [ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod@sha256:0f798cb7d56931b231f71e38e7fa5bf898b69e611247a566701f70a5f29a9799", + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod07092019" + ], + "sizeBytes": 467692116 + }, + { + "names": [ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod@sha256:3734a084fa9681c7e930eb90cad45a8f282c24af63065a720a2327b1683f3ba4", + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod06142019" + ], + "sizeBytes": 466882569 + }, + { + "names": [ + "rdilip83/mergehealth@sha256:16402c34e2d7de72c2ebc18ec8e9f7933fa25f6a7f83bceb84483ba95e3902f7", + "rdilip83/mergehealth:v2" + ], + "sizeBytes": 448931997 + }, + { + "names": [ + "rdilip83/healthpreview06212019@sha256:5860c9caaf544f2e7c46edad5cdfb69e22398e20dc87cb8a4cd630b5b7000074", + "rdilip83/healthpreview06212019:latest" + ], + "sizeBytes": 448366491 + }, + { + 
"names": [ + "deis/hcp-tunnel-front@sha256:68878ee3ea1781b322ea3952c3370e31dd89be8bb0864e2bf27bdba6dc904c41", + "deis/hcp-tunnel-front:v1.9.2-v4.0.7" + ], + "sizeBytes": 383483267 + }, + { + "names": [ + "progrium/stress@sha256:e34d56d60f5caae79333cee395aae93b74791d50e3841986420d23c2ee4697bf", + "progrium/stress:latest" + ], + "sizeBytes": 281783943 + }, + { + "names": [ + "k8s.gcr.io/cluster-autoscaler@sha256:b6834bb69e8fad88110b1dc57097a45bc79e6f2c5f2c2773c871d07389794771", + "k8s.gcr.io/cluster-autoscaler:v1.12.3" + ], + "sizeBytes": 232229241 + }, + { + "names": [ + "k8s.gcr.io/cluster-autoscaler@sha256:dc5744fd8c22aebfe40d6b62ab97d18d7bfbfc7ab1782509d69a5a9ec514df2c", + "k8s.gcr.io/cluster-autoscaler:v1.12.2" + ], + "sizeBytes": 232167833 + }, + { + "names": [ + "k8s.gcr.io/cluster-autoscaler@sha256:e71851267764a068fbb091a4ef3bb874b5ce34db48cb757fcf77779f30ef0207", + "k8s.gcr.io/cluster-autoscaler:v1.3.7" + ], + "sizeBytes": 217353965 + }, + { + "names": [ + "k8s.gcr.io/cluster-autoscaler@sha256:36a369ca4643542d501bce0addf8b903f2141ae9e2608662b77a3d24f01d7780", + "k8s.gcr.io/cluster-autoscaler:v1.2.2" + ], + "sizeBytes": 208688449 + }, + { + "names": [ + "containernetworking/azure-npm@sha256:4735da6dc0d5393d68be72498f5ce563cb930fa21b26faec8fdc844001057a56", + "containernetworking/azure-npm:v1.0.18" + ], + "sizeBytes": 170727162 + }, + { + "names": [ + "containernetworking/networkmonitor@sha256:d875511410502c3e37804e1f313cc2b0a03d7a03d3d5e6adaf8994b753a76f8e", + "containernetworking/networkmonitor:v0.0.6" + ], + "sizeBytes": 123663837 + }, + { + "names": [ + "containernetworking/networkmonitor@sha256:944408a497c451b0e79d2596dc2e9fe5036cdbba7fa831bff024e1c9ed44190d", + "containernetworking/networkmonitor:v0.0.5" + ], + "sizeBytes": 122043325 + }, + { + "names": [ + "nginx@sha256:bdbf36b7f1f77ffe7bd2a32e59235dff6ecf131e3b6b5b96061c652f30685f3a", + "nginx:latest" + ], + "sizeBytes": 109258867 + }, + { + "names": [ + 
"debian@sha256:118cf8f3557e1ea766c02f36f05f6ac3e63628427ea8965fb861be904ec35a6f", + "debian:latest" + ], + "sizeBytes": 100594230 + }, + { + "names": [ + "k8s.gcr.io/kube-addon-manager-amd64@sha256:3da3f17cd4f02fe5696f29a5e6cd4aef7111f20dab9bec54ea35942346cfeb60", + "k8s.gcr.io/kube-addon-manager-amd64:v8.8" + ], + "sizeBytes": 99631084 + }, + { + "names": [ + "k8s.gcr.io/kube-addon-manager-amd64@sha256:672794ee3582521eb8bc4f257d0f70c92893f1989f39a200f9c84bcfe1aea7c9", + "k8s.gcr.io/kube-addon-manager-amd64:v9.0" + ], + "sizeBytes": 83077558 + }, + { + "names": [ + "aksrepos.azurecr.io/mirror/kube-svc-redirect@sha256:a448687b78d24dae388bd3d54591c179c891fa078404752bc9c9dfdaecdc02ef", + "aksrepos.azurecr.io/prod/kube-svc-redirect@sha256:a448687b78d24dae388bd3d54591c179c891fa078404752bc9c9dfdaecdc02ef", + "deis/kube-svc-redirect@sha256:a448687b78d24dae388bd3d54591c179c891fa078404752bc9c9dfdaecdc02ef", + "aksrepos.azurecr.io/mirror/kube-svc-redirect:v1.0.2", + "aksrepos.azurecr.io/prod/kube-svc-redirect:v1.0.2" + ], + "sizeBytes": 82897218 + }, + { + "names": [ + "k8s.gcr.io/heapster-amd64@sha256:dccaabb0c20cf05c29baefa1e9bf0358b083ccc0fab492b9b3b47fb7e4db5472", + "k8s.gcr.io/heapster-amd64:v1.5.4" + ], + "sizeBytes": 75318342 + } + ], + "nodeInfo": { + "architecture": "amd64", + "bootID": "ee529550-afa8-43bb-90a6-f157e7e22e18", + "containerRuntimeVersion": "docker://3.0.4", + "kernelVersion": "4.15.0-1045-azure", + "kubeProxyVersion": "v1.11.8", + "kubeletVersion": "v1.11.8", + "machineID": "0e5d932888da4e17a3c58210f6c8c9db", + "operatingSystem": "linux", + "osImage": "Ubuntu 16.04.6 LTS", + "systemUUID": "5DBFC273-947F-0140-AD1F-BF6758D30B37" + } + } + }, + { + "apiVersion": "v1", + "kind": "Node", + "metadata": { + "annotations": { + "node.alpha.kubernetes.io/ttl": "0", + "volumes.kubernetes.io/controller-managed-attach-detach": "true" + }, + "creationTimestamp": "2019-08-07T18:57:56Z", + "labels": { + "agentpool": "nodepool1", + "beta.kubernetes.io/arch": "amd64", 
+ "beta.kubernetes.io/instance-type": "Standard_DS1_v2", + "beta.kubernetes.io/os": "linux", + "failure-domain.beta.kubernetes.io/region": "eastus", + "failure-domain.beta.kubernetes.io/zone": "1", + "kubernetes.azure.com/cluster": "MC_dilipr-health-test_dilipr-health-test_eastus", + "kubernetes.io/hostname": "aks-nodepool1-19574989-3", + "kubernetes.io/role": "agent", + "node-role.kubernetes.io/agent": "", + "storageprofile": "managed", + "storagetier": "Premium_LRS" + }, + "name": "aks-nodepool1-19574989-3", + "resourceVersion": "19068105", + "selfLink": "/api/v1/nodes/aks-nodepool1-19574989-3", + "uid": "448ea0a7-b945-11e9-a1b6-127094e7fd94" + }, + "spec": { + "podCIDR": "10.244.2.0/24", + "providerID": "azure:///subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourceGroups/MC_dilipr-health-test_dilipr-health-test_eastus/providers/Microsoft.Compute/virtualMachines/aks-nodepool1-19574989-3" + }, + "status": { + "addresses": [ + { + "address": "aks-nodepool1-19574989-3", + "type": "Hostname" + }, + { + "address": "10.240.0.6", + "type": "InternalIP" + } + ], + "allocatable": { + "cpu": "940m", + "ephemeral-storage": "28043041951", + "hugepages-1Gi": "0", + "hugepages-2Mi": "0", + "memory": "2480544Ki", + "pods": "110" + }, + "capacity": { + "cpu": "1", + "ephemeral-storage": "30428648Ki", + "hugepages-1Gi": "0", + "hugepages-2Mi": "0", + "memory": "3500448Ki", + "pods": "110" + }, + "conditions": [ + { + "lastHeartbeatTime": "2019-08-07T18:59:32Z", + "lastTransitionTime": "2019-08-07T18:59:32Z", + "message": "RouteController created a route", + "reason": "RouteCreated", + "status": "False", + "type": "NetworkUnavailable" + }, + { + "lastHeartbeatTime": "2019-08-23T20:43:22Z", + "lastTransitionTime": "2019-08-07T18:57:56Z", + "message": "kubelet has sufficient disk space available", + "reason": "KubeletHasSufficientDisk", + "status": "False", + "type": "OutOfDisk" + }, + { + "lastHeartbeatTime": "2019-08-23T20:43:22Z", + "lastTransitionTime": 
"2019-08-07T18:57:56Z", + "message": "kubelet has sufficient memory available", + "reason": "KubeletHasSufficientMemory", + "status": "False", + "type": "MemoryPressure" + }, + { + "lastHeartbeatTime": "2019-08-23T20:43:22Z", + "lastTransitionTime": "2019-08-07T18:57:56Z", + "message": "kubelet has no disk pressure", + "reason": "KubeletHasNoDiskPressure", + "status": "False", + "type": "DiskPressure" + }, + { + "lastHeartbeatTime": "2019-08-23T20:43:22Z", + "lastTransitionTime": "2019-08-07T18:57:56Z", + "message": "kubelet has sufficient PID available", + "reason": "KubeletHasSufficientPID", + "status": "False", + "type": "PIDPressure" + }, + { + "lastHeartbeatTime": "2019-08-23T20:43:22Z", + "lastTransitionTime": "2019-08-07T18:58:06Z", + "message": "kubelet is posting ready status. AppArmor enabled", + "reason": "KubeletReady", + "status": "True", + "type": "Ready" + } + ], + "daemonEndpoints": { + "kubeletEndpoint": { + "Port": 10250 + } + }, + "images": [ + { + "names": [ + "deis/hcp-tunnel-front@sha256:a067679f0ab376197a344cd410821cf07d69fc322dcd9af4a9229250da725ce2", + "deis/hcp-tunnel-front:v1.9.2-v4.0.4" + ], + "sizeBytes": 640504769 + }, + { + "names": [ + "aksrepos.azurecr.io/mirror/hyperkube-amd64@sha256:1447d5b491fcee503c9f8fb712e1593dc3772c7e661251f54c297477cc716913", + "k8s.gcr.io/hyperkube-amd64@sha256:1447d5b491fcee503c9f8fb712e1593dc3772c7e661251f54c297477cc716913", + "aksrepos.azurecr.io/mirror/hyperkube-amd64:v1.11.8", + "k8s.gcr.io/hyperkube-amd64:v1.11.8" + ], + "sizeBytes": 615263658 + }, + { + "names": [ + "rdilip83/fixrubyerror@sha256:6b7f36cf6258b311015493ab025f06577d758c45bc5010d022ac160b9f40ea5d", + "rdilip83/fixrubyerror:latest" + ], + "sizeBytes": 494068028 + }, + { + "names": [ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod@sha256:69b420bdb4081293c37e2d0f8ad2e4054bd516f5c08c7512d6b695660a36eccf", + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod08222019" + ], + "sizeBytes": 494067935 + }, + { + "names": [ 
+ "mcr.microsoft.com/azuremonitor/containerinsights/ciprod@sha256:fb2b90ce9bf7186fd9dfae97f5f72f9b9c80c8a0493af3cff74179cd4ff847c0", + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod08212019" + ], + "sizeBytes": 494067572 + }, + { + "names": [ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod@sha256:c646e180483d295ffac114fb9df513db02553af7879681814d5910764653dd2d", + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod08202019" + ], + "sizeBytes": 494067210 + }, + { + "names": [ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod@sha256:c21b596a22a1338ed293d01681f327acc871ee502ed779ec1109d6a93375bb3b", + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod08192019" + ], + "sizeBytes": 494055088 + }, + { + "names": [ + "rdilip83/cifeatureprod08192019@sha256:7815bba9a805e4e8df33356fd532671de45525ce9c6e936e14f9b126e2097ecd", + "rdilip83/cifeatureprod08192019:v1" + ], + "sizeBytes": 494055088 + }, + { + "names": [ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod@sha256:6387d0dedf4de0bab430f681ef61361f63a20e1c4c287a9b60ea5460283ac6cf", + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ci_feature_prod_health08192019" + ], + "sizeBytes": 494053562 + }, + { + "names": [ + "rdilip83/hc08192019@sha256:014d936771508d499ac4c15043e23b16bce8de0019fb2048b99540cbe9084895", + "rdilip83/hc08192019:1" + ], + "sizeBytes": 494053562 + }, + { + "names": [ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod@sha256:59e34aab9f6e16a87e880b1ee1c9dd5434ee40dd29502e74aceefabf51443717", + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:internaltesthealth08192019" + ], + "sizeBytes": 494053562 + }, + { + "names": [ + "rdilip83/health-rc@sha256:8ad12bce5ffd27b301bc6fe4355c8affa6fce080ae7e2291dec3a0ed11bb9483", + "rdilip83/health-rc:3" + ], + "sizeBytes": 494052863 + }, + { + "names": [ + "rdilip83/health_ci_feature_image@sha256:1a574d25884483083e8cbaacbf0cb7c4e442dc736d480615c65f5c71f8969b13", + 
"rdilip83/health_ci_feature_image:v1" + ], + "sizeBytes": 494052147 + }, + { + "names": [ + "rdilip83/healthrc@sha256:816c8cef09822daf050a0fca6f92e7ac19147ff4bf1a823d43fe70f73470cc0c", + "rdilip83/healthrc:v3" + ], + "sizeBytes": 494052138 + }, + { + "names": [ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod@sha256:d35aac044d1adc3d02269fde78f8dfd923db94b81288447cf6fdd482970a333b", + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:healthmerge08142019" + ], + "sizeBytes": 494052135 + }, + { + "names": [ + "rdilip83/healthrc@sha256:a130780e56ac0edb3ca29477e12edd5e9b5d08b5732dbd59ede9beb58e21eca7", + "rdilip83/healthrc:v2" + ], + "sizeBytes": 494051682 + }, + { + "names": [ + "rdilip83/healthmerge@sha256:24d270b0f59fb484c283922474736c3cba50f8aad0270bc0a3acd14284694eea", + "rdilip83/healthmerge:v8" + ], + "sizeBytes": 494010139 + }, + { + "names": [ + "rdilip83/health-rc@sha256:b1d24728eb808d301da426b76b7f7b79606204c4c2b695a24ac670be8276d55d", + "rdilip83/health-rc:1" + ], + "sizeBytes": 494000891 + }, + { + "names": [ + "rdilip83/mergehealth@sha256:32c9b35a6809c54d5296e2ca2b122b35a4ad8c852622174cc5a9f92cc27e56e4", + "rdilip83/mergehealth:v3" + ], + "sizeBytes": 493988815 + }, + { + "names": [ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod@sha256:0438e4690e042b195917e160b8949aeb339520ee19c898a8bb9452f36d1f84f1", + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:healthpreview07182019" + ], + "sizeBytes": 493977357 + }, + { + "names": [ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod@sha256:4e51195a9c77bd166fc90ee5f6143a4604b502ab7ef0f06431dec10c341b10f3", + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:healthpreview06272019" + ], + "sizeBytes": 493893635 + }, + { + "names": [ + "rdilip83/healthpreview06272019@sha256:d888ba5ff5e5810113a32f9c9812a5e28088cc81b902e95a185fe465a514029c", + "rdilip83/healthpreview06272019:latest" + ], + "sizeBytes": 493893633 + }, + { + "names": [ + 
"aksrepos.azurecr.io/prod/hcp-tunnel-front@sha256:68878ee3ea1781b322ea3952c3370e31dd89be8bb0864e2bf27bdba6dc904c41", + "aksrepos.azurecr.io/prod/hcp-tunnel-front:v1.9.2-v4.0.7" + ], + "sizeBytes": 383483267 + }, + { + "names": [ + "k8s.gcr.io/cluster-autoscaler@sha256:b6834bb69e8fad88110b1dc57097a45bc79e6f2c5f2c2773c871d07389794771", + "k8s.gcr.io/cluster-autoscaler:v1.12.3" + ], + "sizeBytes": 232229241 + }, + { + "names": [ + "k8s.gcr.io/cluster-autoscaler@sha256:dc5744fd8c22aebfe40d6b62ab97d18d7bfbfc7ab1782509d69a5a9ec514df2c", + "k8s.gcr.io/cluster-autoscaler:v1.12.2" + ], + "sizeBytes": 232167833 + }, + { + "names": [ + "k8s.gcr.io/cluster-autoscaler@sha256:193eaf37788dd5f971dd400b7e3d28e650bfd81c90fa46b234f03eb3d43880e3", + "k8s.gcr.io/cluster-autoscaler:v1.12.5" + ], + "sizeBytes": 231543459 + }, + { + "names": [ + "k8s.gcr.io/cluster-autoscaler@sha256:e71851267764a068fbb091a4ef3bb874b5ce34db48cb757fcf77779f30ef0207", + "k8s.gcr.io/cluster-autoscaler:v1.3.7" + ], + "sizeBytes": 217353965 + }, + { + "names": [ + "k8s.gcr.io/cluster-autoscaler@sha256:156b7b9bcba24ed474f67d0feaf27f2506013f15b030341bbd41c630283161b8", + "k8s.gcr.io/cluster-autoscaler:v1.3.4" + ], + "sizeBytes": 217264129 + }, + { + "names": [ + "k8s.gcr.io/cluster-autoscaler@sha256:97896235bf66bde573d6f2ee150e212ea7010d314eb5d2cfb2ff1af93335db30", + "k8s.gcr.io/cluster-autoscaler:v1.3.3" + ], + "sizeBytes": 217259793 + }, + { + "names": [ + "k8s.gcr.io/cluster-autoscaler@sha256:b416bf3b6687788b4da4c7ede2bcf067b34ad781862ee3d3dac1d720c5fa38b3", + "k8s.gcr.io/cluster-autoscaler:v1.3.9" + ], + "sizeBytes": 216696035 + }, + { + "names": [ + "k8s.gcr.io/cluster-autoscaler@sha256:f37a2c84614bdd02475ccb020182caec562cde97fdfd9dae58de66ff89614bc5", + "k8s.gcr.io/cluster-autoscaler:v1.3.8" + ], + "sizeBytes": 216693526 + }, + { + "names": [ + "k8s.gcr.io/cluster-autoscaler@sha256:b0777becbfc7a56e66b079d2767fdc173121a29165523bbbe309bcb2c0a226aa", + "k8s.gcr.io/cluster-autoscaler:v1.2.5" + ], + "sizeBytes": 
212991966 + }, + { + "names": [ + "k8s.gcr.io/cluster-autoscaler@sha256:36a369ca4643542d501bce0addf8b903f2141ae9e2608662b77a3d24f01d7780", + "k8s.gcr.io/cluster-autoscaler:v1.2.2" + ], + "sizeBytes": 208688449 + }, + { + "names": [ + "mcr.microsoft.com/containernetworking/azure-npm@sha256:7b9e7dec6b06a21595f9aa06b319c99b579950619fa869dd85dc637b2235d79f", + "mcr.microsoft.com/containernetworking/azure-npm:v1.0.18" + ], + "sizeBytes": 170727162 + }, + { + "names": [ + "k8s.gcr.io/cluster-autoscaler@sha256:760232bed2097b5ca742f05b15c94d56ff96ed6b5c93251edc613be045c8d78b", + "k8s.gcr.io/cluster-autoscaler:v1.15.0" + ], + "sizeBytes": 152214996 + }, + { + "names": [ + "k8s.gcr.io/cluster-autoscaler@sha256:a4e5a8e6d4dc011e6e7a104d6abdfda56274b90357ee9f6e42cc22b70482420b", + "k8s.gcr.io/cluster-autoscaler:v1.14.0" + ], + "sizeBytes": 142102721 + }, + { + "names": [ + "k8s.gcr.io/cluster-autoscaler@sha256:cbc61e0f6c3ef1c591a0f22ec483826110e2c10acddd5415c0cc2305fd085e69", + "k8s.gcr.io/cluster-autoscaler:v1.14.2" + ], + "sizeBytes": 142099784 + }, + { + "names": [ + "k8s.gcr.io/cluster-autoscaler@sha256:9dcbd91e79f33c44529de58a0024deb3da23a3a0bc7fd4d028c1255c68f62fb7", + "k8s.gcr.io/cluster-autoscaler:v1.13.2" + ], + "sizeBytes": 136684274 + }, + { + "names": [ + "k8s.gcr.io/cluster-autoscaler@sha256:e4140dc3ab54e115ab4464331b25022fc5ffb947b568aaf81089efb72506c895", + "k8s.gcr.io/cluster-autoscaler:v1.13.4" + ], + "sizeBytes": 136681463 + }, + { + "names": [ + "k8s.gcr.io/cluster-autoscaler@sha256:7ff5a60304b344f2f29c804c7253632bbc818794f6932236a56db107a6a8f5af", + "k8s.gcr.io/cluster-autoscaler:v1.13.1" + ], + "sizeBytes": 136618018 + }, + { + "names": [ + "mcr.microsoft.com/containernetworking/networkmonitor@sha256:d875511410502c3e37804e1f313cc2b0a03d7a03d3d5e6adaf8994b753a76f8e", + "mcr.microsoft.com/containernetworking/networkmonitor:v0.0.6" + ], + "sizeBytes": 123663837 + }, + { + "names": [ + 
"mcr.microsoft.com/containernetworking/networkmonitor@sha256:944408a497c451b0e79d2596dc2e9fe5036cdbba7fa831bff024e1c9ed44190d", + "mcr.microsoft.com/containernetworking/networkmonitor:v0.0.5" + ], + "sizeBytes": 122043325 + }, + { + "names": [ + "k8s.gcr.io/kubernetes-dashboard-amd64@sha256:0ae6b69432e78069c5ce2bcde0fe409c5c4d6f0f4d9cd50a17974fea38898747", + "k8s.gcr.io/kubernetes-dashboard-amd64:v1.10.1" + ], + "sizeBytes": 121711221 + }, + { + "names": [ + "k8s.gcr.io/kube-addon-manager-amd64@sha256:3da3f17cd4f02fe5696f29a5e6cd4aef7111f20dab9bec54ea35942346cfeb60", + "k8s.gcr.io/kube-addon-manager-amd64:v8.8" + ], + "sizeBytes": 99631084 + }, + { + "names": [ + "k8s.gcr.io/kube-addon-manager-amd64@sha256:2fd1daf3d3cf0e94a753f2263b60dbb0d42b107b5cde0c75ee3fc5c830e016e4", + "k8s.gcr.io/kube-addon-manager-amd64:v8.9" + ], + "sizeBytes": 99240637 + }, + { + "names": [ + "microsoft/virtual-kubelet@sha256:efc397d741d7e590c892c0ea5dccc9a800656c3adb95da4dae25c1cdd5eb6d9f", + "microsoft/virtual-kubelet:latest" + ], + "sizeBytes": 87436458 + }, + { + "names": [ + "k8s.gcr.io/kube-addon-manager-amd64@sha256:672794ee3582521eb8bc4f257d0f70c92893f1989f39a200f9c84bcfe1aea7c9", + "k8s.gcr.io/kube-addon-manager-amd64:v9.0" + ], + "sizeBytes": 83077558 + }, + { + "names": [ + "k8s.gcr.io/kube-addon-manager-amd64@sha256:382c220b3531d9f95bf316a16b7282cc2ef929cd8a89a9dd3f5933edafc41a8e", + "k8s.gcr.io/kube-addon-manager-amd64:v9.0.1" + ], + "sizeBytes": 83076194 + }, + { + "names": [ + "aksrepos.azurecr.io/prod/kube-svc-redirect@sha256:a448687b78d24dae388bd3d54591c179c891fa078404752bc9c9dfdaecdc02ef", + "deis/kube-svc-redirect@sha256:a448687b78d24dae388bd3d54591c179c891fa078404752bc9c9dfdaecdc02ef", + "aksrepos.azurecr.io/prod/kube-svc-redirect:v1.0.2", + "deis/kube-svc-redirect:v1.0.2" + ], + "sizeBytes": 82897218 + }, + { + "names": [ + "k8s.gcr.io/kube-addon-manager-amd64@sha256:3519273916ba45cfc9b318448d4629819cb5fbccbb0822cce054dd8c1f68cb60", + 
"k8s.gcr.io/kube-addon-manager-amd64:v8.6" + ], + "sizeBytes": 78384272 + } + ], + "nodeInfo": { + "architecture": "amd64", + "bootID": "47e7c02b-3741-42be-a2a1-76c76aa8ccde", + "containerRuntimeVersion": "docker://3.0.6", + "kernelVersion": "4.15.0-1050-azure", + "kubeProxyVersion": "v1.11.8", + "kubeletVersion": "v1.11.8", + "machineID": "a4a4bc2f5a944cd38aba89365df05227", + "operatingSystem": "linux", + "osImage": "Ubuntu 16.04.6 LTS", + "systemUUID": "BB102B43-9922-264C-8C23-22A7DE0F950F" + } + } + } + ], + "kind": "List", + "metadata": { + "resourceVersion": "", + "selfLink": "" + } +} diff --git a/test/code/plugin/health/parent_monitor_provider_spec.rb b/test/code/plugin/health/parent_monitor_provider_spec.rb index a83db50fc..b531629c4 100644 --- a/test/code/plugin/health/parent_monitor_provider_spec.rb +++ b/test/code/plugin/health/parent_monitor_provider_spec.rb @@ -97,7 +97,7 @@ def monitor.labels; {HealthMonitorLabels::ROLE => "master"}; end assert_equal parent_id, "master_node_pool" end - it 'raises if conditions are not met' do + it 'returns defaultParentMonitorTypeId if conditions are not met' do #arrange definition = JSON.parse('{"conditional_monitor_id": { "conditions": [ @@ -123,6 +123,7 @@ def monitor.labels; {HealthMonitorLabels::ROLE => "master"}; end "container.azm.ms/cluster-resource-group", "container.azm.ms/cluster-name" ], + "default_parent_monitor_id": "default_parent_monitor_id", "aggregation_algorithm": "worstOf", "aggregation_algorithm_params": null } @@ -137,8 +138,9 @@ def monitor.monitor_instance_id; "conditional_monitor_instance_id"; end def monitor.labels; {HealthMonitorLabels::ROLE => "master1"}; end #act and assert - assert_raises do - parent_id = health_model_definition.get_parent_monitor_id(monitor) - end + + parent_id = health_model_definition.get_parent_monitor_id(monitor) + parent_id.must_equal('default_parent_monitor_id') + end end diff --git a/test/code/plugin/health/pods.json b/test/code/plugin/health/pods.json new file 
mode 100644 index 000000000..b7c202a19 --- /dev/null +++ b/test/code/plugin/health/pods.json @@ -0,0 +1,5987 @@ +{ + "apiVersion": "v1", + "items": [ + { + "apiVersion": "v1", + "kind": "Pod", + "metadata": { + "creationTimestamp": "2019-08-23T17:12:10Z", + "generateName": "heapster-9bcbfdcf5-", + "labels": { + "k8s-app": "heapster", + "pod-template-hash": "567698791" + }, + "name": "heapster-9bcbfdcf5-zp9tl", + "namespace": "kube-system", + "ownerReferences": [ + { + "apiVersion": "apps/v1", + "blockOwnerDeletion": true, + "controller": true, + "kind": "ReplicaSet", + "name": "heapster-9bcbfdcf5", + "uid": "24a0036e-c5c9-11e9-8736-86290fd7dd1f" + } + ], + "resourceVersion": "19048925", + "selfLink": "/api/v1/namespaces/kube-system/pods/heapster-9bcbfdcf5-zp9tl", + "uid": "24ab7e32-c5c9-11e9-8736-86290fd7dd1f" + }, + "spec": { + "affinity": { + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "kubernetes.azure.com/cluster", + "operator": "Exists" + } + ] + } + ] + } + } + }, + "containers": [ + { + "command": [ + "/heapster", + "--source=kubernetes.summary_api:\"\"" + ], + "env": [ + { + "name": "KUBERNETES_PORT_443_TCP_ADDR", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + }, + { + "name": "KUBERNETES_PORT", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_PORT_443_TCP", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_SERVICE_HOST", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + } + ], + "image": "aksrepos.azurecr.io/mirror/heapster-amd64:v1.5.3", + "imagePullPolicy": "IfNotPresent", + "livenessProbe": { + "failureThreshold": 3, + "httpGet": { + "path": "/healthz", + "port": 8082, + "scheme": "HTTP" + }, + "initialDelaySeconds": 180, + "periodSeconds": 10, + 
"successThreshold": 1, + "timeoutSeconds": 5 + }, + "name": "heapster", + "resources": { + "limits": { + "cpu": "88m", + "memory": "204Mi" + }, + "requests": { + "cpu": "88m", + "memory": "204Mi" + } + }, + "terminationMessagePath": "/dev/termination-log", + "terminationMessagePolicy": "File", + "volumeMounts": [ + { + "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount", + "name": "heapster-token-7z7c5", + "readOnly": true + } + ] + }, + { + "command": [ + "/pod_nanny", + "--config-dir=/etc/config", + "--cpu=80m", + "--extra-cpu=0.5m", + "--memory=140Mi", + "--extra-memory=4Mi", + "--threshold=5", + "--deployment=heapster", + "--container=heapster", + "--poll-period=300000", + "--estimator=exponential" + ], + "env": [ + { + "name": "MY_POD_NAME", + "valueFrom": { + "fieldRef": { + "apiVersion": "v1", + "fieldPath": "metadata.name" + } + } + }, + { + "name": "MY_POD_NAMESPACE", + "valueFrom": { + "fieldRef": { + "apiVersion": "v1", + "fieldPath": "metadata.namespace" + } + } + }, + { + "name": "KUBERNETES_PORT_443_TCP_ADDR", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + }, + { + "name": "KUBERNETES_PORT", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_PORT_443_TCP", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_SERVICE_HOST", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + } + ], + "image": "aksrepos.azurecr.io/mirror/addon-resizer:1.8.1", + "imagePullPolicy": "IfNotPresent", + "name": "heapster-nanny", + "resources": { + "limits": { + "cpu": "50m", + "memory": "90Mi" + }, + "requests": { + "cpu": "50m", + "memory": "90Mi" + } + }, + "terminationMessagePath": "/dev/termination-log", + "terminationMessagePolicy": "File", + "volumeMounts": [ + { + "mountPath": "/etc/config", + "name": "heapster-config-volume" + }, + { + "mountPath": 
"/var/run/secrets/kubernetes.io/serviceaccount", + "name": "heapster-token-7z7c5", + "readOnly": true + } + ] + } + ], + "dnsPolicy": "ClusterFirst", + "imagePullSecrets": [ + { + "name": "emptyacrsecret" + } + ], + "nodeName": "aks-nodepool1-19574989-0", + "nodeSelector": { + "beta.kubernetes.io/os": "linux" + }, + "priority": 2000001000, + "priorityClassName": "system-node-critical", + "restartPolicy": "Always", + "schedulerName": "default-scheduler", + "securityContext": {}, + "serviceAccount": "heapster", + "serviceAccountName": "heapster", + "terminationGracePeriodSeconds": 30, + "tolerations": [ + { + "key": "CriticalAddonsOnly", + "operator": "Exists" + }, + { + "effect": "NoExecute", + "key": "node.kubernetes.io/not-ready", + "operator": "Exists", + "tolerationSeconds": 300 + }, + { + "effect": "NoExecute", + "key": "node.kubernetes.io/unreachable", + "operator": "Exists", + "tolerationSeconds": 300 + } + ], + "volumes": [ + { + "configMap": { + "defaultMode": 420, + "name": "heapster-config" + }, + "name": "heapster-config-volume" + }, + { + "name": "heapster-token-7z7c5", + "secret": { + "defaultMode": 420, + "secretName": "heapster-token-7z7c5" + } + } + ] + }, + "status": { + "conditions": [ + { + "lastProbeTime": null, + "lastTransitionTime": "2019-08-23T17:12:10Z", + "status": "True", + "type": "Initialized" + }, + { + "lastProbeTime": null, + "lastTransitionTime": "2019-08-23T17:12:26Z", + "status": "True", + "type": "Ready" + }, + { + "lastProbeTime": null, + "lastTransitionTime": null, + "status": "True", + "type": "ContainersReady" + }, + { + "lastProbeTime": null, + "lastTransitionTime": "2019-08-23T17:12:10Z", + "status": "True", + "type": "PodScheduled" + } + ], + "containerStatuses": [ + { + "containerID": "docker://8ab1ee82d29d0351cb21dbce4db9eb2a270407d2ebe10377be02edd46cb34027", + "image": "aksrepos.azurecr.io/mirror/heapster-amd64:v1.5.3", + "imageID": 
"docker-pullable://aksrepos.azurecr.io/mirror/heapster-amd64@sha256:fc33c690a3a446de5abc24b048b88050810a58b9e4477fa763a43d7df029301a", + "lastState": {}, + "name": "heapster", + "ready": true, + "restartCount": 0, + "state": { + "running": { + "startedAt": "2019-08-23T17:12:21Z" + } + } + }, + { + "containerID": "docker://42154ff41fed196c3f4b8a485436537330d16bcef23c743a34cf63202d023453", + "image": "aksrepos.azurecr.io/mirror/addon-resizer:1.8.1", + "imageID": "docker-pullable://aksrepos.azurecr.io/mirror/addon-resizer@sha256:8ac3ffa4232046feb297cefc40734641fa2954c16308f9e0d70ec152f22231ca", + "lastState": {}, + "name": "heapster-nanny", + "ready": true, + "restartCount": 0, + "state": { + "running": { + "startedAt": "2019-08-23T17:12:25Z" + } + } + } + ], + "hostIP": "10.240.0.4", + "phase": "Running", + "podIP": "10.244.1.33", + "qosClass": "Guaranteed", + "startTime": "2019-08-23T17:12:10Z" + } + }, + { + "apiVersion": "v1", + "kind": "Pod", + "metadata": { + "annotations": { + "scheduler.alpha.kubernetes.io/critical-pod": "", + "seccomp.security.alpha.kubernetes.io/pod": "docker/default" + }, + "creationTimestamp": "2019-07-09T02:38:06Z", + "generateName": "kube-dns-autoscaler-7d64798d95-", + "labels": { + "k8s-app": "kube-dns-autoscaler", + "pod-template-hash": "3820354851" + }, + "name": "kube-dns-autoscaler-7d64798d95-f9wcv", + "namespace": "kube-system", + "ownerReferences": [ + { + "apiVersion": "apps/v1", + "blockOwnerDeletion": true, + "controller": true, + "kind": "ReplicaSet", + "name": "kube-dns-autoscaler-7d64798d95", + "uid": "71655f71-a1f2-11e9-9bc6-127bb0ec03b8" + } + ], + "resourceVersion": "15144041", + "selfLink": "/api/v1/namespaces/kube-system/pods/kube-dns-autoscaler-7d64798d95-f9wcv", + "uid": "94e52ab1-a1f2-11e9-8b08-d602e29755d5" + }, + "spec": { + "containers": [ + { + "command": [ + "/cluster-proportional-autoscaler", + "--namespace=kube-system", + "--configmap=kube-dns-autoscaler", + "--target=deployment/kube-dns-v20", + 
"--default-params={\"ladder\":{\"coresToReplicas\":[[1,2],[512,3],[1024,4],[2048,5]],\"nodesToReplicas\":[[1,2],[8,3],[16,4],[32,5]]}}", + "--logtostderr=true", + "--v=2" + ], + "env": [ + { + "name": "KUBERNETES_PORT_443_TCP_ADDR", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + }, + { + "name": "KUBERNETES_PORT", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_PORT_443_TCP", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_SERVICE_HOST", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + } + ], + "image": "aksrepos.azurecr.io/mirror/cluster-proportional-autoscaler-amd64:1.1.2-r2", + "imagePullPolicy": "IfNotPresent", + "name": "autoscaler", + "resources": { + "requests": { + "cpu": "20m", + "memory": "10Mi" + } + }, + "terminationMessagePath": "/dev/termination-log", + "terminationMessagePolicy": "File", + "volumeMounts": [ + { + "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount", + "name": "kube-dns-autoscaler-token-zkxt8", + "readOnly": true + } + ] + } + ], + "dnsPolicy": "Default", + "imagePullSecrets": [ + { + "name": "emptyacrsecret" + } + ], + "nodeName": "aks-nodepool1-19574989-2", + "priority": 2000001000, + "priorityClassName": "system-node-critical", + "restartPolicy": "Always", + "schedulerName": "default-scheduler", + "securityContext": {}, + "serviceAccount": "kube-dns-autoscaler", + "serviceAccountName": "kube-dns-autoscaler", + "terminationGracePeriodSeconds": 30, + "tolerations": [ + { + "key": "CriticalAddonsOnly", + "operator": "Exists" + }, + { + "effect": "NoExecute", + "key": "node.kubernetes.io/not-ready", + "operator": "Exists", + "tolerationSeconds": 300 + }, + { + "effect": "NoExecute", + "key": "node.kubernetes.io/unreachable", + "operator": "Exists", + "tolerationSeconds": 300 + } + ], + "volumes": [ + { + "name": 
"kube-dns-autoscaler-token-zkxt8", + "secret": { + "defaultMode": 420, + "secretName": "kube-dns-autoscaler-token-zkxt8" + } + } + ] + }, + "status": { + "conditions": [ + { + "lastProbeTime": null, + "lastTransitionTime": "2019-07-09T02:38:07Z", + "status": "True", + "type": "Initialized" + }, + { + "lastProbeTime": null, + "lastTransitionTime": "2019-07-09T02:38:44Z", + "status": "True", + "type": "Ready" + }, + { + "lastProbeTime": null, + "lastTransitionTime": null, + "status": "True", + "type": "ContainersReady" + }, + { + "lastProbeTime": null, + "lastTransitionTime": "2019-07-09T02:38:06Z", + "status": "True", + "type": "PodScheduled" + } + ], + "containerStatuses": [ + { + "containerID": "docker://43f5fde3ce0f375a40c08de56087fc3b53f6269b239a3e6383d2082779504b96", + "image": "aksrepos.azurecr.io/mirror/cluster-proportional-autoscaler-amd64:1.1.2-r2", + "imageID": "docker-pullable://aksrepos.azurecr.io/mirror/cluster-proportional-autoscaler-amd64@sha256:ccd2b031b116750091443930a8e6d0f785cfde38f137969e472b2ac850aeddfb", + "lastState": {}, + "name": "autoscaler", + "ready": true, + "restartCount": 0, + "state": { + "running": { + "startedAt": "2019-07-09T02:38:40Z" + } + } + } + ], + "hostIP": "10.240.0.7", + "phase": "Running", + "podIP": "10.244.12.118", + "qosClass": "Burstable", + "startTime": "2019-07-09T02:38:07Z" + } + }, + { + "apiVersion": "v1", + "kind": "Pod", + "metadata": { + "annotations": { + "prometheus.io/port": "10055", + "prometheus.io/scrape": "true" + }, + "creationTimestamp": "2019-07-09T02:38:06Z", + "generateName": "kube-dns-v20-55cb5d96f7-", + "labels": { + "k8s-app": "kube-dns", + "kubernetes.io/cluster-service": "true", + "pod-template-hash": "1176185293", + "version": "v20" + }, + "name": "kube-dns-v20-55cb5d96f7-lmrpl", + "namespace": "kube-system", + "ownerReferences": [ + { + "apiVersion": "apps/v1", + "blockOwnerDeletion": true, + "controller": true, + "kind": "ReplicaSet", + "name": "kube-dns-v20-55cb5d96f7", + "uid": 
"71892fd6-a1f2-11e9-9bc6-127bb0ec03b8" + } + ], + "resourceVersion": "15144030", + "selfLink": "/api/v1/namespaces/kube-system/pods/kube-dns-v20-55cb5d96f7-lmrpl", + "uid": "952488f3-a1f2-11e9-8b08-d602e29755d5" + }, + "spec": { + "affinity": { + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "kubernetes.azure.com/cluster", + "operator": "Exists" + } + ] + } + ] + } + }, + "podAntiAffinity": { + "preferredDuringSchedulingIgnoredDuringExecution": [ + { + "podAffinityTerm": { + "labelSelector": { + "matchExpressions": [ + { + "key": "k8s-app", + "operator": "In", + "values": [ + "kube-dns" + ] + } + ] + }, + "topologyKey": "kubernetes.io/hostname" + }, + "weight": 100 + } + ] + } + }, + "containers": [ + { + "args": [ + "--kubecfg-file=/config/kubeconfig", + "--config-dir=/kube-dns-config", + "--domain=cluster.local.", + "--dns-port=10053", + "--v=2" + ], + "env": [ + { + "name": "PROMETHEUS_PORT", + "value": "10055" + }, + { + "name": "KUBERNETES_PORT_443_TCP_ADDR", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + }, + { + "name": "KUBERNETES_PORT", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_PORT_443_TCP", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_SERVICE_HOST", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + } + ], + "image": "aksrepos.azurecr.io/mirror/k8s-dns-kube-dns-amd64:1.14.13", + "imagePullPolicy": "IfNotPresent", + "livenessProbe": { + "failureThreshold": 5, + "httpGet": { + "path": "/healthcheck/kubedns", + "port": 10054, + "scheme": "HTTP" + }, + "initialDelaySeconds": 60, + "periodSeconds": 10, + "successThreshold": 1, + "timeoutSeconds": 5 + }, + "name": "kubedns", + "ports": [ + { + "containerPort": 10053, + "name": "dns-local", + "protocol": "UDP" + 
}, + { + "containerPort": 10053, + "name": "dns-tcp-local", + "protocol": "TCP" + }, + { + "containerPort": 10055, + "name": "metrics", + "protocol": "TCP" + } + ], + "readinessProbe": { + "failureThreshold": 3, + "httpGet": { + "path": "/readiness", + "port": 8081, + "scheme": "HTTP" + }, + "initialDelaySeconds": 30, + "periodSeconds": 10, + "successThreshold": 1, + "timeoutSeconds": 5 + }, + "resources": { + "limits": { + "memory": "170Mi" + }, + "requests": { + "cpu": "100m", + "memory": "70Mi" + } + }, + "terminationMessagePath": "/dev/termination-log", + "terminationMessagePolicy": "File", + "volumeMounts": [ + { + "mountPath": "/kube-dns-config", + "name": "kube-dns-config" + }, + { + "mountPath": "/config", + "name": "kubedns-kubecfg", + "readOnly": true + }, + { + "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount", + "name": "kube-dns-token-ghgtl", + "readOnly": true + } + ] + }, + { + "args": [ + "-v=2", + "-logtostderr", + "-configDir=/kube-dns-config", + "-restartDnsmasq=true", + "--", + "-k", + "--cache-size=1000", + "--no-negcache", + "--no-resolv", + "--server=127.0.0.1#10053", + "--server=/cluster.local/127.0.0.1#10053", + "--server=/in-addr.arpa/127.0.0.1#10053", + "--server=/ip6.arpa/127.0.0.1#10053", + "--log-facility=-" + ], + "env": [ + { + "name": "KUBERNETES_PORT_443_TCP_ADDR", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + }, + { + "name": "KUBERNETES_PORT", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_PORT_443_TCP", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_SERVICE_HOST", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + } + ], + "image": "aksrepos.azurecr.io/mirror/k8s-dns-dnsmasq-nanny-amd64:1.14.10", + "imagePullPolicy": "IfNotPresent", + "name": "dnsmasq", + "ports": [ + { + "containerPort": 53, + "name": "dns", + 
"protocol": "UDP" + }, + { + "containerPort": 53, + "name": "dns-tcp", + "protocol": "TCP" + } + ], + "resources": {}, + "terminationMessagePath": "/dev/termination-log", + "terminationMessagePolicy": "File", + "volumeMounts": [ + { + "mountPath": "/kube-dns-config", + "name": "kube-dns-config" + }, + { + "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount", + "name": "kube-dns-token-ghgtl", + "readOnly": true + } + ] + }, + { + "args": [ + "--cmd=for d in $PROBE_DOMAINS; do nslookup $d 127.0.0.1 \u003e/dev/null || exit 1; done", + "--url=/healthz-dnsmasq", + "--cmd=for d in $PROBE_DOMAINS; do nslookup $d 127.0.0.1:10053 \u003e/dev/null || exit 1; done", + "--url=/healthz-kubedns", + "--port=8080", + "--quiet" + ], + "env": [ + { + "name": "PROBE_DOMAINS", + "value": "bing.com kubernetes.default.svc.cluster.local" + }, + { + "name": "KUBERNETES_PORT_443_TCP_ADDR", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + }, + { + "name": "KUBERNETES_PORT", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_PORT_443_TCP", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_SERVICE_HOST", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + } + ], + "image": "aksrepos.azurecr.io/mirror/exechealthz-amd64:1.2", + "imagePullPolicy": "IfNotPresent", + "livenessProbe": { + "failureThreshold": 5, + "httpGet": { + "path": "/healthz-dnsmasq", + "port": 8080, + "scheme": "HTTP" + }, + "initialDelaySeconds": 60, + "periodSeconds": 10, + "successThreshold": 1, + "timeoutSeconds": 5 + }, + "name": "healthz", + "ports": [ + { + "containerPort": 8080, + "protocol": "TCP" + } + ], + "resources": { + "limits": { + "memory": "50Mi" + }, + "requests": { + "cpu": "10m", + "memory": "50Mi" + } + }, + "terminationMessagePath": "/dev/termination-log", + "terminationMessagePolicy": "File", + 
"volumeMounts": [ + { + "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount", + "name": "kube-dns-token-ghgtl", + "readOnly": true + } + ] + }, + { + "args": [ + "--v=2", + "--logtostderr", + "--probe=kubedns,127.0.0.1:10053,kubernetes.default.svc.cluster.local,5,SRV", + "--probe=dnsmasq,127.0.0.1:53,kubernetes.default.svc.cluster.local,5,SRV" + ], + "env": [ + { + "name": "KUBERNETES_PORT_443_TCP_ADDR", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + }, + { + "name": "KUBERNETES_PORT", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_PORT_443_TCP", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_SERVICE_HOST", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + } + ], + "image": "aksrepos.azurecr.io/mirror/k8s-dns-sidecar-amd64:1.14.10", + "imagePullPolicy": "IfNotPresent", + "livenessProbe": { + "failureThreshold": 3, + "httpGet": { + "path": "/metrics", + "port": 10054, + "scheme": "HTTP" + }, + "initialDelaySeconds": 60, + "periodSeconds": 10, + "successThreshold": 1, + "timeoutSeconds": 5 + }, + "name": "sidecar", + "ports": [ + { + "containerPort": 10054, + "name": "metrics", + "protocol": "TCP" + } + ], + "resources": { + "requests": { + "cpu": "10m", + "memory": "20Mi" + } + }, + "terminationMessagePath": "/dev/termination-log", + "terminationMessagePolicy": "File", + "volumeMounts": [ + { + "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount", + "name": "kube-dns-token-ghgtl", + "readOnly": true + } + ] + } + ], + "dnsPolicy": "Default", + "imagePullSecrets": [ + { + "name": "emptyacrsecret" + } + ], + "nodeName": "aks-nodepool1-19574989-1", + "nodeSelector": { + "beta.kubernetes.io/os": "linux" + }, + "priority": 2000001000, + "priorityClassName": "system-node-critical", + "restartPolicy": "Always", + "schedulerName": "default-scheduler", + 
"securityContext": {}, + "serviceAccount": "kube-dns", + "serviceAccountName": "kube-dns", + "terminationGracePeriodSeconds": 30, + "tolerations": [ + { + "key": "CriticalAddonsOnly", + "operator": "Exists" + }, + { + "effect": "NoExecute", + "key": "node.kubernetes.io/not-ready", + "operator": "Exists", + "tolerationSeconds": 300 + }, + { + "effect": "NoExecute", + "key": "node.kubernetes.io/unreachable", + "operator": "Exists", + "tolerationSeconds": 300 + } + ], + "volumes": [ + { + "configMap": { + "defaultMode": 420, + "name": "kube-dns", + "optional": true + }, + "name": "kube-dns-config" + }, + { + "configMap": { + "defaultMode": 420, + "name": "kubedns-kubecfg" + }, + "name": "kubedns-kubecfg" + }, + { + "name": "kube-dns-token-ghgtl", + "secret": { + "defaultMode": 420, + "secretName": "kube-dns-token-ghgtl" + } + } + ] + }, + "status": { + "conditions": [ + { + "lastProbeTime": null, + "lastTransitionTime": "2019-07-09T02:38:09Z", + "status": "True", + "type": "Initialized" + }, + { + "lastProbeTime": null, + "lastTransitionTime": "2019-07-09T02:38:50Z", + "status": "True", + "type": "Ready" + }, + { + "lastProbeTime": null, + "lastTransitionTime": null, + "status": "True", + "type": "ContainersReady" + }, + { + "lastProbeTime": null, + "lastTransitionTime": "2019-07-09T02:38:06Z", + "status": "True", + "type": "PodScheduled" + } + ], + "containerStatuses": [ + { + "containerID": "docker://8aa7d794d423f29469d8a35cc295bfaf2434a26756d7063fb19e06ce838aa5d9", + "image": "aksrepos.azurecr.io/mirror/k8s-dns-dnsmasq-nanny-amd64:1.14.10", + "imageID": "docker-pullable://aksrepos.azurecr.io/mirror/k8s-dns-dnsmasq-nanny-amd64@sha256:bbb2a290a568125b3b996028958eb773f33b5b87a6b37bf38a28f8b62dddb3c8", + "lastState": {}, + "name": "dnsmasq", + "ready": true, + "restartCount": 0, + "state": { + "running": { + "startedAt": "2019-07-09T02:38:23Z" + } + } + }, + { + "containerID": "docker://7ee72258ca97555017c3096c3c125935b22e1735dafd494bec7f5480a408314a", + "image": 
"aksrepos.azurecr.io/mirror/exechealthz-amd64:1.2", + "imageID": "docker-pullable://aksrepos.azurecr.io/mirror/exechealthz-amd64@sha256:34722333f0cd0b891b61c9e0efa31913f22157e341a3aabb79967305d4e78260", + "lastState": {}, + "name": "healthz", + "ready": true, + "restartCount": 0, + "state": { + "running": { + "startedAt": "2019-07-09T02:38:25Z" + } + } + }, + { + "containerID": "docker://bf6c7e823d08306e6ba13353ae89319080990a5d302b1d7370e76acd34c34a52", + "image": "aksrepos.azurecr.io/mirror/k8s-dns-kube-dns-amd64:1.14.13", + "imageID": "docker-pullable://aksrepos.azurecr.io/mirror/k8s-dns-kube-dns-amd64@sha256:618a82fa66cf0c75e4753369a6999032372be7308866fc9afb381789b1e5ad52", + "lastState": {}, + "name": "kubedns", + "ready": true, + "restartCount": 0, + "state": { + "running": { + "startedAt": "2019-07-09T02:38:19Z" + } + } + }, + { + "containerID": "docker://2e4faf4da65a23316dc7065e3de27bf1ebd9ac2a8f07b9053de5ab63ab4c2d7e", + "image": "aksrepos.azurecr.io/mirror/k8s-dns-sidecar-amd64:1.14.10", + "imageID": "docker-pullable://aksrepos.azurecr.io/mirror/k8s-dns-sidecar-amd64@sha256:4f1ab957f87b94a5ec1edc26fae50da2175461f00afecf68940c4aa079bd08a4", + "lastState": {}, + "name": "sidecar", + "ready": true, + "restartCount": 0, + "state": { + "running": { + "startedAt": "2019-07-09T02:38:28Z" + } + } + } + ], + "hostIP": "10.240.0.5", + "phase": "Running", + "podIP": "10.244.0.192", + "qosClass": "Burstable", + "startTime": "2019-07-09T02:38:09Z" + } + }, + { + "apiVersion": "v1", + "kind": "Pod", + "metadata": { + "annotations": { + "prometheus.io/port": "10055", + "prometheus.io/scrape": "true" + }, + "creationTimestamp": "2019-07-09T02:38:06Z", + "generateName": "kube-dns-v20-55cb5d96f7-", + "labels": { + "k8s-app": "kube-dns", + "kubernetes.io/cluster-service": "true", + "pod-template-hash": "1176185293", + "version": "v20" + }, + "name": "kube-dns-v20-55cb5d96f7-pl7sh", + "namespace": "kube-system", + "ownerReferences": [ + { + "apiVersion": "apps/v1", + 
"blockOwnerDeletion": true, + "controller": true, + "kind": "ReplicaSet", + "name": "kube-dns-v20-55cb5d96f7", + "uid": "71892fd6-a1f2-11e9-9bc6-127bb0ec03b8" + } + ], + "resourceVersion": "15144050", + "selfLink": "/api/v1/namespaces/kube-system/pods/kube-dns-v20-55cb5d96f7-pl7sh", + "uid": "95046bc6-a1f2-11e9-8b08-d602e29755d5" + }, + "spec": { + "affinity": { + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "kubernetes.azure.com/cluster", + "operator": "Exists" + } + ] + } + ] + } + }, + "podAntiAffinity": { + "preferredDuringSchedulingIgnoredDuringExecution": [ + { + "podAffinityTerm": { + "labelSelector": { + "matchExpressions": [ + { + "key": "k8s-app", + "operator": "In", + "values": [ + "kube-dns" + ] + } + ] + }, + "topologyKey": "kubernetes.io/hostname" + }, + "weight": 100 + } + ] + } + }, + "containers": [ + { + "args": [ + "--kubecfg-file=/config/kubeconfig", + "--config-dir=/kube-dns-config", + "--domain=cluster.local.", + "--dns-port=10053", + "--v=2" + ], + "env": [ + { + "name": "PROMETHEUS_PORT", + "value": "10055" + }, + { + "name": "KUBERNETES_PORT_443_TCP_ADDR", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + }, + { + "name": "KUBERNETES_PORT", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_PORT_443_TCP", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_SERVICE_HOST", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + } + ], + "image": "aksrepos.azurecr.io/mirror/k8s-dns-kube-dns-amd64:1.14.13", + "imagePullPolicy": "IfNotPresent", + "livenessProbe": { + "failureThreshold": 5, + "httpGet": { + "path": "/healthcheck/kubedns", + "port": 10054, + "scheme": "HTTP" + }, + "initialDelaySeconds": 60, + "periodSeconds": 10, + "successThreshold": 1, + 
"timeoutSeconds": 5 + }, + "name": "kubedns", + "ports": [ + { + "containerPort": 10053, + "name": "dns-local", + "protocol": "UDP" + }, + { + "containerPort": 10053, + "name": "dns-tcp-local", + "protocol": "TCP" + }, + { + "containerPort": 10055, + "name": "metrics", + "protocol": "TCP" + } + ], + "readinessProbe": { + "failureThreshold": 3, + "httpGet": { + "path": "/readiness", + "port": 8081, + "scheme": "HTTP" + }, + "initialDelaySeconds": 30, + "periodSeconds": 10, + "successThreshold": 1, + "timeoutSeconds": 5 + }, + "resources": { + "limits": { + "memory": "170Mi" + }, + "requests": { + "cpu": "100m", + "memory": "70Mi" + } + }, + "terminationMessagePath": "/dev/termination-log", + "terminationMessagePolicy": "File", + "volumeMounts": [ + { + "mountPath": "/kube-dns-config", + "name": "kube-dns-config" + }, + { + "mountPath": "/config", + "name": "kubedns-kubecfg", + "readOnly": true + }, + { + "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount", + "name": "kube-dns-token-ghgtl", + "readOnly": true + } + ] + }, + { + "args": [ + "-v=2", + "-logtostderr", + "-configDir=/kube-dns-config", + "-restartDnsmasq=true", + "--", + "-k", + "--cache-size=1000", + "--no-negcache", + "--no-resolv", + "--server=127.0.0.1#10053", + "--server=/cluster.local/127.0.0.1#10053", + "--server=/in-addr.arpa/127.0.0.1#10053", + "--server=/ip6.arpa/127.0.0.1#10053", + "--log-facility=-" + ], + "env": [ + { + "name": "KUBERNETES_PORT_443_TCP_ADDR", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + }, + { + "name": "KUBERNETES_PORT", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_PORT_443_TCP", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_SERVICE_HOST", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + } + ], + "image": 
"aksrepos.azurecr.io/mirror/k8s-dns-dnsmasq-nanny-amd64:1.14.10", + "imagePullPolicy": "IfNotPresent", + "name": "dnsmasq", + "ports": [ + { + "containerPort": 53, + "name": "dns", + "protocol": "UDP" + }, + { + "containerPort": 53, + "name": "dns-tcp", + "protocol": "TCP" + } + ], + "resources": {}, + "terminationMessagePath": "/dev/termination-log", + "terminationMessagePolicy": "File", + "volumeMounts": [ + { + "mountPath": "/kube-dns-config", + "name": "kube-dns-config" + }, + { + "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount", + "name": "kube-dns-token-ghgtl", + "readOnly": true + } + ] + }, + { + "args": [ + "--cmd=for d in $PROBE_DOMAINS; do nslookup $d 127.0.0.1 \u003e/dev/null || exit 1; done", + "--url=/healthz-dnsmasq", + "--cmd=for d in $PROBE_DOMAINS; do nslookup $d 127.0.0.1:10053 \u003e/dev/null || exit 1; done", + "--url=/healthz-kubedns", + "--port=8080", + "--quiet" + ], + "env": [ + { + "name": "PROBE_DOMAINS", + "value": "bing.com kubernetes.default.svc.cluster.local" + }, + { + "name": "KUBERNETES_PORT_443_TCP_ADDR", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + }, + { + "name": "KUBERNETES_PORT", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_PORT_443_TCP", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_SERVICE_HOST", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + } + ], + "image": "aksrepos.azurecr.io/mirror/exechealthz-amd64:1.2", + "imagePullPolicy": "IfNotPresent", + "livenessProbe": { + "failureThreshold": 5, + "httpGet": { + "path": "/healthz-dnsmasq", + "port": 8080, + "scheme": "HTTP" + }, + "initialDelaySeconds": 60, + "periodSeconds": 10, + "successThreshold": 1, + "timeoutSeconds": 5 + }, + "name": "healthz", + "ports": [ + { + "containerPort": 8080, + "protocol": "TCP" + } + ], + "resources": { + "limits": { + 
"memory": "50Mi" + }, + "requests": { + "cpu": "10m", + "memory": "50Mi" + } + }, + "terminationMessagePath": "/dev/termination-log", + "terminationMessagePolicy": "File", + "volumeMounts": [ + { + "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount", + "name": "kube-dns-token-ghgtl", + "readOnly": true + } + ] + }, + { + "args": [ + "--v=2", + "--logtostderr", + "--probe=kubedns,127.0.0.1:10053,kubernetes.default.svc.cluster.local,5,SRV", + "--probe=dnsmasq,127.0.0.1:53,kubernetes.default.svc.cluster.local,5,SRV" + ], + "env": [ + { + "name": "KUBERNETES_PORT_443_TCP_ADDR", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + }, + { + "name": "KUBERNETES_PORT", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_PORT_443_TCP", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_SERVICE_HOST", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + } + ], + "image": "aksrepos.azurecr.io/mirror/k8s-dns-sidecar-amd64:1.14.10", + "imagePullPolicy": "IfNotPresent", + "livenessProbe": { + "failureThreshold": 3, + "httpGet": { + "path": "/metrics", + "port": 10054, + "scheme": "HTTP" + }, + "initialDelaySeconds": 60, + "periodSeconds": 10, + "successThreshold": 1, + "timeoutSeconds": 5 + }, + "name": "sidecar", + "ports": [ + { + "containerPort": 10054, + "name": "metrics", + "protocol": "TCP" + } + ], + "resources": { + "requests": { + "cpu": "10m", + "memory": "20Mi" + } + }, + "terminationMessagePath": "/dev/termination-log", + "terminationMessagePolicy": "File", + "volumeMounts": [ + { + "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount", + "name": "kube-dns-token-ghgtl", + "readOnly": true + } + ] + } + ], + "dnsPolicy": "Default", + "imagePullSecrets": [ + { + "name": "emptyacrsecret" + } + ], + "nodeName": "aks-nodepool1-19574989-2", + "nodeSelector": { + 
"beta.kubernetes.io/os": "linux" + }, + "priority": 2000001000, + "priorityClassName": "system-node-critical", + "restartPolicy": "Always", + "schedulerName": "default-scheduler", + "securityContext": {}, + "serviceAccount": "kube-dns", + "serviceAccountName": "kube-dns", + "terminationGracePeriodSeconds": 30, + "tolerations": [ + { + "key": "CriticalAddonsOnly", + "operator": "Exists" + }, + { + "effect": "NoExecute", + "key": "node.kubernetes.io/not-ready", + "operator": "Exists", + "tolerationSeconds": 300 + }, + { + "effect": "NoExecute", + "key": "node.kubernetes.io/unreachable", + "operator": "Exists", + "tolerationSeconds": 300 + } + ], + "volumes": [ + { + "configMap": { + "defaultMode": 420, + "name": "kube-dns", + "optional": true + }, + "name": "kube-dns-config" + }, + { + "configMap": { + "defaultMode": 420, + "name": "kubedns-kubecfg" + }, + "name": "kubedns-kubecfg" + }, + { + "name": "kube-dns-token-ghgtl", + "secret": { + "defaultMode": 420, + "secretName": "kube-dns-token-ghgtl" + } + } + ] + }, + "status": { + "conditions": [ + { + "lastProbeTime": null, + "lastTransitionTime": "2019-07-09T02:38:10Z", + "status": "True", + "type": "Initialized" + }, + { + "lastProbeTime": null, + "lastTransitionTime": "2019-07-09T02:39:14Z", + "status": "True", + "type": "Ready" + }, + { + "lastProbeTime": null, + "lastTransitionTime": null, + "status": "True", + "type": "ContainersReady" + }, + { + "lastProbeTime": null, + "lastTransitionTime": "2019-07-09T02:38:06Z", + "status": "True", + "type": "PodScheduled" + } + ], + "containerStatuses": [ + { + "containerID": "docker://c16dce3b5c1f06c6fbfdf52edb98f9916740c0f652dc72b2fe0f9f0cc5c4c4de", + "image": "aksrepos.azurecr.io/mirror/k8s-dns-dnsmasq-nanny-amd64:1.14.10", + "imageID": "docker-pullable://aksrepos.azurecr.io/mirror/k8s-dns-dnsmasq-nanny-amd64@sha256:bbb2a290a568125b3b996028958eb773f33b5b87a6b37bf38a28f8b62dddb3c8", + "lastState": {}, + "name": "dnsmasq", + "ready": true, + "restartCount": 0, + "state": 
{ + "running": { + "startedAt": "2019-07-09T02:38:51Z" + } + } + }, + { + "containerID": "docker://410ceb88fcbc2c3cdf19ffc5ce88adb0ba933bbc3cf446a90e669a978a7d933c", + "image": "aksrepos.azurecr.io/mirror/exechealthz-amd64:1.2", + "imageID": "docker-pullable://aksrepos.azurecr.io/mirror/exechealthz-amd64@sha256:34722333f0cd0b891b61c9e0efa31913f22157e341a3aabb79967305d4e78260", + "lastState": {}, + "name": "healthz", + "ready": true, + "restartCount": 0, + "state": { + "running": { + "startedAt": "2019-07-09T02:38:58Z" + } + } + }, + { + "containerID": "docker://694f575606b51234a98b3e22d2afd04f3fa11c30b6090a901e64922eeb9fba95", + "image": "aksrepos.azurecr.io/mirror/k8s-dns-kube-dns-amd64:1.14.13", + "imageID": "docker-pullable://aksrepos.azurecr.io/mirror/k8s-dns-kube-dns-amd64@sha256:618a82fa66cf0c75e4753369a6999032372be7308866fc9afb381789b1e5ad52", + "lastState": {}, + "name": "kubedns", + "ready": true, + "restartCount": 0, + "state": { + "running": { + "startedAt": "2019-07-09T02:38:39Z" + } + } + }, + { + "containerID": "docker://d7865fb7465b2f9cd218cdf6694018aee55260966f2bf51e6b628a86c6b9041f", + "image": "aksrepos.azurecr.io/mirror/k8s-dns-sidecar-amd64:1.14.10", + "imageID": "docker-pullable://aksrepos.azurecr.io/mirror/k8s-dns-sidecar-amd64@sha256:4f1ab957f87b94a5ec1edc26fae50da2175461f00afecf68940c4aa079bd08a4", + "lastState": {}, + "name": "sidecar", + "ready": true, + "restartCount": 0, + "state": { + "running": { + "startedAt": "2019-07-09T02:39:04Z" + } + } + } + ], + "hostIP": "10.240.0.7", + "phase": "Running", + "podIP": "10.244.12.117", + "qosClass": "Burstable", + "startTime": "2019-07-09T02:38:10Z" + } + }, + { + "apiVersion": "v1", + "kind": "Pod", + "metadata": { + "annotations": { + "aks.microsoft.com/release-time": "seconds:1566580134 nanos:758740921 ", + "remediator.aks.microsoft.com/kube-proxy-restart": "24" + }, + "creationTimestamp": "2019-08-23T17:13:13Z", + "generateName": "kube-proxy-", + "labels": { + "component": "kube-proxy", + 
"controller-revision-hash": "3559350992", + "pod-template-generation": "141", + "tier": "node" + }, + "name": "kube-proxy-ct2tl", + "namespace": "kube-system", + "ownerReferences": [ + { + "apiVersion": "apps/v1", + "blockOwnerDeletion": true, + "controller": true, + "kind": "DaemonSet", + "name": "kube-proxy", + "uid": "45640bf6-44e5-11e9-9920-423525a6b683" + } + ], + "resourceVersion": "19049034", + "selfLink": "/api/v1/namespaces/kube-system/pods/kube-proxy-ct2tl", + "uid": "49e373c8-c5c9-11e9-8736-86290fd7dd1f" + }, + "spec": { + "affinity": { + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "kubernetes.azure.com/cluster", + "operator": "Exists" + } + ] + } + ] + } + } + }, + "containers": [ + { + "command": [ + "/hyperkube", + "proxy", + "--kubeconfig=/var/lib/kubelet/kubeconfig", + "--cluster-cidr=10.244.0.0/16", + "--feature-gates=ExperimentalCriticalPodAnnotation=true", + "--v=3" + ], + "env": [ + { + "name": "KUBERNETES_PORT_443_TCP_ADDR", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + }, + { + "name": "KUBERNETES_PORT", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_PORT_443_TCP", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_SERVICE_HOST", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + } + ], + "image": "aksrepos.azurecr.io/mirror/hyperkube-amd64:v1.11.8", + "imagePullPolicy": "IfNotPresent", + "name": "kube-proxy", + "resources": { + "requests": { + "cpu": "100m" + } + }, + "securityContext": { + "privileged": true + }, + "terminationMessagePath": "/dev/termination-log", + "terminationMessagePolicy": "File", + "volumeMounts": [ + { + "mountPath": "/var/lib/kubelet", + "name": "kubeconfig", + "readOnly": true + }, + { + "mountPath": "/etc/kubernetes/certs", 
+ "name": "certificates", + "readOnly": true + }, + { + "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount", + "name": "kube-proxy-token-f5vbg", + "readOnly": true + } + ] + } + ], + "dnsPolicy": "ClusterFirst", + "hostNetwork": true, + "imagePullSecrets": [ + { + "name": "emptyacrsecret" + } + ], + "nodeName": "aks-nodepool1-19574989-0", + "nodeSelector": { + "beta.kubernetes.io/os": "linux" + }, + "priority": 2000001000, + "priorityClassName": "system-node-critical", + "restartPolicy": "Always", + "schedulerName": "default-scheduler", + "securityContext": {}, + "serviceAccount": "kube-proxy", + "serviceAccountName": "kube-proxy", + "terminationGracePeriodSeconds": 30, + "tolerations": [ + { + "key": "CriticalAddonsOnly", + "operator": "Exists" + }, + { + "effect": "NoExecute", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "operator": "Exists" + }, + { + "effect": "NoExecute", + "key": "node.kubernetes.io/not-ready", + "operator": "Exists" + }, + { + "effect": "NoExecute", + "key": "node.kubernetes.io/unreachable", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "node.kubernetes.io/disk-pressure", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "node.kubernetes.io/memory-pressure", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "node.kubernetes.io/unschedulable", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "node.kubernetes.io/network-unavailable", + "operator": "Exists" + } + ], + "volumes": [ + { + "hostPath": { + "path": "/var/lib/kubelet", + "type": "" + }, + "name": "kubeconfig" + }, + { + "hostPath": { + "path": "/etc/kubernetes/certs", + "type": "" + }, + "name": "certificates" + }, + { + "name": "kube-proxy-token-f5vbg", + "secret": { + "defaultMode": 420, + "secretName": "kube-proxy-token-f5vbg" + } + } + ] + }, + "status": { + "conditions": [ + { + "lastProbeTime": null, + "lastTransitionTime": "2019-08-23T17:13:13Z", + "status": "True", + "type": 
"Initialized" + }, + { + "lastProbeTime": null, + "lastTransitionTime": "2019-08-23T17:13:23Z", + "status": "True", + "type": "Ready" + }, + { + "lastProbeTime": null, + "lastTransitionTime": null, + "status": "True", + "type": "ContainersReady" + }, + { + "lastProbeTime": null, + "lastTransitionTime": "2019-08-23T17:13:13Z", + "status": "True", + "type": "PodScheduled" + } + ], + "containerStatuses": [ + { + "containerID": "docker://ef115b31792ece39d1526075f9f3763f8cbf526814624795a05786d83367427e", + "image": "aksrepos.azurecr.io/mirror/hyperkube-amd64:v1.11.8", + "imageID": "docker-pullable://aksrepos.azurecr.io/mirror/hyperkube-amd64@sha256:1447d5b491fcee503c9f8fb712e1593dc3772c7e661251f54c297477cc716913", + "lastState": {}, + "name": "kube-proxy", + "ready": true, + "restartCount": 0, + "state": { + "running": { + "startedAt": "2019-08-23T17:13:22Z" + } + } + } + ], + "hostIP": "10.240.0.4", + "phase": "Running", + "podIP": "10.240.0.4", + "qosClass": "Burstable", + "startTime": "2019-08-23T17:13:13Z" + } + }, + { + "apiVersion": "v1", + "kind": "Pod", + "metadata": { + "annotations": { + "aks.microsoft.com/release-time": "seconds:1566580134 nanos:758740921 ", + "remediator.aks.microsoft.com/kube-proxy-restart": "24" + }, + "creationTimestamp": "2019-08-23T17:10:52Z", + "generateName": "kube-proxy-", + "labels": { + "component": "kube-proxy", + "controller-revision-hash": "3559350992", + "pod-template-generation": "141", + "tier": "node" + }, + "name": "kube-proxy-d59xd", + "namespace": "kube-system", + "ownerReferences": [ + { + "apiVersion": "apps/v1", + "blockOwnerDeletion": true, + "controller": true, + "kind": "DaemonSet", + "name": "kube-proxy", + "uid": "45640bf6-44e5-11e9-9920-423525a6b683" + } + ], + "resourceVersion": "19048698", + "selfLink": "/api/v1/namespaces/kube-system/pods/kube-proxy-d59xd", + "uid": "f65e6a62-c5c8-11e9-8736-86290fd7dd1f" + }, + "spec": { + "affinity": { + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { 
+ "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "kubernetes.azure.com/cluster", + "operator": "Exists" + } + ] + } + ] + } + } + }, + "containers": [ + { + "command": [ + "/hyperkube", + "proxy", + "--kubeconfig=/var/lib/kubelet/kubeconfig", + "--cluster-cidr=10.244.0.0/16", + "--feature-gates=ExperimentalCriticalPodAnnotation=true", + "--v=3" + ], + "env": [ + { + "name": "KUBERNETES_PORT_443_TCP_ADDR", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + }, + { + "name": "KUBERNETES_PORT", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_PORT_443_TCP", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_SERVICE_HOST", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + } + ], + "image": "aksrepos.azurecr.io/mirror/hyperkube-amd64:v1.11.8", + "imagePullPolicy": "IfNotPresent", + "name": "kube-proxy", + "resources": { + "requests": { + "cpu": "100m" + } + }, + "securityContext": { + "privileged": true + }, + "terminationMessagePath": "/dev/termination-log", + "terminationMessagePolicy": "File", + "volumeMounts": [ + { + "mountPath": "/var/lib/kubelet", + "name": "kubeconfig", + "readOnly": true + }, + { + "mountPath": "/etc/kubernetes/certs", + "name": "certificates", + "readOnly": true + }, + { + "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount", + "name": "kube-proxy-token-f5vbg", + "readOnly": true + } + ] + } + ], + "dnsPolicy": "ClusterFirst", + "hostNetwork": true, + "imagePullSecrets": [ + { + "name": "emptyacrsecret" + } + ], + "nodeName": "aks-nodepool1-19574989-1", + "nodeSelector": { + "beta.kubernetes.io/os": "linux" + }, + "priority": 2000001000, + "priorityClassName": "system-node-critical", + "restartPolicy": "Always", + "schedulerName": "default-scheduler", + "securityContext": {}, + "serviceAccount": "kube-proxy", + 
"serviceAccountName": "kube-proxy", + "terminationGracePeriodSeconds": 30, + "tolerations": [ + { + "key": "CriticalAddonsOnly", + "operator": "Exists" + }, + { + "effect": "NoExecute", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "operator": "Exists" + }, + { + "effect": "NoExecute", + "key": "node.kubernetes.io/not-ready", + "operator": "Exists" + }, + { + "effect": "NoExecute", + "key": "node.kubernetes.io/unreachable", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "node.kubernetes.io/disk-pressure", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "node.kubernetes.io/memory-pressure", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "node.kubernetes.io/unschedulable", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "node.kubernetes.io/network-unavailable", + "operator": "Exists" + } + ], + "volumes": [ + { + "hostPath": { + "path": "/var/lib/kubelet", + "type": "" + }, + "name": "kubeconfig" + }, + { + "hostPath": { + "path": "/etc/kubernetes/certs", + "type": "" + }, + "name": "certificates" + }, + { + "name": "kube-proxy-token-f5vbg", + "secret": { + "defaultMode": 420, + "secretName": "kube-proxy-token-f5vbg" + } + } + ] + }, + "status": { + "conditions": [ + { + "lastProbeTime": null, + "lastTransitionTime": "2019-08-23T17:10:52Z", + "status": "True", + "type": "Initialized" + }, + { + "lastProbeTime": null, + "lastTransitionTime": "2019-08-23T17:11:05Z", + "status": "True", + "type": "Ready" + }, + { + "lastProbeTime": null, + "lastTransitionTime": null, + "status": "True", + "type": "ContainersReady" + }, + { + "lastProbeTime": null, + "lastTransitionTime": "2019-08-23T17:10:52Z", + "status": "True", + "type": "PodScheduled" + } + ], + "containerStatuses": [ + { + "containerID": "docker://c4e9d0e372116b9cab048f7bb381e93b423dac2285da75f66664a473fcc043b3", + "image": "aksrepos.azurecr.io/mirror/hyperkube-amd64:v1.11.8", + "imageID": 
"docker-pullable://aksrepos.azurecr.io/mirror/hyperkube-amd64@sha256:1447d5b491fcee503c9f8fb712e1593dc3772c7e661251f54c297477cc716913", + "lastState": {}, + "name": "kube-proxy", + "ready": true, + "restartCount": 0, + "state": { + "running": { + "startedAt": "2019-08-23T17:11:04Z" + } + } + } + ], + "hostIP": "10.240.0.5", + "phase": "Running", + "podIP": "10.240.0.5", + "qosClass": "Burstable", + "startTime": "2019-08-23T17:10:52Z" + } + }, + { + "apiVersion": "v1", + "kind": "Pod", + "metadata": { + "annotations": { + "aks.microsoft.com/release-time": "seconds:1566580134 nanos:758740921 ", + "remediator.aks.microsoft.com/kube-proxy-restart": "24" + }, + "creationTimestamp": "2019-08-23T17:12:23Z", + "generateName": "kube-proxy-", + "labels": { + "component": "kube-proxy", + "controller-revision-hash": "3559350992", + "pod-template-generation": "141", + "tier": "node" + }, + "name": "kube-proxy-kpm8j", + "namespace": "kube-system", + "ownerReferences": [ + { + "apiVersion": "apps/v1", + "blockOwnerDeletion": true, + "controller": true, + "kind": "DaemonSet", + "name": "kube-proxy", + "uid": "45640bf6-44e5-11e9-9920-423525a6b683" + } + ], + "resourceVersion": "19048942", + "selfLink": "/api/v1/namespaces/kube-system/pods/kube-proxy-kpm8j", + "uid": "2c3de48d-c5c9-11e9-8736-86290fd7dd1f" + }, + "spec": { + "affinity": { + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "kubernetes.azure.com/cluster", + "operator": "Exists" + } + ] + } + ] + } + } + }, + "containers": [ + { + "command": [ + "/hyperkube", + "proxy", + "--kubeconfig=/var/lib/kubelet/kubeconfig", + "--cluster-cidr=10.244.0.0/16", + "--feature-gates=ExperimentalCriticalPodAnnotation=true", + "--v=3" + ], + "env": [ + { + "name": "KUBERNETES_PORT_443_TCP_ADDR", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + }, + { + "name": "KUBERNETES_PORT", + "value": 
"tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_PORT_443_TCP", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_SERVICE_HOST", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + } + ], + "image": "aksrepos.azurecr.io/mirror/hyperkube-amd64:v1.11.8", + "imagePullPolicy": "IfNotPresent", + "name": "kube-proxy", + "resources": { + "requests": { + "cpu": "100m" + } + }, + "securityContext": { + "privileged": true + }, + "terminationMessagePath": "/dev/termination-log", + "terminationMessagePolicy": "File", + "volumeMounts": [ + { + "mountPath": "/var/lib/kubelet", + "name": "kubeconfig", + "readOnly": true + }, + { + "mountPath": "/etc/kubernetes/certs", + "name": "certificates", + "readOnly": true + }, + { + "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount", + "name": "kube-proxy-token-f5vbg", + "readOnly": true + } + ] + } + ], + "dnsPolicy": "ClusterFirst", + "hostNetwork": true, + "imagePullSecrets": [ + { + "name": "emptyacrsecret" + } + ], + "nodeName": "aks-nodepool1-19574989-2", + "nodeSelector": { + "beta.kubernetes.io/os": "linux" + }, + "priority": 2000001000, + "priorityClassName": "system-node-critical", + "restartPolicy": "Always", + "schedulerName": "default-scheduler", + "securityContext": {}, + "serviceAccount": "kube-proxy", + "serviceAccountName": "kube-proxy", + "terminationGracePeriodSeconds": 30, + "tolerations": [ + { + "key": "CriticalAddonsOnly", + "operator": "Exists" + }, + { + "effect": "NoExecute", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "operator": "Exists" + }, + { + "effect": "NoExecute", + "key": "node.kubernetes.io/not-ready", + "operator": "Exists" + }, + { + "effect": "NoExecute", + "key": "node.kubernetes.io/unreachable", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "node.kubernetes.io/disk-pressure", + "operator": "Exists" 
+ }, + { + "effect": "NoSchedule", + "key": "node.kubernetes.io/memory-pressure", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "node.kubernetes.io/unschedulable", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "node.kubernetes.io/network-unavailable", + "operator": "Exists" + } + ], + "volumes": [ + { + "hostPath": { + "path": "/var/lib/kubelet", + "type": "" + }, + "name": "kubeconfig" + }, + { + "hostPath": { + "path": "/etc/kubernetes/certs", + "type": "" + }, + "name": "certificates" + }, + { + "name": "kube-proxy-token-f5vbg", + "secret": { + "defaultMode": 420, + "secretName": "kube-proxy-token-f5vbg" + } + } + ] + }, + "status": { + "conditions": [ + { + "lastProbeTime": null, + "lastTransitionTime": "2019-08-23T17:12:24Z", + "status": "True", + "type": "Initialized" + }, + { + "lastProbeTime": null, + "lastTransitionTime": "2019-08-23T17:12:34Z", + "status": "True", + "type": "Ready" + }, + { + "lastProbeTime": null, + "lastTransitionTime": null, + "status": "True", + "type": "ContainersReady" + }, + { + "lastProbeTime": null, + "lastTransitionTime": "2019-08-23T17:12:24Z", + "status": "True", + "type": "PodScheduled" + } + ], + "containerStatuses": [ + { + "containerID": "docker://51067a965113e6d285a676e0d1e212ffbb60046aab6c4702f5554617415b2031", + "image": "aksrepos.azurecr.io/mirror/hyperkube-amd64:v1.11.8", + "imageID": "docker-pullable://aksrepos.azurecr.io/mirror/hyperkube-amd64@sha256:1447d5b491fcee503c9f8fb712e1593dc3772c7e661251f54c297477cc716913", + "lastState": {}, + "name": "kube-proxy", + "ready": true, + "restartCount": 0, + "state": { + "running": { + "startedAt": "2019-08-23T17:12:33Z" + } + } + } + ], + "hostIP": "10.240.0.7", + "phase": "Running", + "podIP": "10.240.0.7", + "qosClass": "Burstable", + "startTime": "2019-08-23T17:12:24Z" + } + }, + { + "apiVersion": "v1", + "kind": "Pod", + "metadata": { + "annotations": { + "aks.microsoft.com/release-time": "seconds:1566580134 nanos:758740921 ", + 
"remediator.aks.microsoft.com/kube-proxy-restart": "24" + }, + "creationTimestamp": "2019-08-23T17:11:38Z", + "generateName": "kube-proxy-", + "labels": { + "component": "kube-proxy", + "controller-revision-hash": "3559350992", + "pod-template-generation": "141", + "tier": "node" + }, + "name": "kube-proxy-skzg4", + "namespace": "kube-system", + "ownerReferences": [ + { + "apiVersion": "apps/v1", + "blockOwnerDeletion": true, + "controller": true, + "kind": "DaemonSet", + "name": "kube-proxy", + "uid": "45640bf6-44e5-11e9-9920-423525a6b683" + } + ], + "resourceVersion": "19048774", + "selfLink": "/api/v1/namespaces/kube-system/pods/kube-proxy-skzg4", + "uid": "114f7246-c5c9-11e9-8736-86290fd7dd1f" + }, + "spec": { + "affinity": { + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "kubernetes.azure.com/cluster", + "operator": "Exists" + } + ] + } + ] + } + } + }, + "containers": [ + { + "command": [ + "/hyperkube", + "proxy", + "--kubeconfig=/var/lib/kubelet/kubeconfig", + "--cluster-cidr=10.244.0.0/16", + "--feature-gates=ExperimentalCriticalPodAnnotation=true", + "--v=3" + ], + "env": [ + { + "name": "KUBERNETES_PORT_443_TCP_ADDR", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + }, + { + "name": "KUBERNETES_PORT", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_PORT_443_TCP", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_SERVICE_HOST", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + } + ], + "image": "aksrepos.azurecr.io/mirror/hyperkube-amd64:v1.11.8", + "imagePullPolicy": "IfNotPresent", + "name": "kube-proxy", + "resources": { + "requests": { + "cpu": "100m" + } + }, + "securityContext": { + "privileged": true + }, + "terminationMessagePath": "/dev/termination-log", + 
"terminationMessagePolicy": "File", + "volumeMounts": [ + { + "mountPath": "/var/lib/kubelet", + "name": "kubeconfig", + "readOnly": true + }, + { + "mountPath": "/etc/kubernetes/certs", + "name": "certificates", + "readOnly": true + }, + { + "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount", + "name": "kube-proxy-token-f5vbg", + "readOnly": true + } + ] + } + ], + "dnsPolicy": "ClusterFirst", + "hostNetwork": true, + "imagePullSecrets": [ + { + "name": "emptyacrsecret" + } + ], + "nodeName": "aks-nodepool1-19574989-3", + "nodeSelector": { + "beta.kubernetes.io/os": "linux" + }, + "priority": 2000001000, + "priorityClassName": "system-node-critical", + "restartPolicy": "Always", + "schedulerName": "default-scheduler", + "securityContext": {}, + "serviceAccount": "kube-proxy", + "serviceAccountName": "kube-proxy", + "terminationGracePeriodSeconds": 30, + "tolerations": [ + { + "key": "CriticalAddonsOnly", + "operator": "Exists" + }, + { + "effect": "NoExecute", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "operator": "Exists" + }, + { + "effect": "NoExecute", + "key": "node.kubernetes.io/not-ready", + "operator": "Exists" + }, + { + "effect": "NoExecute", + "key": "node.kubernetes.io/unreachable", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "node.kubernetes.io/disk-pressure", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "node.kubernetes.io/memory-pressure", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "node.kubernetes.io/unschedulable", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "node.kubernetes.io/network-unavailable", + "operator": "Exists" + } + ], + "volumes": [ + { + "hostPath": { + "path": "/var/lib/kubelet", + "type": "" + }, + "name": "kubeconfig" + }, + { + "hostPath": { + "path": "/etc/kubernetes/certs", + "type": "" + }, + "name": "certificates" + }, + { + "name": "kube-proxy-token-f5vbg", + "secret": { + "defaultMode": 420, + "secretName": 
"kube-proxy-token-f5vbg" + } + } + ] + }, + "status": { + "conditions": [ + { + "lastProbeTime": null, + "lastTransitionTime": "2019-08-23T17:11:38Z", + "status": "True", + "type": "Initialized" + }, + { + "lastProbeTime": null, + "lastTransitionTime": "2019-08-23T17:11:42Z", + "status": "True", + "type": "Ready" + }, + { + "lastProbeTime": null, + "lastTransitionTime": null, + "status": "True", + "type": "ContainersReady" + }, + { + "lastProbeTime": null, + "lastTransitionTime": "2019-08-23T17:11:38Z", + "status": "True", + "type": "PodScheduled" + } + ], + "containerStatuses": [ + { + "containerID": "docker://a3172e9191547b0ea3eb7db629cd4bba2240f5c9d0186ea37be49d9877034541", + "image": "aksrepos.azurecr.io/mirror/hyperkube-amd64:v1.11.8", + "imageID": "docker-pullable://aksrepos.azurecr.io/mirror/hyperkube-amd64@sha256:1447d5b491fcee503c9f8fb712e1593dc3772c7e661251f54c297477cc716913", + "lastState": {}, + "name": "kube-proxy", + "ready": true, + "restartCount": 0, + "state": { + "running": { + "startedAt": "2019-08-23T17:11:41Z" + } + } + } + ], + "hostIP": "10.240.0.6", + "phase": "Running", + "podIP": "10.240.0.6", + "qosClass": "Burstable", + "startTime": "2019-08-23T17:11:38Z" + } + }, + { + "apiVersion": "v1", + "kind": "Pod", + "metadata": { + "creationTimestamp": "2019-07-15T20:54:26Z", + "generateName": "kube-svc-redirect-", + "labels": { + "component": "kube-svc-redirect", + "controller-revision-hash": "1216437240", + "pod-template-generation": "9", + "tier": "node" + }, + "name": "kube-svc-redirect-czm8d", + "namespace": "kube-system", + "ownerReferences": [ + { + "apiVersion": "apps/v1", + "blockOwnerDeletion": true, + "controller": true, + "kind": "DaemonSet", + "name": "kube-svc-redirect", + "uid": "45a5fc62-44e5-11e9-9920-423525a6b683" + } + ], + "resourceVersion": "15831523", + "selfLink": "/api/v1/namespaces/kube-system/pods/kube-svc-redirect-czm8d", + "uid": "bb3d3ef2-a742-11e9-a38a-22d1c75c4357" + }, + "spec": { + "affinity": { + "nodeAffinity": 
{ + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "kubernetes.azure.com/cluster", + "operator": "Exists" + } + ] + } + ] + } + } + }, + "containers": [ + { + "env": [ + { + "name": "KUBERNETES_SVC_IP", + "value": "10.0.0.1" + }, + { + "name": "KUBE_SVC_REDIRECTOR_PROXY_IP", + "value": "127.0.0.1:14612" + }, + { + "name": "KUBERNETES_PORT_443_TCP_ADDR", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + }, + { + "name": "KUBERNETES_PORT", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_PORT_443_TCP", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_SERVICE_HOST", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + } + ], + "image": "aksrepos.azurecr.io/prod/kube-svc-redirect:v1.0.2", + "imagePullPolicy": "IfNotPresent", + "name": "redirector", + "resources": { + "requests": { + "cpu": "5m", + "memory": "2Mi" + } + }, + "securityContext": { + "capabilities": { + "add": [ + "NET_ADMIN" + ] + } + }, + "terminationMessagePath": "/dev/termination-log", + "terminationMessagePolicy": "File", + "volumeMounts": [ + { + "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount", + "name": "kube-svc-redirector-token-ngjg2", + "readOnly": true + } + ] + }, + { + "env": [ + { + "name": "KUBERNETES_PORT_443_TCP_ADDR", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + }, + { + "name": "KUBERNETES_PORT", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_PORT_443_TCP", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_SERVICE_HOST", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + } + ], + "image": 
"aksrepos.azurecr.io/mirror/nginx:1.13.12-alpine", + "imagePullPolicy": "IfNotPresent", + "name": "azureproxy", + "ports": [ + { + "containerPort": 14612, + "hostPort": 14612, + "protocol": "TCP" + } + ], + "resources": { + "requests": { + "cpu": "5m", + "memory": "32Mi" + } + }, + "terminationMessagePath": "/dev/termination-log", + "terminationMessagePolicy": "File", + "volumeMounts": [ + { + "mountPath": "/etc/nginx/nginx.conf", + "name": "azureproxy-nginx", + "readOnly": true, + "subPath": "nginx.conf" + }, + { + "mountPath": "/etc/nginx/conf.d", + "name": "azureproxy-configs", + "readOnly": true + }, + { + "mountPath": "/etc/nginx/certs", + "name": "azureproxy-certs", + "readOnly": true + }, + { + "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount", + "name": "kube-svc-redirector-token-ngjg2", + "readOnly": true + } + ] + } + ], + "dnsPolicy": "ClusterFirst", + "hostNetwork": true, + "imagePullSecrets": [ + { + "name": "emptyacrsecret" + } + ], + "nodeName": "aks-nodepool1-19574989-0", + "nodeSelector": { + "beta.kubernetes.io/os": "linux" + }, + "priority": 2000001000, + "priorityClassName": "system-node-critical", + "restartPolicy": "Always", + "schedulerName": "default-scheduler", + "securityContext": {}, + "serviceAccount": "kube-svc-redirector", + "serviceAccountName": "kube-svc-redirector", + "terminationGracePeriodSeconds": 30, + "tolerations": [ + { + "key": "CriticalAddonsOnly", + "operator": "Exists" + }, + { + "effect": "NoExecute", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "operator": "Exists" + }, + { + "effect": "NoExecute", + "key": "node.kubernetes.io/not-ready", + "operator": "Exists" + }, + { + "effect": "NoExecute", + "key": "node.kubernetes.io/unreachable", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "node.kubernetes.io/disk-pressure", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "node.kubernetes.io/memory-pressure", + "operator": "Exists" + }, + { + "effect": 
"NoSchedule", + "key": "node.kubernetes.io/unschedulable", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "node.kubernetes.io/network-unavailable", + "operator": "Exists" + } + ], + "volumes": [ + { + "configMap": { + "defaultMode": 420, + "name": "azureproxy-nginx" + }, + "name": "azureproxy-nginx" + }, + { + "configMap": { + "defaultMode": 420, + "name": "azureproxy-config" + }, + "name": "azureproxy-configs" + }, + { + "name": "azureproxy-certs", + "secret": { + "defaultMode": 420, + "secretName": "azureproxy-certs" + } + }, + { + "name": "kube-svc-redirector-token-ngjg2", + "secret": { + "defaultMode": 420, + "secretName": "kube-svc-redirector-token-ngjg2" + } + } + ] + }, + "status": { + "conditions": [ + { + "lastProbeTime": null, + "lastTransitionTime": "2019-07-15T20:54:26Z", + "status": "True", + "type": "Initialized" + }, + { + "lastProbeTime": null, + "lastTransitionTime": "2019-07-15T20:55:03Z", + "status": "True", + "type": "Ready" + }, + { + "lastProbeTime": null, + "lastTransitionTime": null, + "status": "True", + "type": "ContainersReady" + }, + { + "lastProbeTime": null, + "lastTransitionTime": "2019-07-15T20:54:26Z", + "status": "True", + "type": "PodScheduled" + } + ], + "containerStatuses": [ + { + "containerID": "docker://942d4ddc66e488245fa77cf331a38de7df760d5d5d96b344f5bfbc84adbab861", + "image": "aksrepos.azurecr.io/mirror/nginx:1.13.12-alpine", + "imageID": "docker-pullable://aksrepos.azurecr.io/mirror/nginx@sha256:91d22184f3f9b1be658c2cc2c12d324de7ff12c8b9c9a597905457b4d93b069d", + "lastState": {}, + "name": "azureproxy", + "ready": true, + "restartCount": 0, + "state": { + "running": { + "startedAt": "2019-07-15T20:55:02Z" + } + } + }, + { + "containerID": "docker://71d6f73215c0994fa2f7b340732d5e4453a86ece31dcf5278fb2abc32e3e4de2", + "image": "aksrepos.azurecr.io/mirror/kube-svc-redirect:v1.0.2", + "imageID": 
"docker-pullable://aksrepos.azurecr.io/mirror/kube-svc-redirect@sha256:a448687b78d24dae388bd3d54591c179c891fa078404752bc9c9dfdaecdc02ef", + "lastState": {}, + "name": "redirector", + "ready": true, + "restartCount": 0, + "state": { + "running": { + "startedAt": "2019-07-15T20:54:36Z" + } + } + } + ], + "hostIP": "10.240.0.4", + "phase": "Running", + "podIP": "10.240.0.4", + "qosClass": "Burstable", + "startTime": "2019-07-15T20:54:26Z" + } + }, + { + "apiVersion": "v1", + "kind": "Pod", + "metadata": { + "creationTimestamp": "2019-08-07T18:57:56Z", + "generateName": "kube-svc-redirect-", + "labels": { + "component": "kube-svc-redirect", + "controller-revision-hash": "1216437240", + "pod-template-generation": "9", + "tier": "node" + }, + "name": "kube-svc-redirect-mqk98", + "namespace": "kube-system", + "ownerReferences": [ + { + "apiVersion": "apps/v1", + "blockOwnerDeletion": true, + "controller": true, + "kind": "DaemonSet", + "name": "kube-svc-redirect", + "uid": "45a5fc62-44e5-11e9-9920-423525a6b683" + } + ], + "resourceVersion": "16965477", + "selfLink": "/api/v1/namespaces/kube-system/pods/kube-svc-redirect-mqk98", + "uid": "44a61692-b945-11e9-a1b6-127094e7fd94" + }, + "spec": { + "affinity": { + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "kubernetes.azure.com/cluster", + "operator": "Exists" + } + ] + } + ] + } + } + }, + "containers": [ + { + "env": [ + { + "name": "KUBERNETES_SVC_IP", + "value": "10.0.0.1" + }, + { + "name": "KUBE_SVC_REDIRECTOR_PROXY_IP", + "value": "127.0.0.1:14612" + }, + { + "name": "KUBERNETES_PORT_443_TCP_ADDR", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + }, + { + "name": "KUBERNETES_PORT", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_PORT_443_TCP", + "value": 
"tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_SERVICE_HOST", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + } + ], + "image": "aksrepos.azurecr.io/prod/kube-svc-redirect:v1.0.2", + "imagePullPolicy": "IfNotPresent", + "name": "redirector", + "resources": { + "requests": { + "cpu": "5m", + "memory": "2Mi" + } + }, + "securityContext": { + "capabilities": { + "add": [ + "NET_ADMIN" + ] + } + }, + "terminationMessagePath": "/dev/termination-log", + "terminationMessagePolicy": "File", + "volumeMounts": [ + { + "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount", + "name": "kube-svc-redirector-token-ngjg2", + "readOnly": true + } + ] + }, + { + "env": [ + { + "name": "KUBERNETES_PORT_443_TCP_ADDR", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + }, + { + "name": "KUBERNETES_PORT", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_PORT_443_TCP", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_SERVICE_HOST", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + } + ], + "image": "aksrepos.azurecr.io/mirror/nginx:1.13.12-alpine", + "imagePullPolicy": "IfNotPresent", + "name": "azureproxy", + "ports": [ + { + "containerPort": 14612, + "hostPort": 14612, + "protocol": "TCP" + } + ], + "resources": { + "requests": { + "cpu": "5m", + "memory": "32Mi" + } + }, + "terminationMessagePath": "/dev/termination-log", + "terminationMessagePolicy": "File", + "volumeMounts": [ + { + "mountPath": "/etc/nginx/nginx.conf", + "name": "azureproxy-nginx", + "readOnly": true, + "subPath": "nginx.conf" + }, + { + "mountPath": "/etc/nginx/conf.d", + "name": "azureproxy-configs", + "readOnly": true + }, + { + "mountPath": "/etc/nginx/certs", + "name": "azureproxy-certs", + "readOnly": true + }, + { + 
"mountPath": "/var/run/secrets/kubernetes.io/serviceaccount", + "name": "kube-svc-redirector-token-ngjg2", + "readOnly": true + } + ] + } + ], + "dnsPolicy": "ClusterFirst", + "hostNetwork": true, + "imagePullSecrets": [ + { + "name": "emptyacrsecret" + } + ], + "nodeName": "aks-nodepool1-19574989-3", + "nodeSelector": { + "beta.kubernetes.io/os": "linux" + }, + "priority": 2000001000, + "priorityClassName": "system-node-critical", + "restartPolicy": "Always", + "schedulerName": "default-scheduler", + "securityContext": {}, + "serviceAccount": "kube-svc-redirector", + "serviceAccountName": "kube-svc-redirector", + "terminationGracePeriodSeconds": 30, + "tolerations": [ + { + "key": "CriticalAddonsOnly", + "operator": "Exists" + }, + { + "effect": "NoExecute", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "operator": "Exists" + }, + { + "effect": "NoExecute", + "key": "node.kubernetes.io/not-ready", + "operator": "Exists" + }, + { + "effect": "NoExecute", + "key": "node.kubernetes.io/unreachable", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "node.kubernetes.io/disk-pressure", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "node.kubernetes.io/memory-pressure", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "node.kubernetes.io/unschedulable", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "node.kubernetes.io/network-unavailable", + "operator": "Exists" + } + ], + "volumes": [ + { + "configMap": { + "defaultMode": 420, + "name": "azureproxy-nginx" + }, + "name": "azureproxy-nginx" + }, + { + "configMap": { + "defaultMode": 420, + "name": "azureproxy-config" + }, + "name": "azureproxy-configs" + }, + { + "name": "azureproxy-certs", + "secret": { + "defaultMode": 420, + "secretName": "azureproxy-certs" + } + }, + { + "name": "kube-svc-redirector-token-ngjg2", + "secret": { + "defaultMode": 420, + "secretName": "kube-svc-redirector-token-ngjg2" + } + } + ] + }, + "status": { + 
"conditions": [ + { + "lastProbeTime": null, + "lastTransitionTime": "2019-08-07T18:57:58Z", + "status": "True", + "type": "Initialized" + }, + { + "lastProbeTime": null, + "lastTransitionTime": "2019-08-07T18:58:09Z", + "status": "True", + "type": "Ready" + }, + { + "lastProbeTime": null, + "lastTransitionTime": null, + "status": "True", + "type": "ContainersReady" + }, + { + "lastProbeTime": null, + "lastTransitionTime": "2019-08-07T18:57:58Z", + "status": "True", + "type": "PodScheduled" + } + ], + "containerStatuses": [ + { + "containerID": "docker://5f47547dc8e4fceb8e2a6e01cee5612b49e2dc2d5682b6a58f648d8223b3a6b0", + "image": "aksrepos.azurecr.io/mirror/nginx:1.13.12-alpine", + "imageID": "docker-pullable://aksrepos.azurecr.io/mirror/nginx@sha256:91d22184f3f9b1be658c2cc2c12d324de7ff12c8b9c9a597905457b4d93b069d", + "lastState": {}, + "name": "azureproxy", + "ready": true, + "restartCount": 0, + "state": { + "running": { + "startedAt": "2019-08-07T18:58:09Z" + } + } + }, + { + "containerID": "docker://5da4e17288399f8e2d4998e5c06159d0d2d39690e89195c5381ab7e3c91aaf99", + "image": "aksrepos.azurecr.io/prod/kube-svc-redirect:v1.0.2", + "imageID": "docker-pullable://aksrepos.azurecr.io/prod/kube-svc-redirect@sha256:a448687b78d24dae388bd3d54591c179c891fa078404752bc9c9dfdaecdc02ef", + "lastState": {}, + "name": "redirector", + "ready": true, + "restartCount": 0, + "state": { + "running": { + "startedAt": "2019-08-07T18:58:08Z" + } + } + } + ], + "hostIP": "10.240.0.6", + "phase": "Running", + "podIP": "10.240.0.6", + "qosClass": "Burstable", + "startTime": "2019-08-07T18:57:58Z" + } + }, + { + "apiVersion": "v1", + "kind": "Pod", + "metadata": { + "creationTimestamp": "2019-07-15T20:55:38Z", + "generateName": "kube-svc-redirect-", + "labels": { + "component": "kube-svc-redirect", + "controller-revision-hash": "1216437240", + "pod-template-generation": "9", + "tier": "node" + }, + "name": "kube-svc-redirect-qf4tl", + "namespace": "kube-system", + "ownerReferences": [ + 
{ + "apiVersion": "apps/v1", + "blockOwnerDeletion": true, + "controller": true, + "kind": "DaemonSet", + "name": "kube-svc-redirect", + "uid": "45a5fc62-44e5-11e9-9920-423525a6b683" + } + ], + "resourceVersion": "15144014", + "selfLink": "/api/v1/namespaces/kube-system/pods/kube-svc-redirect-qf4tl", + "uid": "e690309f-a742-11e9-a38a-22d1c75c4357" + }, + "spec": { + "affinity": { + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "kubernetes.azure.com/cluster", + "operator": "Exists" + } + ] + } + ] + } + } + }, + "containers": [ + { + "env": [ + { + "name": "KUBERNETES_SVC_IP", + "value": "10.0.0.1" + }, + { + "name": "KUBE_SVC_REDIRECTOR_PROXY_IP", + "value": "127.0.0.1:14612" + }, + { + "name": "KUBERNETES_PORT_443_TCP_ADDR", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + }, + { + "name": "KUBERNETES_PORT", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_PORT_443_TCP", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_SERVICE_HOST", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + } + ], + "image": "aksrepos.azurecr.io/prod/kube-svc-redirect:v1.0.2", + "imagePullPolicy": "IfNotPresent", + "name": "redirector", + "resources": { + "requests": { + "cpu": "5m", + "memory": "2Mi" + } + }, + "securityContext": { + "capabilities": { + "add": [ + "NET_ADMIN" + ] + } + }, + "terminationMessagePath": "/dev/termination-log", + "terminationMessagePolicy": "File", + "volumeMounts": [ + { + "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount", + "name": "kube-svc-redirector-token-ngjg2", + "readOnly": true + } + ] + }, + { + "env": [ + { + "name": "KUBERNETES_PORT_443_TCP_ADDR", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + }, + { + "name": 
"KUBERNETES_PORT", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_PORT_443_TCP", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_SERVICE_HOST", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + } + ], + "image": "aksrepos.azurecr.io/mirror/nginx:1.13.12-alpine", + "imagePullPolicy": "IfNotPresent", + "name": "azureproxy", + "ports": [ + { + "containerPort": 14612, + "hostPort": 14612, + "protocol": "TCP" + } + ], + "resources": { + "requests": { + "cpu": "5m", + "memory": "32Mi" + } + }, + "terminationMessagePath": "/dev/termination-log", + "terminationMessagePolicy": "File", + "volumeMounts": [ + { + "mountPath": "/etc/nginx/nginx.conf", + "name": "azureproxy-nginx", + "readOnly": true, + "subPath": "nginx.conf" + }, + { + "mountPath": "/etc/nginx/conf.d", + "name": "azureproxy-configs", + "readOnly": true + }, + { + "mountPath": "/etc/nginx/certs", + "name": "azureproxy-certs", + "readOnly": true + }, + { + "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount", + "name": "kube-svc-redirector-token-ngjg2", + "readOnly": true + } + ] + } + ], + "dnsPolicy": "ClusterFirst", + "hostNetwork": true, + "imagePullSecrets": [ + { + "name": "emptyacrsecret" + } + ], + "nodeName": "aks-nodepool1-19574989-1", + "nodeSelector": { + "beta.kubernetes.io/os": "linux" + }, + "priority": 2000001000, + "priorityClassName": "system-node-critical", + "restartPolicy": "Always", + "schedulerName": "default-scheduler", + "securityContext": {}, + "serviceAccount": "kube-svc-redirector", + "serviceAccountName": "kube-svc-redirector", + "terminationGracePeriodSeconds": 30, + "tolerations": [ + { + "key": "CriticalAddonsOnly", + "operator": "Exists" + }, + { + "effect": "NoExecute", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "operator": "Exists" + }, + { + "effect": "NoExecute", + "key": 
"node.kubernetes.io/not-ready", + "operator": "Exists" + }, + { + "effect": "NoExecute", + "key": "node.kubernetes.io/unreachable", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "node.kubernetes.io/disk-pressure", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "node.kubernetes.io/memory-pressure", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "node.kubernetes.io/unschedulable", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "node.kubernetes.io/network-unavailable", + "operator": "Exists" + } + ], + "volumes": [ + { + "configMap": { + "defaultMode": 420, + "name": "azureproxy-nginx" + }, + "name": "azureproxy-nginx" + }, + { + "configMap": { + "defaultMode": 420, + "name": "azureproxy-config" + }, + "name": "azureproxy-configs" + }, + { + "name": "azureproxy-certs", + "secret": { + "defaultMode": 420, + "secretName": "azureproxy-certs" + } + }, + { + "name": "kube-svc-redirector-token-ngjg2", + "secret": { + "defaultMode": 420, + "secretName": "kube-svc-redirector-token-ngjg2" + } + } + ] + }, + "status": { + "conditions": [ + { + "lastProbeTime": null, + "lastTransitionTime": "2019-07-15T20:55:38Z", + "status": "True", + "type": "Initialized" + }, + { + "lastProbeTime": null, + "lastTransitionTime": "2019-07-15T20:55:47Z", + "status": "True", + "type": "Ready" + }, + { + "lastProbeTime": null, + "lastTransitionTime": null, + "status": "True", + "type": "ContainersReady" + }, + { + "lastProbeTime": null, + "lastTransitionTime": "2019-07-15T20:55:38Z", + "status": "True", + "type": "PodScheduled" + } + ], + "containerStatuses": [ + { + "containerID": "docker://a0fa774ceba9ae78cf75ffb96a0d8f3ca4d48e5d9d17218957b07e8b1e7e2862", + "image": "aksrepos.azurecr.io/mirror/nginx:1.13.12-alpine", + "imageID": "docker-pullable://aksrepos.azurecr.io/mirror/nginx@sha256:91d22184f3f9b1be658c2cc2c12d324de7ff12c8b9c9a597905457b4d93b069d", + "lastState": {}, + "name": "azureproxy", + "ready": true, + 
"restartCount": 0, + "state": { + "running": { + "startedAt": "2019-07-15T20:55:46Z" + } + } + }, + { + "containerID": "docker://7f281954c57ff6529aaeea2e79dc45a8abeabd4b360c2bbea5c0830ddac4f093", + "image": "aksrepos.azurecr.io/mirror/kube-svc-redirect:v1.0.2", + "imageID": "docker-pullable://aksrepos.azurecr.io/mirror/kube-svc-redirect@sha256:a448687b78d24dae388bd3d54591c179c891fa078404752bc9c9dfdaecdc02ef", + "lastState": {}, + "name": "redirector", + "ready": true, + "restartCount": 0, + "state": { + "running": { + "startedAt": "2019-07-15T20:55:44Z" + } + } + } + ], + "hostIP": "10.240.0.5", + "phase": "Running", + "podIP": "10.240.0.5", + "qosClass": "Burstable", + "startTime": "2019-07-15T20:55:38Z" + } + }, + { + "apiVersion": "v1", + "kind": "Pod", + "metadata": { + "creationTimestamp": "2019-07-15T20:56:33Z", + "generateName": "kube-svc-redirect-", + "labels": { + "component": "kube-svc-redirect", + "controller-revision-hash": "1216437240", + "pod-template-generation": "9", + "tier": "node" + }, + "name": "kube-svc-redirect-rtw2t", + "namespace": "kube-system", + "ownerReferences": [ + { + "apiVersion": "apps/v1", + "blockOwnerDeletion": true, + "controller": true, + "kind": "DaemonSet", + "name": "kube-svc-redirect", + "uid": "45a5fc62-44e5-11e9-9920-423525a6b683" + } + ], + "resourceVersion": "15144039", + "selfLink": "/api/v1/namespaces/kube-system/pods/kube-svc-redirect-rtw2t", + "uid": "06fef5f6-a743-11e9-a38a-22d1c75c4357" + }, + "spec": { + "affinity": { + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "kubernetes.azure.com/cluster", + "operator": "Exists" + } + ] + } + ] + } + } + }, + "containers": [ + { + "env": [ + { + "name": "KUBERNETES_SVC_IP", + "value": "10.0.0.1" + }, + { + "name": "KUBE_SVC_REDIRECTOR_PROXY_IP", + "value": "127.0.0.1:14612" + }, + { + "name": "KUBERNETES_PORT_443_TCP_ADDR", + "value": 
"dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + }, + { + "name": "KUBERNETES_PORT", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_PORT_443_TCP", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_SERVICE_HOST", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + } + ], + "image": "aksrepos.azurecr.io/prod/kube-svc-redirect:v1.0.2", + "imagePullPolicy": "IfNotPresent", + "name": "redirector", + "resources": { + "requests": { + "cpu": "5m", + "memory": "2Mi" + } + }, + "securityContext": { + "capabilities": { + "add": [ + "NET_ADMIN" + ] + } + }, + "terminationMessagePath": "/dev/termination-log", + "terminationMessagePolicy": "File", + "volumeMounts": [ + { + "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount", + "name": "kube-svc-redirector-token-ngjg2", + "readOnly": true + } + ] + }, + { + "env": [ + { + "name": "KUBERNETES_PORT_443_TCP_ADDR", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + }, + { + "name": "KUBERNETES_PORT", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_PORT_443_TCP", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_SERVICE_HOST", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + } + ], + "image": "aksrepos.azurecr.io/mirror/nginx:1.13.12-alpine", + "imagePullPolicy": "IfNotPresent", + "name": "azureproxy", + "ports": [ + { + "containerPort": 14612, + "hostPort": 14612, + "protocol": "TCP" + } + ], + "resources": { + "requests": { + "cpu": "5m", + "memory": "32Mi" + } + }, + "terminationMessagePath": "/dev/termination-log", + "terminationMessagePolicy": "File", + "volumeMounts": [ + { + "mountPath": "/etc/nginx/nginx.conf", + "name": 
"azureproxy-nginx", + "readOnly": true, + "subPath": "nginx.conf" + }, + { + "mountPath": "/etc/nginx/conf.d", + "name": "azureproxy-configs", + "readOnly": true + }, + { + "mountPath": "/etc/nginx/certs", + "name": "azureproxy-certs", + "readOnly": true + }, + { + "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount", + "name": "kube-svc-redirector-token-ngjg2", + "readOnly": true + } + ] + } + ], + "dnsPolicy": "ClusterFirst", + "hostNetwork": true, + "imagePullSecrets": [ + { + "name": "emptyacrsecret" + } + ], + "nodeName": "aks-nodepool1-19574989-2", + "nodeSelector": { + "beta.kubernetes.io/os": "linux" + }, + "priority": 2000001000, + "priorityClassName": "system-node-critical", + "restartPolicy": "Always", + "schedulerName": "default-scheduler", + "securityContext": {}, + "serviceAccount": "kube-svc-redirector", + "serviceAccountName": "kube-svc-redirector", + "terminationGracePeriodSeconds": 30, + "tolerations": [ + { + "key": "CriticalAddonsOnly", + "operator": "Exists" + }, + { + "effect": "NoExecute", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "operator": "Exists" + }, + { + "effect": "NoExecute", + "key": "node.kubernetes.io/not-ready", + "operator": "Exists" + }, + { + "effect": "NoExecute", + "key": "node.kubernetes.io/unreachable", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "node.kubernetes.io/disk-pressure", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "node.kubernetes.io/memory-pressure", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "node.kubernetes.io/unschedulable", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "node.kubernetes.io/network-unavailable", + "operator": "Exists" + } + ], + "volumes": [ + { + "configMap": { + "defaultMode": 420, + "name": "azureproxy-nginx" + }, + "name": "azureproxy-nginx" + }, + { + "configMap": { + "defaultMode": 420, + "name": "azureproxy-config" + }, + "name": "azureproxy-configs" + }, + { + "name": 
"azureproxy-certs", + "secret": { + "defaultMode": 420, + "secretName": "azureproxy-certs" + } + }, + { + "name": "kube-svc-redirector-token-ngjg2", + "secret": { + "defaultMode": 420, + "secretName": "kube-svc-redirector-token-ngjg2" + } + } + ] + }, + "status": { + "conditions": [ + { + "lastProbeTime": null, + "lastTransitionTime": "2019-07-15T20:56:33Z", + "status": "True", + "type": "Initialized" + }, + { + "lastProbeTime": null, + "lastTransitionTime": "2019-07-15T20:56:49Z", + "status": "True", + "type": "Ready" + }, + { + "lastProbeTime": null, + "lastTransitionTime": null, + "status": "True", + "type": "ContainersReady" + }, + { + "lastProbeTime": null, + "lastTransitionTime": "2019-07-15T20:56:33Z", + "status": "True", + "type": "PodScheduled" + } + ], + "containerStatuses": [ + { + "containerID": "docker://aaea93b1e6a0c55e9ac0c002ffa6fdfb99e98b2f1a38c474cc2b9b65e947b6d9", + "image": "aksrepos.azurecr.io/mirror/nginx:1.13.12-alpine", + "imageID": "docker-pullable://aksrepos.azurecr.io/mirror/nginx@sha256:91d22184f3f9b1be658c2cc2c12d324de7ff12c8b9c9a597905457b4d93b069d", + "lastState": {}, + "name": "azureproxy", + "ready": true, + "restartCount": 0, + "state": { + "running": { + "startedAt": "2019-07-15T20:56:48Z" + } + } + }, + { + "containerID": "docker://c03c8b9e99095205945e15bef5f60c0501c8a0a77186afc1fcc8eb0804274e78", + "image": "aksrepos.azurecr.io/mirror/kube-svc-redirect:v1.0.2", + "imageID": "docker-pullable://aksrepos.azurecr.io/mirror/kube-svc-redirect@sha256:a448687b78d24dae388bd3d54591c179c891fa078404752bc9c9dfdaecdc02ef", + "lastState": {}, + "name": "redirector", + "ready": true, + "restartCount": 0, + "state": { + "running": { + "startedAt": "2019-07-15T20:56:43Z" + } + } + } + ], + "hostIP": "10.240.0.7", + "phase": "Running", + "podIP": "10.240.0.7", + "qosClass": "Burstable", + "startTime": "2019-07-15T20:56:33Z" + } + }, + { + "apiVersion": "v1", + "kind": "Pod", + "metadata": { + "creationTimestamp": "2019-07-09T02:38:07Z", + 
"generateName": "kubernetes-dashboard-6dcdfcd68b-", + "labels": { + "k8s-app": "kubernetes-dashboard", + "kubernetes.io/cluster-service": "true", + "pod-template-hash": "2878978246" + }, + "name": "kubernetes-dashboard-6dcdfcd68b-nfqbf", + "namespace": "kube-system", + "ownerReferences": [ + { + "apiVersion": "apps/v1", + "blockOwnerDeletion": true, + "controller": true, + "kind": "ReplicaSet", + "name": "kubernetes-dashboard-6dcdfcd68b", + "uid": "71ff2821-a1f2-11e9-9bc6-127bb0ec03b8" + } + ], + "resourceVersion": "15831517", + "selfLink": "/api/v1/namespaces/kube-system/pods/kubernetes-dashboard-6dcdfcd68b-nfqbf", + "uid": "9583b2ab-a1f2-11e9-8b08-d602e29755d5" + }, + "spec": { + "affinity": { + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "kubernetes.azure.com/cluster", + "operator": "Exists" + } + ] + } + ] + } + } + }, + "containers": [ + { + "env": [ + { + "name": "KUBERNETES_PORT_443_TCP_ADDR", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + }, + { + "name": "KUBERNETES_PORT", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_PORT_443_TCP", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_SERVICE_HOST", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + } + ], + "image": "aksrepos.azurecr.io/mirror/kubernetes-dashboard-amd64:v1.10.1", + "imagePullPolicy": "IfNotPresent", + "livenessProbe": { + "failureThreshold": 3, + "httpGet": { + "path": "/", + "port": 9090, + "scheme": "HTTP" + }, + "initialDelaySeconds": 30, + "periodSeconds": 10, + "successThreshold": 1, + "timeoutSeconds": 30 + }, + "name": "main", + "ports": [ + { + "containerPort": 9090, + "name": "http", + "protocol": "TCP" + } + ], + "resources": { + "limits": { + "cpu": "100m", + "memory": "500Mi" + }, + 
"requests": { + "cpu": "100m", + "memory": "50Mi" + } + }, + "terminationMessagePath": "/dev/termination-log", + "terminationMessagePolicy": "File", + "volumeMounts": [ + { + "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount", + "name": "kubernetes-dashboard-token-w4t8s", + "readOnly": true + } + ] + } + ], + "dnsPolicy": "ClusterFirst", + "imagePullSecrets": [ + { + "name": "emptyacrsecret" + } + ], + "nodeName": "aks-nodepool1-19574989-0", + "nodeSelector": { + "beta.kubernetes.io/os": "linux" + }, + "priority": 2000001000, + "priorityClassName": "system-node-critical", + "restartPolicy": "Always", + "schedulerName": "default-scheduler", + "securityContext": {}, + "serviceAccount": "kubernetes-dashboard", + "serviceAccountName": "kubernetes-dashboard", + "terminationGracePeriodSeconds": 30, + "tolerations": [ + { + "key": "CriticalAddonsOnly", + "operator": "Exists" + }, + { + "effect": "NoExecute", + "key": "node.kubernetes.io/not-ready", + "operator": "Exists", + "tolerationSeconds": 300 + }, + { + "effect": "NoExecute", + "key": "node.kubernetes.io/unreachable", + "operator": "Exists", + "tolerationSeconds": 300 + } + ], + "volumes": [ + { + "name": "kubernetes-dashboard-token-w4t8s", + "secret": { + "defaultMode": 420, + "secretName": "kubernetes-dashboard-token-w4t8s" + } + } + ] + }, + "status": { + "conditions": [ + { + "lastProbeTime": null, + "lastTransitionTime": "2019-07-09T02:38:14Z", + "status": "True", + "type": "Initialized" + }, + { + "lastProbeTime": null, + "lastTransitionTime": "2019-07-09T02:39:08Z", + "status": "True", + "type": "Ready" + }, + { + "lastProbeTime": null, + "lastTransitionTime": null, + "status": "True", + "type": "ContainersReady" + }, + { + "lastProbeTime": null, + "lastTransitionTime": "2019-07-09T02:38:07Z", + "status": "True", + "type": "PodScheduled" + } + ], + "containerStatuses": [ + { + "containerID": "docker://2b042ce7bdf3d03cb606317b19ee797cbf7b99c65076a67001064bccb313b3cb", + "image": 
"aksrepos.azurecr.io/mirror/kubernetes-dashboard-amd64:v1.10.1", + "imageID": "docker-pullable://aksrepos.azurecr.io/mirror/kubernetes-dashboard-amd64@sha256:0ae6b69432e78069c5ce2bcde0fe409c5c4d6f0f4d9cd50a17974fea38898747", + "lastState": {}, + "name": "main", + "ready": true, + "restartCount": 0, + "state": { + "running": { + "startedAt": "2019-07-09T02:39:07Z" + } + } + } + ], + "hostIP": "10.240.0.4", + "phase": "Running", + "podIP": "10.244.1.197", + "qosClass": "Burstable", + "startTime": "2019-07-09T02:38:14Z" + } + }, + { + "apiVersion": "v1", + "kind": "Pod", + "metadata": { + "creationTimestamp": "2019-07-09T02:38:06Z", + "generateName": "metrics-server-76cd9fb66-", + "labels": { + "k8s-app": "metrics-server", + "pod-template-hash": "327859622" + }, + "name": "metrics-server-76cd9fb66-h2q55", + "namespace": "kube-system", + "ownerReferences": [ + { + "apiVersion": "apps/v1", + "blockOwnerDeletion": true, + "controller": true, + "kind": "ReplicaSet", + "name": "metrics-server-76cd9fb66", + "uid": "71c837df-a1f2-11e9-9bc6-127bb0ec03b8" + } + ], + "resourceVersion": "15144037", + "selfLink": "/api/v1/namespaces/kube-system/pods/metrics-server-76cd9fb66-h2q55", + "uid": "9543dbb7-a1f2-11e9-8b08-d602e29755d5" + }, + "spec": { + "affinity": { + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "kubernetes.azure.com/cluster", + "operator": "Exists" + } + ] + } + ] + } + } + }, + "containers": [ + { + "command": [ + "/metrics-server", + "--source=kubernetes.summary_api:''" + ], + "env": [ + { + "name": "KUBERNETES_PORT_443_TCP_ADDR", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + }, + { + "name": "KUBERNETES_PORT", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_PORT_443_TCP", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + 
"name": "KUBERNETES_SERVICE_HOST", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + } + ], + "image": "aksrepos.azurecr.io/mirror/metrics-server-amd64:v0.2.1", + "imagePullPolicy": "IfNotPresent", + "name": "metrics-server", + "resources": {}, + "terminationMessagePath": "/dev/termination-log", + "terminationMessagePolicy": "File", + "volumeMounts": [ + { + "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount", + "name": "metrics-server-token-qtdgm", + "readOnly": true + } + ] + } + ], + "dnsPolicy": "ClusterFirst", + "imagePullSecrets": [ + { + "name": "emptyacrsecret" + } + ], + "nodeName": "aks-nodepool1-19574989-1", + "nodeSelector": { + "beta.kubernetes.io/os": "linux" + }, + "priority": 2000001000, + "priorityClassName": "system-node-critical", + "restartPolicy": "Always", + "schedulerName": "default-scheduler", + "securityContext": {}, + "serviceAccount": "metrics-server", + "serviceAccountName": "metrics-server", + "terminationGracePeriodSeconds": 30, + "tolerations": [ + { + "key": "CriticalAddonsOnly", + "operator": "Exists" + }, + { + "effect": "NoExecute", + "key": "node.kubernetes.io/not-ready", + "operator": "Exists", + "tolerationSeconds": 300 + }, + { + "effect": "NoExecute", + "key": "node.kubernetes.io/unreachable", + "operator": "Exists", + "tolerationSeconds": 300 + } + ], + "volumes": [ + { + "name": "metrics-server-token-qtdgm", + "secret": { + "defaultMode": 420, + "secretName": "metrics-server-token-qtdgm" + } + } + ] + }, + "status": { + "conditions": [ + { + "lastProbeTime": null, + "lastTransitionTime": "2019-07-09T02:38:09Z", + "status": "True", + "type": "Initialized" + }, + { + "lastProbeTime": null, + "lastTransitionTime": "2019-07-09T02:38:20Z", + "status": "True", + "type": "Ready" + }, + { + "lastProbeTime": null, + "lastTransitionTime": null, + "status": "True", + "type": "ContainersReady" + }, + { + "lastProbeTime": null, + "lastTransitionTime": "2019-07-09T02:38:07Z", + "status": "True", + 
"type": "PodScheduled" + } + ], + "containerStatuses": [ + { + "containerID": "docker://f60ef82657e5ccdfb611a4f3381848dff77a01bddf95c431e4b7a2bf6f4b8087", + "image": "aksrepos.azurecr.io/mirror/metrics-server-amd64:v0.2.1", + "imageID": "docker-pullable://aksrepos.azurecr.io/mirror/metrics-server-amd64@sha256:220c0ed3451cb95e4b2f72dd5dc8d9d39d9f529722e5b29d8286373ce27b117e", + "lastState": {}, + "name": "metrics-server", + "ready": true, + "restartCount": 0, + "state": { + "running": { + "startedAt": "2019-07-09T02:38:18Z" + } + } + } + ], + "hostIP": "10.240.0.5", + "phase": "Running", + "podIP": "10.244.0.193", + "qosClass": "BestEffort", + "startTime": "2019-07-09T02:38:09Z" + } + }, + { + "apiVersion": "v1", + "kind": "Pod", + "metadata": { + "annotations": { + "agentVersion": "1.10.0.1", + "dockerProviderVersion": "6.0.0-0", + "schema-versions": "v1" + }, + "creationTimestamp": "2019-08-23T19:53:57Z", + "generateName": "omsagent-", + "labels": { + "controller-revision-hash": "868116844", + "dsName": "omsagent-ds", + "pod-template-generation": "9" + }, + "name": "omsagent-25pks", + "namespace": "kube-system", + "ownerReferences": [ + { + "apiVersion": "apps/v1", + "blockOwnerDeletion": true, + "controller": true, + "kind": "DaemonSet", + "name": "omsagent", + "uid": "e2f8c552-c2d2-11e9-8736-86290fd7dd1f" + } + ], + "resourceVersion": "19063729", + "selfLink": "/api/v1/namespaces/kube-system/pods/omsagent-25pks", + "uid": "be78d7f6-c5df-11e9-8736-86290fd7dd1f" + }, + "spec": { + "containers": [ + { + "env": [ + { + "name": "AKS_RESOURCE_ID", + "value": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test" + }, + { + "name": "AKS_REGION", + "value": "eastus" + }, + { + "name": "CONTROLLER_TYPE", + "value": "DaemonSet" + }, + { + "name": "NODE_IP", + "valueFrom": { + "fieldRef": { + "apiVersion": "v1", + "fieldPath": "status.hostIP" + } + } + }, + { + "name": 
"KUBERNETES_PORT_443_TCP_ADDR", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + }, + { + "name": "KUBERNETES_PORT", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_PORT_443_TCP", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_SERVICE_HOST", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + } + ], + "image": "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod08222019", + "imagePullPolicy": "IfNotPresent", + "livenessProbe": { + "exec": { + "command": [ + "/bin/bash", + "-c", + "/opt/livenessprobe.sh" + ] + }, + "failureThreshold": 3, + "initialDelaySeconds": 60, + "periodSeconds": 60, + "successThreshold": 1, + "timeoutSeconds": 1 + }, + "name": "omsagent", + "ports": [ + { + "containerPort": 25225, + "protocol": "TCP" + }, + { + "containerPort": 25224, + "protocol": "UDP" + } + ], + "resources": { + "limits": { + "cpu": "150m", + "memory": "600Mi" + }, + "requests": { + "cpu": "75m", + "memory": "225Mi" + } + }, + "securityContext": { + "privileged": true + }, + "terminationMessagePath": "/dev/termination-log", + "terminationMessagePolicy": "File", + "volumeMounts": [ + { + "mountPath": "/hostfs", + "name": "host-root", + "readOnly": true + }, + { + "mountPath": "/var/run/host", + "name": "docker-sock" + }, + { + "mountPath": "/var/log", + "name": "host-log" + }, + { + "mountPath": "/var/lib/docker/containers", + "name": "containerlog-path" + }, + { + "mountPath": "/etc/kubernetes/host", + "name": "azure-json-path" + }, + { + "mountPath": "/etc/omsagent-secret", + "name": "omsagent-secret" + }, + { + "mountPath": "/etc/config/settings", + "name": "settings-vol-config", + "readOnly": true + }, + { + "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount", + "name": "omsagent-token-fjmqb", + "readOnly": true + } + ] + } + ], + "dnsPolicy": 
"ClusterFirst", + "nodeName": "aks-nodepool1-19574989-2", + "nodeSelector": { + "beta.kubernetes.io/os": "linux" + }, + "priority": 0, + "restartPolicy": "Always", + "schedulerName": "default-scheduler", + "securityContext": {}, + "serviceAccount": "omsagent", + "serviceAccountName": "omsagent", + "terminationGracePeriodSeconds": 30, + "tolerations": [ + { + "effect": "NoSchedule", + "key": "node-role.kubernetes.io/master", + "operator": "Equal", + "value": "true" + }, + { + "effect": "NoExecute", + "key": "node.kubernetes.io/not-ready", + "operator": "Exists" + }, + { + "effect": "NoExecute", + "key": "node.kubernetes.io/unreachable", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "node.kubernetes.io/disk-pressure", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "node.kubernetes.io/memory-pressure", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "node.kubernetes.io/unschedulable", + "operator": "Exists" + } + ], + "volumes": [ + { + "hostPath": { + "path": "/", + "type": "" + }, + "name": "host-root" + }, + { + "hostPath": { + "path": "/var/run", + "type": "" + }, + "name": "docker-sock" + }, + { + "hostPath": { + "path": "/etc/hostname", + "type": "" + }, + "name": "container-hostname" + }, + { + "hostPath": { + "path": "/var/log", + "type": "" + }, + "name": "host-log" + }, + { + "hostPath": { + "path": "/var/lib/docker/containers", + "type": "" + }, + "name": "containerlog-path" + }, + { + "hostPath": { + "path": "/etc/kubernetes", + "type": "" + }, + "name": "azure-json-path" + }, + { + "name": "omsagent-secret", + "secret": { + "defaultMode": 420, + "secretName": "omsagent-secret" + } + }, + { + "configMap": { + "defaultMode": 420, + "name": "container-azm-ms-agentconfig", + "optional": true + }, + "name": "settings-vol-config" + }, + { + "name": "omsagent-token-fjmqb", + "secret": { + "defaultMode": 420, + "secretName": "omsagent-token-fjmqb" + } + } + ] + }, + "status": { + "conditions": [ + { + 
"lastProbeTime": null, + "lastTransitionTime": "2019-08-23T19:53:57Z", + "status": "True", + "type": "Initialized" + }, + { + "lastProbeTime": null, + "lastTransitionTime": "2019-08-23T19:54:44Z", + "status": "True", + "type": "Ready" + }, + { + "lastProbeTime": null, + "lastTransitionTime": null, + "status": "True", + "type": "ContainersReady" + }, + { + "lastProbeTime": null, + "lastTransitionTime": "2019-08-23T19:53:57Z", + "status": "True", + "type": "PodScheduled" + } + ], + "containerStatuses": [ + { + "containerID": "docker://acd5cedc2c5874122047c47bb1398f35a7c0297292fc4a0e01345123c233d19a", + "image": "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod08222019", + "imageID": "docker-pullable://mcr.microsoft.com/azuremonitor/containerinsights/ciprod@sha256:69b420bdb4081293c37e2d0f8ad2e4054bd516f5c08c7512d6b695660a36eccf", + "lastState": {}, + "name": "omsagent", + "ready": true, + "restartCount": 0, + "state": { + "running": { + "startedAt": "2019-08-23T19:54:43Z" + } + } + } + ], + "hostIP": "10.240.0.7", + "phase": "Running", + "podIP": "10.244.12.169", + "qosClass": "Burstable", + "startTime": "2019-08-23T19:53:57Z" + } + }, + { + "apiVersion": "v1", + "kind": "Pod", + "metadata": { + "annotations": { + "agentVersion": "1.10.0.1", + "dockerProviderVersion": "6.0.0-0", + "schema-versions": "v1" + }, + "creationTimestamp": "2019-08-23T19:51:35Z", + "generateName": "omsagent-", + "labels": { + "controller-revision-hash": "868116844", + "dsName": "omsagent-ds", + "pod-template-generation": "9" + }, + "name": "omsagent-4tncr", + "namespace": "kube-system", + "ownerReferences": [ + { + "apiVersion": "apps/v1", + "blockOwnerDeletion": true, + "controller": true, + "kind": "DaemonSet", + "name": "omsagent", + "uid": "e2f8c552-c2d2-11e9-8736-86290fd7dd1f" + } + ], + "resourceVersion": "19063468", + "selfLink": "/api/v1/namespaces/kube-system/pods/omsagent-4tncr", + "uid": "69e68b21-c5df-11e9-8736-86290fd7dd1f" + }, + "spec": { + "containers": [ + { + 
"env": [ + { + "name": "AKS_RESOURCE_ID", + "value": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test" + }, + { + "name": "AKS_REGION", + "value": "eastus" + }, + { + "name": "CONTROLLER_TYPE", + "value": "DaemonSet" + }, + { + "name": "NODE_IP", + "valueFrom": { + "fieldRef": { + "apiVersion": "v1", + "fieldPath": "status.hostIP" + } + } + }, + { + "name": "KUBERNETES_PORT_443_TCP_ADDR", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + }, + { + "name": "KUBERNETES_PORT", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_PORT_443_TCP", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_SERVICE_HOST", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + } + ], + "image": "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod08222019", + "imagePullPolicy": "IfNotPresent", + "livenessProbe": { + "exec": { + "command": [ + "/bin/bash", + "-c", + "/opt/livenessprobe.sh" + ] + }, + "failureThreshold": 3, + "initialDelaySeconds": 60, + "periodSeconds": 60, + "successThreshold": 1, + "timeoutSeconds": 1 + }, + "name": "omsagent", + "ports": [ + { + "containerPort": 25225, + "protocol": "TCP" + }, + { + "containerPort": 25224, + "protocol": "UDP" + } + ], + "resources": { + "limits": { + "cpu": "150m", + "memory": "600Mi" + }, + "requests": { + "cpu": "75m", + "memory": "225Mi" + } + }, + "securityContext": { + "privileged": true + }, + "terminationMessagePath": "/dev/termination-log", + "terminationMessagePolicy": "File", + "volumeMounts": [ + { + "mountPath": "/hostfs", + "name": "host-root", + "readOnly": true + }, + { + "mountPath": "/var/run/host", + "name": "docker-sock" + }, + { + "mountPath": "/var/log", + "name": "host-log" + }, + { + "mountPath": 
"/var/lib/docker/containers", + "name": "containerlog-path" + }, + { + "mountPath": "/etc/kubernetes/host", + "name": "azure-json-path" + }, + { + "mountPath": "/etc/omsagent-secret", + "name": "omsagent-secret" + }, + { + "mountPath": "/etc/config/settings", + "name": "settings-vol-config", + "readOnly": true + }, + { + "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount", + "name": "omsagent-token-fjmqb", + "readOnly": true + } + ] + } + ], + "dnsPolicy": "ClusterFirst", + "nodeName": "aks-nodepool1-19574989-1", + "nodeSelector": { + "beta.kubernetes.io/os": "linux" + }, + "priority": 0, + "restartPolicy": "Always", + "schedulerName": "default-scheduler", + "securityContext": {}, + "serviceAccount": "omsagent", + "serviceAccountName": "omsagent", + "terminationGracePeriodSeconds": 30, + "tolerations": [ + { + "effect": "NoSchedule", + "key": "node-role.kubernetes.io/master", + "operator": "Equal", + "value": "true" + }, + { + "effect": "NoExecute", + "key": "node.kubernetes.io/not-ready", + "operator": "Exists" + }, + { + "effect": "NoExecute", + "key": "node.kubernetes.io/unreachable", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "node.kubernetes.io/disk-pressure", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "node.kubernetes.io/memory-pressure", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "node.kubernetes.io/unschedulable", + "operator": "Exists" + } + ], + "volumes": [ + { + "hostPath": { + "path": "/", + "type": "" + }, + "name": "host-root" + }, + { + "hostPath": { + "path": "/var/run", + "type": "" + }, + "name": "docker-sock" + }, + { + "hostPath": { + "path": "/etc/hostname", + "type": "" + }, + "name": "container-hostname" + }, + { + "hostPath": { + "path": "/var/log", + "type": "" + }, + "name": "host-log" + }, + { + "hostPath": { + "path": "/var/lib/docker/containers", + "type": "" + }, + "name": "containerlog-path" + }, + { + "hostPath": { + "path": "/etc/kubernetes", + "type": 
"" + }, + "name": "azure-json-path" + }, + { + "name": "omsagent-secret", + "secret": { + "defaultMode": 420, + "secretName": "omsagent-secret" + } + }, + { + "configMap": { + "defaultMode": 420, + "name": "container-azm-ms-agentconfig", + "optional": true + }, + "name": "settings-vol-config" + }, + { + "name": "omsagent-token-fjmqb", + "secret": { + "defaultMode": 420, + "secretName": "omsagent-token-fjmqb" + } + } + ] + }, + "status": { + "conditions": [ + { + "lastProbeTime": null, + "lastTransitionTime": "2019-08-23T19:51:35Z", + "status": "True", + "type": "Initialized" + }, + { + "lastProbeTime": null, + "lastTransitionTime": "2019-08-23T19:52:28Z", + "status": "True", + "type": "Ready" + }, + { + "lastProbeTime": null, + "lastTransitionTime": null, + "status": "True", + "type": "ContainersReady" + }, + { + "lastProbeTime": null, + "lastTransitionTime": "2019-08-23T19:51:35Z", + "status": "True", + "type": "PodScheduled" + } + ], + "containerStatuses": [ + { + "containerID": "docker://7803b80452aa34460c848d9c1ca65d6bd925665cf78faaa8dbc122482f93c744", + "image": "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod08222019", + "imageID": "docker-pullable://mcr.microsoft.com/azuremonitor/containerinsights/ciprod@sha256:69b420bdb4081293c37e2d0f8ad2e4054bd516f5c08c7512d6b695660a36eccf", + "lastState": {}, + "name": "omsagent", + "ready": true, + "restartCount": 0, + "state": { + "running": { + "startedAt": "2019-08-23T19:52:27Z" + } + } + } + ], + "hostIP": "10.240.0.5", + "phase": "Running", + "podIP": "10.244.0.251", + "qosClass": "Burstable", + "startTime": "2019-08-23T19:51:35Z" + } + }, + { + "apiVersion": "v1", + "kind": "Pod", + "metadata": { + "annotations": { + "agentVersion": "1.10.0.1", + "dockerProviderVersion": "6.0.0-0", + "schema-versions": "v1" + }, + "creationTimestamp": "2019-08-23T19:53:36Z", + "generateName": "omsagent-", + "labels": { + "controller-revision-hash": "868116844", + "dsName": "omsagent-ds", + "pod-template-generation": 
"9" + }, + "name": "omsagent-h44fk", + "namespace": "kube-system", + "ownerReferences": [ + { + "apiVersion": "apps/v1", + "blockOwnerDeletion": true, + "controller": true, + "kind": "DaemonSet", + "name": "omsagent", + "uid": "e2f8c552-c2d2-11e9-8736-86290fd7dd1f" + } + ], + "resourceVersion": "19063631", + "selfLink": "/api/v1/namespaces/kube-system/pods/omsagent-h44fk", + "uid": "b1e04e1c-c5df-11e9-8736-86290fd7dd1f" + }, + "spec": { + "containers": [ + { + "env": [ + { + "name": "AKS_RESOURCE_ID", + "value": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test" + }, + { + "name": "AKS_REGION", + "value": "eastus" + }, + { + "name": "CONTROLLER_TYPE", + "value": "DaemonSet" + }, + { + "name": "NODE_IP", + "valueFrom": { + "fieldRef": { + "apiVersion": "v1", + "fieldPath": "status.hostIP" + } + } + }, + { + "name": "KUBERNETES_PORT_443_TCP_ADDR", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + }, + { + "name": "KUBERNETES_PORT", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_PORT_443_TCP", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_SERVICE_HOST", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + } + ], + "image": "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod08222019", + "imagePullPolicy": "IfNotPresent", + "livenessProbe": { + "exec": { + "command": [ + "/bin/bash", + "-c", + "/opt/livenessprobe.sh" + ] + }, + "failureThreshold": 3, + "initialDelaySeconds": 60, + "periodSeconds": 60, + "successThreshold": 1, + "timeoutSeconds": 1 + }, + "name": "omsagent", + "ports": [ + { + "containerPort": 25225, + "protocol": "TCP" + }, + { + "containerPort": 25224, + "protocol": "UDP" + } + ], + "resources": { + "limits": { + "cpu": "150m", + 
"memory": "600Mi" + }, + "requests": { + "cpu": "75m", + "memory": "225Mi" + } + }, + "securityContext": { + "privileged": true + }, + "terminationMessagePath": "/dev/termination-log", + "terminationMessagePolicy": "File", + "volumeMounts": [ + { + "mountPath": "/hostfs", + "name": "host-root", + "readOnly": true + }, + { + "mountPath": "/var/run/host", + "name": "docker-sock" + }, + { + "mountPath": "/var/log", + "name": "host-log" + }, + { + "mountPath": "/var/lib/docker/containers", + "name": "containerlog-path" + }, + { + "mountPath": "/etc/kubernetes/host", + "name": "azure-json-path" + }, + { + "mountPath": "/etc/omsagent-secret", + "name": "omsagent-secret" + }, + { + "mountPath": "/etc/config/settings", + "name": "settings-vol-config", + "readOnly": true + }, + { + "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount", + "name": "omsagent-token-fjmqb", + "readOnly": true + } + ] + } + ], + "dnsPolicy": "ClusterFirst", + "nodeName": "aks-nodepool1-19574989-0", + "nodeSelector": { + "beta.kubernetes.io/os": "linux" + }, + "priority": 0, + "restartPolicy": "Always", + "schedulerName": "default-scheduler", + "securityContext": {}, + "serviceAccount": "omsagent", + "serviceAccountName": "omsagent", + "terminationGracePeriodSeconds": 30, + "tolerations": [ + { + "effect": "NoSchedule", + "key": "node-role.kubernetes.io/master", + "operator": "Equal", + "value": "true" + }, + { + "effect": "NoExecute", + "key": "node.kubernetes.io/not-ready", + "operator": "Exists" + }, + { + "effect": "NoExecute", + "key": "node.kubernetes.io/unreachable", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "node.kubernetes.io/disk-pressure", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "node.kubernetes.io/memory-pressure", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "node.kubernetes.io/unschedulable", + "operator": "Exists" + } + ], + "volumes": [ + { + "hostPath": { + "path": "/", + "type": "" + }, + "name": 
"host-root" + }, + { + "hostPath": { + "path": "/var/run", + "type": "" + }, + "name": "docker-sock" + }, + { + "hostPath": { + "path": "/etc/hostname", + "type": "" + }, + "name": "container-hostname" + }, + { + "hostPath": { + "path": "/var/log", + "type": "" + }, + "name": "host-log" + }, + { + "hostPath": { + "path": "/var/lib/docker/containers", + "type": "" + }, + "name": "containerlog-path" + }, + { + "hostPath": { + "path": "/etc/kubernetes", + "type": "" + }, + "name": "azure-json-path" + }, + { + "name": "omsagent-secret", + "secret": { + "defaultMode": 420, + "secretName": "omsagent-secret" + } + }, + { + "configMap": { + "defaultMode": 420, + "name": "container-azm-ms-agentconfig", + "optional": true + }, + "name": "settings-vol-config" + }, + { + "name": "omsagent-token-fjmqb", + "secret": { + "defaultMode": 420, + "secretName": "omsagent-token-fjmqb" + } + } + ] + }, + "status": { + "conditions": [ + { + "lastProbeTime": null, + "lastTransitionTime": "2019-08-23T19:53:36Z", + "status": "True", + "type": "Initialized" + }, + { + "lastProbeTime": null, + "lastTransitionTime": "2019-08-23T19:53:51Z", + "status": "True", + "type": "Ready" + }, + { + "lastProbeTime": null, + "lastTransitionTime": null, + "status": "True", + "type": "ContainersReady" + }, + { + "lastProbeTime": null, + "lastTransitionTime": "2019-08-23T19:53:36Z", + "status": "True", + "type": "PodScheduled" + } + ], + "containerStatuses": [ + { + "containerID": "docker://4b71a82e472a8e5d0bc4ef9b9b5d2ccf25741b31269480a77e29424ebe87757c", + "image": "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod08222019", + "imageID": "docker-pullable://mcr.microsoft.com/azuremonitor/containerinsights/ciprod@sha256:69b420bdb4081293c37e2d0f8ad2e4054bd516f5c08c7512d6b695660a36eccf", + "lastState": {}, + "name": "omsagent", + "ready": true, + "restartCount": 0, + "state": { + "running": { + "startedAt": "2019-08-23T19:53:49Z" + } + } + } + ], + "hostIP": "10.240.0.4", + "phase": "Running", + 
"podIP": "10.244.1.35", + "qosClass": "Burstable", + "startTime": "2019-08-23T19:53:36Z" + } + }, + { + "apiVersion": "v1", + "kind": "Pod", + "metadata": { + "annotations": { + "agentVersion": "1.10.0.1", + "dockerProviderVersion": "6.0.0-0", + "schema-versions": "v1" + }, + "creationTimestamp": "2019-08-23T19:51:28Z", + "generateName": "omsagent-rs-5bb85d7468-", + "labels": { + "pod-template-hash": "1664183024", + "rsName": "omsagent-rs" + }, + "name": "omsagent-rs-5bb85d7468-dnxpw", + "namespace": "kube-system", + "ownerReferences": [ + { + "apiVersion": "apps/v1", + "blockOwnerDeletion": true, + "controller": true, + "kind": "ReplicaSet", + "name": "omsagent-rs-5bb85d7468", + "uid": "659ec974-c5df-11e9-8736-86290fd7dd1f" + } + ], + "resourceVersion": "19063495", + "selfLink": "/api/v1/namespaces/kube-system/pods/omsagent-rs-5bb85d7468-dnxpw", + "uid": "65a6f978-c5df-11e9-8736-86290fd7dd1f" + }, + "spec": { + "containers": [ + { + "env": [ + { + "name": "AKS_RESOURCE_ID", + "value": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test" + }, + { + "name": "AKS_REGION", + "value": "eastus" + }, + { + "name": "CONTROLLER_TYPE", + "value": "ReplicaSet" + }, + { + "name": "NODE_IP", + "valueFrom": { + "fieldRef": { + "apiVersion": "v1", + "fieldPath": "status.hostIP" + } + } + }, + { + "name": "KUBERNETES_PORT_443_TCP_ADDR", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + }, + { + "name": "KUBERNETES_PORT", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_PORT_443_TCP", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_SERVICE_HOST", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + } + ], + "image": 
"mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod08222019", + "imagePullPolicy": "IfNotPresent", + "livenessProbe": { + "exec": { + "command": [ + "/bin/bash", + "-c", + "/opt/livenessprobe.sh" + ] + }, + "failureThreshold": 3, + "initialDelaySeconds": 60, + "periodSeconds": 60, + "successThreshold": 1, + "timeoutSeconds": 1 + }, + "name": "omsagent", + "ports": [ + { + "containerPort": 25225, + "protocol": "TCP" + }, + { + "containerPort": 25224, + "protocol": "UDP" + }, + { + "containerPort": 25227, + "name": "in-rs-tcp", + "protocol": "TCP" + } + ], + "resources": { + "limits": { + "cpu": "150m", + "memory": "500Mi" + }, + "requests": { + "cpu": "110m", + "memory": "250Mi" + } + }, + "securityContext": { + "privileged": true + }, + "terminationMessagePath": "/dev/termination-log", + "terminationMessagePolicy": "File", + "volumeMounts": [ + { + "mountPath": "/var/run/host", + "name": "docker-sock" + }, + { + "mountPath": "/var/log", + "name": "host-log" + }, + { + "mountPath": "/var/lib/docker/containers", + "name": "containerlog-path" + }, + { + "mountPath": "/etc/kubernetes/host", + "name": "azure-json-path" + }, + { + "mountPath": "/etc/omsagent-secret", + "name": "omsagent-secret", + "readOnly": true + }, + { + "mountPath": "/etc/config", + "name": "omsagent-rs-config" + }, + { + "mountPath": "/etc/config/settings", + "name": "settings-vol-config", + "readOnly": true + }, + { + "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount", + "name": "omsagent-token-fjmqb", + "readOnly": true + } + ] + } + ], + "dnsPolicy": "ClusterFirst", + "nodeName": "aks-nodepool1-19574989-0", + "nodeSelector": { + "beta.kubernetes.io/os": "linux", + "kubernetes.io/role": "agent" + }, + "priority": 0, + "restartPolicy": "Always", + "schedulerName": "default-scheduler", + "securityContext": {}, + "serviceAccount": "omsagent", + "serviceAccountName": "omsagent", + "terminationGracePeriodSeconds": 30, + "tolerations": [ + { + "effect": "NoExecute", + "key": 
"node.kubernetes.io/not-ready", + "operator": "Exists", + "tolerationSeconds": 300 + }, + { + "effect": "NoExecute", + "key": "node.kubernetes.io/unreachable", + "operator": "Exists", + "tolerationSeconds": 300 + } + ], + "volumes": [ + { + "hostPath": { + "path": "/var/run", + "type": "" + }, + "name": "docker-sock" + }, + { + "hostPath": { + "path": "/etc/hostname", + "type": "" + }, + "name": "container-hostname" + }, + { + "hostPath": { + "path": "/var/log", + "type": "" + }, + "name": "host-log" + }, + { + "hostPath": { + "path": "/var/lib/docker/containers", + "type": "" + }, + "name": "containerlog-path" + }, + { + "hostPath": { + "path": "/etc/kubernetes", + "type": "" + }, + "name": "azure-json-path" + }, + { + "name": "omsagent-secret", + "secret": { + "defaultMode": 420, + "secretName": "omsagent-secret" + } + }, + { + "configMap": { + "defaultMode": 420, + "name": "omsagent-rs-config" + }, + "name": "omsagent-rs-config" + }, + { + "configMap": { + "defaultMode": 420, + "name": "container-azm-ms-agentconfig", + "optional": true + }, + "name": "settings-vol-config" + }, + { + "name": "omsagent-token-fjmqb", + "secret": { + "defaultMode": 420, + "secretName": "omsagent-token-fjmqb" + } + } + ] + }, + "status": { + "conditions": [ + { + "lastProbeTime": null, + "lastTransitionTime": "2019-08-23T19:51:28Z", + "status": "True", + "type": "Initialized" + }, + { + "lastProbeTime": null, + "lastTransitionTime": "2019-08-23T19:52:37Z", + "status": "True", + "type": "Ready" + }, + { + "lastProbeTime": null, + "lastTransitionTime": null, + "status": "True", + "type": "ContainersReady" + }, + { + "lastProbeTime": null, + "lastTransitionTime": "2019-08-23T19:51:28Z", + "status": "True", + "type": "PodScheduled" + } + ], + "containerStatuses": [ + { + "containerID": "docker://7e080036bc213a7dadd95b1d8439e06a1b62822219642a83cab059dc4292b0e5", + "image": "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod08222019", + "imageID": 
"docker-pullable://mcr.microsoft.com/azuremonitor/containerinsights/ciprod@sha256:69b420bdb4081293c37e2d0f8ad2e4054bd516f5c08c7512d6b695660a36eccf", + "lastState": {}, + "name": "omsagent", + "ready": true, + "restartCount": 0, + "state": { + "running": { + "startedAt": "2019-08-23T19:52:37Z" + } + } + } + ], + "hostIP": "10.240.0.4", + "phase": "Running", + "podIP": "10.244.1.34", + "qosClass": "Burstable", + "startTime": "2019-08-23T19:51:28Z" + } + }, + { + "apiVersion": "v1", + "kind": "Pod", + "metadata": { + "annotations": { + "agentVersion": "1.10.0.1", + "dockerProviderVersion": "6.0.0-0", + "schema-versions": "v1" + }, + "creationTimestamp": "2019-08-23T19:52:35Z", + "generateName": "omsagent-", + "labels": { + "controller-revision-hash": "868116844", + "dsName": "omsagent-ds", + "pod-template-generation": "9" + }, + "name": "omsagent-sb6xx", + "namespace": "kube-system", + "ownerReferences": [ + { + "apiVersion": "apps/v1", + "blockOwnerDeletion": true, + "controller": true, + "kind": "DaemonSet", + "name": "omsagent", + "uid": "e2f8c552-c2d2-11e9-8736-86290fd7dd1f" + } + ], + "resourceVersion": "19063577", + "selfLink": "/api/v1/namespaces/kube-system/pods/omsagent-sb6xx", + "uid": "8dbd5e8b-c5df-11e9-8736-86290fd7dd1f" + }, + "spec": { + "containers": [ + { + "env": [ + { + "name": "AKS_RESOURCE_ID", + "value": "/subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourcegroups/dilipr-health-test/providers/Microsoft.ContainerService/managedClusters/dilipr-health-test" + }, + { + "name": "AKS_REGION", + "value": "eastus" + }, + { + "name": "CONTROLLER_TYPE", + "value": "DaemonSet" + }, + { + "name": "NODE_IP", + "valueFrom": { + "fieldRef": { + "apiVersion": "v1", + "fieldPath": "status.hostIP" + } + } + }, + { + "name": "KUBERNETES_PORT_443_TCP_ADDR", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + }, + { + "name": "KUBERNETES_PORT", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + 
}, + { + "name": "KUBERNETES_PORT_443_TCP", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_SERVICE_HOST", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + } + ], + "image": "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod08222019", + "imagePullPolicy": "IfNotPresent", + "livenessProbe": { + "exec": { + "command": [ + "/bin/bash", + "-c", + "/opt/livenessprobe.sh" + ] + }, + "failureThreshold": 3, + "initialDelaySeconds": 60, + "periodSeconds": 60, + "successThreshold": 1, + "timeoutSeconds": 1 + }, + "name": "omsagent", + "ports": [ + { + "containerPort": 25225, + "protocol": "TCP" + }, + { + "containerPort": 25224, + "protocol": "UDP" + } + ], + "resources": { + "limits": { + "cpu": "150m", + "memory": "600Mi" + }, + "requests": { + "cpu": "75m", + "memory": "225Mi" + } + }, + "securityContext": { + "privileged": true + }, + "terminationMessagePath": "/dev/termination-log", + "terminationMessagePolicy": "File", + "volumeMounts": [ + { + "mountPath": "/hostfs", + "name": "host-root", + "readOnly": true + }, + { + "mountPath": "/var/run/host", + "name": "docker-sock" + }, + { + "mountPath": "/var/log", + "name": "host-log" + }, + { + "mountPath": "/var/lib/docker/containers", + "name": "containerlog-path" + }, + { + "mountPath": "/etc/kubernetes/host", + "name": "azure-json-path" + }, + { + "mountPath": "/etc/omsagent-secret", + "name": "omsagent-secret" + }, + { + "mountPath": "/etc/config/settings", + "name": "settings-vol-config", + "readOnly": true + }, + { + "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount", + "name": "omsagent-token-fjmqb", + "readOnly": true + } + ] + } + ], + "dnsPolicy": "ClusterFirst", + "nodeName": "aks-nodepool1-19574989-3", + "nodeSelector": { + "beta.kubernetes.io/os": "linux" + }, + "priority": 0, + "restartPolicy": "Always", + "schedulerName": "default-scheduler", + "securityContext": {}, + 
"serviceAccount": "omsagent", + "serviceAccountName": "omsagent", + "terminationGracePeriodSeconds": 30, + "tolerations": [ + { + "effect": "NoSchedule", + "key": "node-role.kubernetes.io/master", + "operator": "Equal", + "value": "true" + }, + { + "effect": "NoExecute", + "key": "node.kubernetes.io/not-ready", + "operator": "Exists" + }, + { + "effect": "NoExecute", + "key": "node.kubernetes.io/unreachable", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "node.kubernetes.io/disk-pressure", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "node.kubernetes.io/memory-pressure", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "node.kubernetes.io/unschedulable", + "operator": "Exists" + } + ], + "volumes": [ + { + "hostPath": { + "path": "/", + "type": "" + }, + "name": "host-root" + }, + { + "hostPath": { + "path": "/var/run", + "type": "" + }, + "name": "docker-sock" + }, + { + "hostPath": { + "path": "/etc/hostname", + "type": "" + }, + "name": "container-hostname" + }, + { + "hostPath": { + "path": "/var/log", + "type": "" + }, + "name": "host-log" + }, + { + "hostPath": { + "path": "/var/lib/docker/containers", + "type": "" + }, + "name": "containerlog-path" + }, + { + "hostPath": { + "path": "/etc/kubernetes", + "type": "" + }, + "name": "azure-json-path" + }, + { + "name": "omsagent-secret", + "secret": { + "defaultMode": 420, + "secretName": "omsagent-secret" + } + }, + { + "configMap": { + "defaultMode": 420, + "name": "container-azm-ms-agentconfig", + "optional": true + }, + "name": "settings-vol-config" + }, + { + "name": "omsagent-token-fjmqb", + "secret": { + "defaultMode": 420, + "secretName": "omsagent-token-fjmqb" + } + } + ] + }, + "status": { + "conditions": [ + { + "lastProbeTime": null, + "lastTransitionTime": "2019-08-23T19:52:35Z", + "status": "True", + "type": "Initialized" + }, + { + "lastProbeTime": null, + "lastTransitionTime": "2019-08-23T19:53:25Z", + "status": "True", + "type": 
"Ready" + }, + { + "lastProbeTime": null, + "lastTransitionTime": null, + "status": "True", + "type": "ContainersReady" + }, + { + "lastProbeTime": null, + "lastTransitionTime": "2019-08-23T19:52:35Z", + "status": "True", + "type": "PodScheduled" + } + ], + "containerStatuses": [ + { + "containerID": "docker://f4f0cb19e5da394a4332847953c18d9321319f2ef422533b890ab844cb997879", + "image": "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod08222019", + "imageID": "docker-pullable://mcr.microsoft.com/azuremonitor/containerinsights/ciprod@sha256:69b420bdb4081293c37e2d0f8ad2e4054bd516f5c08c7512d6b695660a36eccf", + "lastState": {}, + "name": "omsagent", + "ready": true, + "restartCount": 0, + "state": { + "running": { + "startedAt": "2019-08-23T19:53:24Z" + } + } + } + ], + "hostIP": "10.240.0.6", + "phase": "Running", + "podIP": "10.244.2.62", + "qosClass": "Burstable", + "startTime": "2019-08-23T19:52:35Z" + } + }, + { + "apiVersion": "v1", + "kind": "Pod", + "metadata": { + "creationTimestamp": "2019-08-12T20:28:08Z", + "generateName": "tunnelfront-65c8cfb7cc-", + "labels": { + "component": "tunnel", + "pod-template-hash": "2174796377" + }, + "name": "tunnelfront-65c8cfb7cc-z8srb", + "namespace": "kube-system", + "ownerReferences": [ + { + "apiVersion": "apps/v1", + "blockOwnerDeletion": true, + "controller": true, + "kind": "ReplicaSet", + "name": "tunnelfront-65c8cfb7cc", + "uid": "7013afa3-a742-11e9-a08d-96dd47774ee5" + } + ], + "resourceVersion": "17628809", + "selfLink": "/api/v1/namespaces/kube-system/pods/tunnelfront-65c8cfb7cc-z8srb", + "uid": "b2a0e1b3-bd3f-11e9-b2a7-d61658c73830" + }, + "spec": { + "affinity": { + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "kubernetes.azure.com/cluster", + "operator": "Exists" + } + ] + } + ] + } + } + }, + "containers": [ + { + "env": [ + { + "name": "OVERRIDE_TUNNEL_SERVER_NAME", + "value": 
"t_dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + }, + { + "name": "TUNNEL_CLUSTERUSER_NAME", + "value": "28957308" + }, + { + "name": "TUNNELGATEWAY_SERVER_NAME", + "value": "dilipr-hea-dilipr-health-te-72c8e8-0b16acad.tun.eastus.azmk8s.io" + }, + { + "name": "TUNNELGATEWAY_SSH_PORT", + "value": "22" + }, + { + "name": "TUNNELGATEWAY_TLS_PORT", + "value": "443" + }, + { + "name": "KUBE_CONFIG", + "value": "/etc/kubernetes/kubeconfig/kubeconfig" + }, + { + "name": "KUBERNETES_PORT_443_TCP_ADDR", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + }, + { + "name": "KUBERNETES_PORT", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_PORT_443_TCP", + "value": "tcp://dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io:443" + }, + { + "name": "KUBERNETES_SERVICE_HOST", + "value": "dilipr-hea-dilipr-health-te-72c8e8-d3ccfd8f.hcp.eastus.azmk8s.io" + } + ], + "image": "aksrepos.azurecr.io/prod/hcp-tunnel-front:v1.9.2-v4.0.7", + "imagePullPolicy": "IfNotPresent", + "livenessProbe": { + "exec": { + "command": [ + "/lib/tunnel-front/check-tunnel-connection.sh" + ] + }, + "failureThreshold": 12, + "initialDelaySeconds": 10, + "periodSeconds": 60, + "successThreshold": 1, + "timeoutSeconds": 1 + }, + "name": "tunnel-front", + "resources": { + "requests": { + "cpu": "10m", + "memory": "64Mi" + } + }, + "securityContext": { + "privileged": true + }, + "terminationMessagePath": "/dev/termination-log", + "terminationMessagePolicy": "File", + "volumeMounts": [ + { + "mountPath": "/etc/kubernetes/kubeconfig", + "name": "kubeconfig", + "readOnly": true + }, + { + "mountPath": "/etc/kubernetes/certs", + "name": "certificates", + "readOnly": true + }, + { + "mountPath": "/var/run/secrets/kubernetes.io/serviceaccount", + "name": "tunnelfront-token-njgvg", + "readOnly": true + } + ] + } + ], + "dnsPolicy": "Default", + "imagePullSecrets": [ + { + "name": 
"emptyacrsecret" + } + ], + "nodeName": "aks-nodepool1-19574989-3", + "nodeSelector": { + "beta.kubernetes.io/os": "linux" + }, + "priority": 2000001000, + "priorityClassName": "system-node-critical", + "restartPolicy": "Always", + "schedulerName": "default-scheduler", + "securityContext": {}, + "serviceAccount": "tunnelfront", + "serviceAccountName": "tunnelfront", + "terminationGracePeriodSeconds": 30, + "tolerations": [ + { + "key": "CriticalAddonsOnly", + "operator": "Exists" + }, + { + "effect": "NoExecute", + "key": "node.kubernetes.io/not-ready", + "operator": "Exists", + "tolerationSeconds": 300 + }, + { + "effect": "NoExecute", + "key": "node.kubernetes.io/unreachable", + "operator": "Exists", + "tolerationSeconds": 300 + } + ], + "volumes": [ + { + "configMap": { + "defaultMode": 420, + "name": "tunnelfront-kubecfg", + "optional": true + }, + "name": "kubeconfig" + }, + { + "hostPath": { + "path": "/etc/kubernetes/certs", + "type": "" + }, + "name": "certificates" + }, + { + "name": "tunnelfront-token-njgvg", + "secret": { + "defaultMode": 420, + "secretName": "tunnelfront-token-njgvg" + } + } + ] + }, + "status": { + "conditions": [ + { + "lastProbeTime": null, + "lastTransitionTime": "2019-08-12T20:28:08Z", + "status": "True", + "type": "Initialized" + }, + { + "lastProbeTime": null, + "lastTransitionTime": "2019-08-12T20:28:13Z", + "status": "True", + "type": "Ready" + }, + { + "lastProbeTime": null, + "lastTransitionTime": null, + "status": "True", + "type": "ContainersReady" + }, + { + "lastProbeTime": null, + "lastTransitionTime": "2019-08-12T20:28:08Z", + "status": "True", + "type": "PodScheduled" + } + ], + "containerStatuses": [ + { + "containerID": "docker://ac3b7482b15ba1f825e7a9ceef11defaccdc2682b9a20bb7c98bc307a8a34cf6", + "image": "aksrepos.azurecr.io/prod/hcp-tunnel-front:v1.9.2-v4.0.7", + "imageID": "docker-pullable://aksrepos.azurecr.io/prod/hcp-tunnel-front@sha256:68878ee3ea1781b322ea3952c3370e31dd89be8bb0864e2bf27bdba6dc904c41", + 
"lastState": {}, + "name": "tunnel-front", + "ready": true, + "restartCount": 0, + "state": { + "running": { + "startedAt": "2019-08-12T20:28:13Z" + } + } + } + ], + "hostIP": "10.240.0.6", + "phase": "Running", + "podIP": "10.244.2.10", + "qosClass": "Burstable", + "startTime": "2019-08-12T20:28:08Z" + } + } + ], + "kind": "List", + "metadata": { + "resourceVersion": "", + "selfLink": "" + } +} From 382ed0294e57d9ec1dd3e85b0982f5eb3e286084 Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Fri, 4 Oct 2019 15:54:42 -0700 Subject: [PATCH 126/160] init containers fix and other bug fixes (#269) * init container - KPI and kubeperf changes * changes * changes * changes * changes for empty array fix * changes * changes * pod inventory exception fix * nil check changes * changes * fixing typo * changes * changes * PR - feedback * remove comment * tag pass changes * changes * tagdrop changes * changes * changes --- installer/conf/telegraf.conf | 3 +- .../scripts/tomlparser-prom-customconfig.rb | 2 +- source/code/go/src/plugins/oms.go | 85 ++++++++++--------- source/code/plugin/KubernetesApiClient.rb | 14 ++- source/code/plugin/in_kube_events.rb | 17 ++-- source/code/plugin/in_kube_nodes.rb | 13 ++- source/code/plugin/in_kube_podinventory.rb | 41 ++++++--- source/code/plugin/in_kube_services.rb | 12 ++- 8 files changed, 123 insertions(+), 64 deletions(-) diff --git a/installer/conf/telegraf.conf b/installer/conf/telegraf.conf index 4883de81b..cd22a56b4 100644 --- a/installer/conf/telegraf.conf +++ b/installer/conf/telegraf.conf @@ -566,7 +566,8 @@ ## Use TLS but skip chain & host verification insecure_skip_verify = true #tagexclude = ["AgentVersion","AKS_RESOURCE_ID","ACS_RESOURCE_NAME", "Region", "ClusterName", "ClusterType", "Computer", "ControllerType"] - + [inputs.prometheus.tagpass] + operation_type = ["create_container", "remove_container", "pull_image"] ## prometheus custom metrics [[inputs.prometheus]] diff --git 
a/installer/scripts/tomlparser-prom-customconfig.rb b/installer/scripts/tomlparser-prom-customconfig.rb index ab868f1a9..7aad580ee 100644 --- a/installer/scripts/tomlparser-prom-customconfig.rb +++ b/installer/scripts/tomlparser-prom-customconfig.rb @@ -47,7 +47,7 @@ def parseConfigMap end def checkForTypeArray(arrayValue, arrayType) - if (arrayValue.nil? || (arrayValue.kind_of?(Array) && arrayValue.length > 0 && arrayValue[0].kind_of?(arrayType))) + if (arrayValue.nil? || (arrayValue.kind_of?(Array) && ((arrayValue.length == 0) || (arrayValue.length > 0 && arrayValue[0].kind_of?(arrayType))))) return true else return false diff --git a/source/code/go/src/plugins/oms.go b/source/code/go/src/plugins/oms.go index 6d78455bd..01aab85b4 100644 --- a/source/code/go/src/plugins/oms.go +++ b/source/code/go/src/plugins/oms.go @@ -184,11 +184,11 @@ type laKubeMonAgentEvents struct { } type KubeMonAgentEventTags struct { - PodName string - ContainerId string - FirstOccurance string - LastOccurance string - Count int + PodName string + ContainerId string + FirstOccurrence string + LastOccurrence string + Count int } type KubeMonAgentEventBlob struct { @@ -259,7 +259,14 @@ func updateContainerImageNameMaps() { } for _, pod := range pods.Items { - for _, status := range pod.Status.ContainerStatuses { + podContainerStatuses := pod.Status.ContainerStatuses + + // Doing this to include init container logs as well + podInitContainerStatuses := pod.Status.InitContainerStatuses + if (podInitContainerStatuses != nil) && (len(podInitContainerStatuses) > 0) { + podContainerStatuses = append(podContainerStatuses, podInitContainerStatuses...) 
+ } + for _, status := range podContainerStatuses { lastSlashIndex := strings.LastIndex(status.ContainerID, "/") containerID := status.ContainerID[lastSlashIndex+1 : len(status.ContainerID)] image := status.Image @@ -344,22 +351,22 @@ func populateKubeMonAgentEventHash(record map[interface{}]interface{}, errType K if val, ok := ConfigErrorEvent[logRecordString]; ok { Log("In config error existing hash update\n") eventCount := val.Count - eventFirstOccurance := val.FirstOccurance + eventFirstOccurrence := val.FirstOccurrence ConfigErrorEvent[logRecordString] = KubeMonAgentEventTags{ - PodName: podName, - ContainerId: containerID, - FirstOccurance: eventFirstOccurance, - LastOccurance: eventTimeStamp, - Count: eventCount + 1, + PodName: podName, + ContainerId: containerID, + FirstOccurrence: eventFirstOccurrence, + LastOccurrence: eventTimeStamp, + Count: eventCount + 1, } } else { ConfigErrorEvent[logRecordString] = KubeMonAgentEventTags{ - PodName: podName, - ContainerId: containerID, - FirstOccurance: eventTimeStamp, - LastOccurance: eventTimeStamp, - Count: 1, + PodName: podName, + ContainerId: containerID, + FirstOccurrence: eventTimeStamp, + LastOccurrence: eventTimeStamp, + Count: 1, } } @@ -374,22 +381,22 @@ func populateKubeMonAgentEventHash(record map[interface{}]interface{}, errType K if val, ok := PromScrapeErrorEvent[splitString]; ok { Log("In config error existing hash update\n") eventCount := val.Count - eventFirstOccurance := val.FirstOccurance + eventFirstOccurrence := val.FirstOccurrence PromScrapeErrorEvent[splitString] = KubeMonAgentEventTags{ - PodName: podName, - ContainerId: containerID, - FirstOccurance: eventFirstOccurance, - LastOccurance: eventTimeStamp, - Count: eventCount + 1, + PodName: podName, + ContainerId: containerID, + FirstOccurrence: eventFirstOccurrence, + LastOccurrence: eventTimeStamp, + Count: eventCount + 1, } } else { PromScrapeErrorEvent[splitString] = KubeMonAgentEventTags{ - PodName: podName, - ContainerId: containerID, 
- FirstOccurance: eventTimeStamp, - LastOccurance: eventTimeStamp, - Count: 1, + PodName: podName, + ContainerId: containerID, + FirstOccurrence: eventTimeStamp, + LastOccurrence: eventTimeStamp, + Count: 1, } } } @@ -756,16 +763,18 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { FlushedRecordsSize += float64(len(stringMap["LogEntry"])) dataItems = append(dataItems, dataItem) - loggedTime, e := time.Parse(time.RFC3339, dataItem.LogEntryTimeStamp) - if e != nil { - message := fmt.Sprintf("Error while converting LogEntryTimeStamp for telemetry purposes: %s", e.Error()) - Log(message) - SendException(message) - } else { - ltncy := float64(start.Sub(loggedTime) / time.Millisecond) - if ltncy >= maxLatency { - maxLatency = ltncy - maxLatencyContainer = dataItem.Name + "=" + dataItem.ID + if dataItem.LogEntryTimeStamp != "" { + loggedTime, e := time.Parse(time.RFC3339, dataItem.LogEntryTimeStamp) + if e != nil { + message := fmt.Sprintf("Error while converting LogEntryTimeStamp for telemetry purposes: %s", e.Error()) + Log(message) + SendException(message) + } else { + ltncy := float64(start.Sub(loggedTime) / time.Millisecond) + if ltncy >= maxLatency { + maxLatency = ltncy + maxLatencyContainer = dataItem.Name + "=" + dataItem.ID + } } } } diff --git a/source/code/plugin/KubernetesApiClient.rb b/source/code/plugin/KubernetesApiClient.rb index 48b25bf14..be1a51791 100644 --- a/source/code/plugin/KubernetesApiClient.rb +++ b/source/code/plugin/KubernetesApiClient.rb @@ -356,9 +356,19 @@ def getContainerResourceRequestsAndLimits(metricJSON, metricCategory, metricName else podUid = pod["metadata"]["uid"] end - if (!pod["spec"]["containers"].nil? && !pod["spec"]["nodeName"].nil?) + + podContainers = [] + if !pod["spec"]["containers"].nil? && !pod["spec"]["containers"].empty? + podContainers = podContainers + pod["spec"]["containers"] + end + # Adding init containers to the record list as well. + if !pod["spec"]["initContainers"].nil? 
&& !pod["spec"]["initContainers"].empty? + podContainers = podContainers + pod["spec"]["initContainers"] + end + + if (!podContainers.nil? && !podContainers.empty? && !pod["spec"]["nodeName"].nil?) nodeName = pod["spec"]["nodeName"] - pod["spec"]["containers"].each do |container| + podContainers.each do |container| containerName = container["name"] metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z if (!container["resources"].nil? && !container["resources"].empty? && !container["resources"][metricCategory].nil? && !container["resources"][metricCategory][metricNameToCollect].nil?) diff --git a/source/code/plugin/in_kube_events.rb b/source/code/plugin/in_kube_events.rb index 5538ba4aa..e1fdc5df6 100644 --- a/source/code/plugin/in_kube_events.rb +++ b/source/code/plugin/in_kube_events.rb @@ -47,17 +47,20 @@ def enumerate(eventList = nil) currentTime = Time.now emitTime = currentTime.to_f batchTime = currentTime.utc.iso8601 - if eventList.nil? - $log.info("in_kube_events::enumerate : Getting events from Kube API @ #{Time.now.utc.iso8601}") - events = JSON.parse(KubernetesApiClient.getKubeResourceInfo("events").body) - $log.info("in_kube_events::enumerate : Done getting events from Kube API @ #{Time.now.utc.iso8601}") - else - events = eventList + + events = eventList + $log.info("in_kube_events::enumerate : Getting events from Kube API @ #{Time.now.utc.iso8601}") + eventInfo = KubernetesApiClient.getKubeResourceInfo("events") + $log.info("in_kube_events::enumerate : Done getting events from Kube API @ #{Time.now.utc.iso8601}") + + if !eventInfo.nil? + events = JSON.parse(eventInfo.body) end + eventQueryState = getEventQueryState newEventQueryState = [] begin - if (!events.empty? && !events["items"].nil?) + if (!events.nil? && !events.empty? && !events["items"].nil?) 
eventStream = MultiEventStream.new events["items"].each do |items| record = {} diff --git a/source/code/plugin/in_kube_nodes.rb b/source/code/plugin/in_kube_nodes.rb index 42bc13b68..0a0fd9d2e 100644 --- a/source/code/plugin/in_kube_nodes.rb +++ b/source/code/plugin/in_kube_nodes.rb @@ -61,11 +61,19 @@ def enumerate emitTime = currentTime.to_f batchTime = currentTime.utc.iso8601 telemetrySent = false + + nodeInventory = nil + $log.info("in_kube_nodes::enumerate : Getting nodes from Kube API @ #{Time.now.utc.iso8601}") - nodeInventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo("nodes").body) + nodeInfo = KubernetesApiClient.getKubeResourceInfo("nodes") $log.info("in_kube_nodes::enumerate : Done getting nodes from Kube API @ #{Time.now.utc.iso8601}") + + if !nodeInfo.nil? + nodeInventory = JSON.parse(nodeInfo.body) + end + begin - if (!nodeInventory.empty?) + if (!nodeInventory.nil? && !nodeInventory.empty?) eventStream = MultiEventStream.new containerNodeInventoryEventStream = MultiEventStream.new if !nodeInventory["items"].nil? @@ -95,7 +103,6 @@ def enumerate record["KubernetesProviderID"] = "onprem" end - # Refer to https://kubernetes.io/docs/concepts/architecture/nodes/#condition for possible node conditions. # We check the status of each condition e.g. {"type": "OutOfDisk","status": "False"} . Based on this we # populate the KubeNodeInventory Status field. A possible value for this field could be "Ready OutofDisk" diff --git a/source/code/plugin/in_kube_podinventory.rb b/source/code/plugin/in_kube_podinventory.rb index f41ce9095..766831a66 100644 --- a/source/code/plugin/in_kube_podinventory.rb +++ b/source/code/plugin/in_kube_podinventory.rb @@ -48,13 +48,15 @@ def shutdown end def enumerate(podList = nil) - if podList.nil? 
- $log.info("in_kube_podinventory::enumerate : Getting pods from Kube API @ #{Time.now.utc.iso8601}") - podInventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo("pods").body) - $log.info("in_kube_podinventory::enumerate : Done getting pods from Kube API @ #{Time.now.utc.iso8601}") - else - podInventory = podList + podInventory = podList + $log.info("in_kube_podinventory::enumerate : Getting pods from Kube API @ #{Time.now.utc.iso8601}") + podInfo = KubernetesApiClient.getKubeResourceInfo("pods") + $log.info("in_kube_podinventory::enumerate : Done getting pods from Kube API @ #{Time.now.utc.iso8601}") + + if !podInfo.nil? + podInventory = JSON.parse(podInfo.body) end + begin if (!podInventory.empty? && podInventory.key?("items") && !podInventory["items"].empty?) #get pod inventory & services @@ -137,8 +139,16 @@ def getContainerEnvironmentVariables(pod, clusterCollectEnvironmentVar) begin podSpec = pod["spec"] containerEnvHash = {} - if !podSpec.nil? && !podSpec["containers"].nil? - podSpec["containers"].each do |container| + podContainersEnv = [] + if !podSpec["containers"].nil? && !podSpec["containers"].empty? + podContainersEnv = podContainersEnv + podSpec["containers"] + end + # Adding init containers to the record list as well. + if !podSpec["initContainers"].nil? && !podSpec["initContainers"].empty? + podContainersEnv = podContainersEnv + podSpec["initContainers"] + end + if !podContainersEnv.nil? && !podContainersEnv.empty? + podContainersEnv.each do |container| if !clusterCollectEnvironmentVar.nil? && !clusterCollectEnvironmentVar.empty? && clusterCollectEnvironmentVar.casecmp("false") == 0 containerEnvHash[container["name"]] = ["AZMON_CLUSTER_COLLECT_ENV_VAR=FALSE"] else @@ -289,8 +299,19 @@ def parse_and_emit_records(podInventory, serviceList) end podRestartCount = 0 record["PodRestartCount"] = 0 - if items["status"].key?("containerStatuses") && !items["status"]["containerStatuses"].empty? 
#container status block start - items["status"]["containerStatuses"].each do |container| + + podContainers = [] + if items["status"].key?("containerStatuses") && !items["status"]["containerStatuses"].empty? + podContainers = podContainers + items["status"]["containerStatuses"] + end + # Adding init containers to the record list as well. + if items["status"].key?("initContainerStatuses") && !items["status"]["initContainerStatuses"].empty? + podContainers = podContainers + items["status"]["initContainerStatuses"] + end + + # if items["status"].key?("containerStatuses") && !items["status"]["containerStatuses"].empty? #container status block start + if !podContainers.empty? #container status block start + podContainers.each do |container| containerRestartCount = 0 #container Id is of the form #docker://dfd9da983f1fd27432fb2c1fe3049c0a1d25b1c697b2dc1a530c986e58b16527 diff --git a/source/code/plugin/in_kube_services.rb b/source/code/plugin/in_kube_services.rb index 8b0a013e4..7cd703620 100644 --- a/source/code/plugin/in_kube_services.rb +++ b/source/code/plugin/in_kube_services.rb @@ -46,11 +46,19 @@ def enumerate currentTime = Time.now emitTime = currentTime.to_f batchTime = currentTime.utc.iso8601 + + serviceList = nil + $log.info("in_kube_services::enumerate : Getting services from Kube API @ #{Time.now.utc.iso8601}") - serviceList = JSON.parse(KubernetesApiClient.getKubeResourceInfo("services").body) + serviceInfo = KubernetesApiClient.getKubeResourceInfo("services") $log.info("in_kube_services::enumerate : Done getting services from Kube API @ #{Time.now.utc.iso8601}") + + if !serviceInfo.nil? + serviceList = JSON.parse(serviceInfo.body) + end + begin - if (!serviceList.empty?) + if (!serviceList.nil? && !serviceList.empty?) 
eventStream = MultiEventStream.new serviceList["items"].each do |items| record = {} From 3079471a69f9d704e6de55200857ac5489866fc2 Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Mon, 7 Oct 2019 11:33:57 -0700 Subject: [PATCH 127/160] Send agg monitor signal on details change (#270) send when an agg monitor details change, but state did not change --- .../plugin/filter_health_model_builder.rb | 3 ++- .../plugin/health/health_monitor_state.rb | 25 ++++++++++++++++--- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/source/code/plugin/filter_health_model_builder.rb b/source/code/plugin/filter_health_model_builder.rb index 5aa7f610e..fa92038e6 100644 --- a/source/code/plugin/filter_health_model_builder.rb +++ b/source/code/plugin/filter_health_model_builder.rb @@ -184,7 +184,8 @@ def filter_stream(tag, es) all_monitors.each{|monitor_instance_id, monitor| if monitor.is_aggregate_monitor @state.update_state(monitor, - @provider.get_config(monitor.monitor_id) + @provider.get_config(monitor.monitor_id), + true ) end diff --git a/source/code/plugin/health/health_monitor_state.rb b/source/code/plugin/health/health_monitor_state.rb index 498c75ec7..7eb674f1e 100644 --- a/source/code/plugin/health/health_monitor_state.rb +++ b/source/code/plugin/health/health_monitor_state.rb @@ -57,10 +57,11 @@ def initialize_state(deserialized_state) 2. if there is a "consistent" state change for monitors 3. if the signal is stale (> 4hrs) 4. If the latest state is none +5. 
If an aggregate monitor has a change in its details, but no change in state =end def update_state(monitor, #UnitMonitor/AggregateMonitor - monitor_config #Hash - ) + monitor_config, #Hash + is_aggregate_monitor = false) samples_to_keep = 1 monitor_instance_id = monitor.monitor_instance_id log = HealthMonitorHelpers.get_log_handle @@ -76,12 +77,13 @@ def update_state(monitor, #UnitMonitor/AggregateMonitor samples_to_keep = monitor_config['ConsecutiveSamplesForStateTransition'].to_i end + deleted_record = {} if @@monitor_states.key?(monitor_instance_id) health_monitor_instance_state = @@monitor_states[monitor_instance_id] health_monitor_records = health_monitor_instance_state.prev_records #This should be an array if health_monitor_records.size == samples_to_keep - health_monitor_records.delete_at(0) + deleted_record = health_monitor_records.delete_at(0) end health_monitor_records.push(monitor.details) health_monitor_instance_state.prev_records = health_monitor_records @@ -106,7 +108,6 @@ def update_state(monitor, #UnitMonitor/AggregateMonitor @@monitor_states[monitor_instance_id] = health_monitor_instance_state end - # update old and new state based on the history and latest record. # TODO: this is a little hairy. 
Simplify @@ -142,6 +143,10 @@ def update_state(monitor, #UnitMonitor/AggregateMonitor @@first_record_sent[monitor_instance_id] = true health_monitor_instance_state.should_send = true set_state(monitor_instance_id, health_monitor_instance_state) + elsif agg_monitor_details_changed?(is_aggregate_monitor, deleted_record, health_monitor_instance_state.prev_records[0]) + health_monitor_instance_state.should_send = true + set_state(monitor_instance_id, health_monitor_instance_state) + log.debug "#{monitor_instance_id} condition: agg monitor details changed should_send #{health_monitor_instance_state.should_send}" end # latest state is different that last sent state else @@ -212,5 +217,17 @@ def is_state_change_consistent(health_monitor_records, samples_to_check) end return true end + + def agg_monitor_details_changed?(is_aggregate_monitor, last_sent_details, latest_details) + log = HealthMonitorHelpers.get_log_handle + if !is_aggregate_monitor + return false + end + if latest_details['details'] != last_sent_details['details'] + log.info "Last Sent Details #{JSON.pretty_generate(last_sent_details)} \n Latest Details: #{JSON.pretty_generate(latest_details)}" + return true + end + return false + end end end \ No newline at end of file From de2e1da41dcd08c3407569bf41c53f5281b46331 Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Wed, 9 Oct 2019 17:04:16 -0700 Subject: [PATCH 128/160] bug fixes for error (#274) --- source/code/go/src/plugins/oms.go | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/source/code/go/src/plugins/oms.go b/source/code/go/src/plugins/oms.go index 01aab85b4..b68c471a1 100644 --- a/source/code/go/src/plugins/oms.go +++ b/source/code/go/src/plugins/oms.go @@ -413,7 +413,6 @@ func flushKubeMonAgentEventRecords() { Log("In flushConfigErrorRecords\n") start := time.Now() var resp *http.Response - var postError error var elapsed time.Duration var laKubeMonAgentEventsRecords []laKubeMonAgentEvents telemetryDimensions := 
make(map[string]string) @@ -518,10 +517,10 @@ func flushKubeMonAgentEventRecords() { req.Header.Set("x-ms-AzureResourceId", ResourceID) } - resp, postError = HTTPClient.Do(req) + resp, err = HTTPClient.Do(req) elapsed = time.Since(start) - if postError != nil { + if err != nil { message := fmt.Sprintf("Error when sending kubemonagentevent request %s \n", err.Error()) Log(message) Log("Failed to flush %d records after %s", len(laKubeMonAgentEventsRecords), elapsed) @@ -532,7 +531,7 @@ func flushKubeMonAgentEventRecords() { Log("Failed to flush %d records after %s", len(laKubeMonAgentEventsRecords), elapsed) } else { numRecords := len(laKubeMonAgentEventsRecords) - Log("Successfully flushed %d records in %s", numRecords, elapsed) + Log("FlushKubeMonAgentEventRecords::Info::Successfully flushed %d records in %s", numRecords, elapsed) // Send telemetry to AppInsights resource SendEvent(KubeMonAgentEventsFlushedEvent, telemetryDimensions) @@ -822,7 +821,7 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { defer resp.Body.Close() numRecords := len(dataItems) - Log("Successfully flushed %d records in %s", numRecords, elapsed) + Log("PostDataHelper::Info::Successfully flushed %d records in %s", numRecords, elapsed) ContainerLogTelemetryMutex.Lock() FlushedRecordsCount += float64(numRecords) FlushedRecordsTimeTaken += float64(elapsed / time.Millisecond) From e4b91c51dff06558a9b048669f07462d0df47d88 Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Wed, 9 Oct 2019 17:39:54 -0700 Subject: [PATCH 129/160] Fix to use declaration and assignment instead of assignment (#275) * bug fixes for error * adding declaration to assignment --- source/code/go/src/plugins/oms.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/code/go/src/plugins/oms.go b/source/code/go/src/plugins/oms.go index b68c471a1..123aea197 100644 --- a/source/code/go/src/plugins/oms.go +++ b/source/code/go/src/plugins/oms.go @@ -517,7 +517,7 @@ func 
flushKubeMonAgentEventRecords() { req.Header.Set("x-ms-AzureResourceId", ResourceID) } - resp, err = HTTPClient.Do(req) + resp, err := HTTPClient.Do(req) elapsed = time.Since(start) if err != nil { From cf5e85ccf7e841bf79117beb15608d0a5f8e533b Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Wed, 9 Oct 2019 18:01:40 -0700 Subject: [PATCH 130/160] 1. Added telemetry (#277) 2. Configuration property changes 3. Bug fixes for a. unscheduled pods returning green 3b. Sometimes, the details hash of agg monitors are different because the order of elements inside the array is different, causing the records to be sent --- installer/conf/healthmonitorconfig.json | 40 ++++++------- installer/datafiles/base_container.data | 1 + .../plugin/filter_health_model_builder.rb | 14 +++-- .../health_container_cpu_memory_aggregator.rb | 4 +- .../plugin/health/health_monitor_helpers.rb | 2 - .../plugin/health/health_monitor_state.rb | 42 ++++++++++++-- .../plugin/health/health_monitor_telemetry.rb | 57 +++++++++++++++++++ .../plugin/health/health_monitor_utils.rb | 11 ++-- .../health/health_monitor_state_spec.rb | 12 ++-- 9 files changed, 134 insertions(+), 49 deletions(-) create mode 100644 source/code/plugin/health/health_monitor_telemetry.rb diff --git a/installer/conf/healthmonitorconfig.json b/installer/conf/healthmonitorconfig.json index ea6b23856..e4019fe73 100644 --- a/installer/conf/healthmonitorconfig.json +++ b/installer/conf/healthmonitorconfig.json @@ -1,40 +1,34 @@ { "node_cpu_utilization": { - "WarnThresholdPercentage": 80.0, - "FailThresholdPercentage": 90.0, - "ConsecutiveSamplesForStateTransition": 3, - "Operator": ">" + "WarnIfGreaterThanPercentage": 80.0, + "FailIfGreaterThanPercentage": 90.0, + "ConsecutiveSamplesForStateTransition": 3 }, "node_memory_utilization": { - "WarnThresholdPercentage": 80.0, - "FailThresholdPercentage": 90.0, - "ConsecutiveSamplesForStateTransition": 3, - "Operator": ">" + "WarnIfGreaterThanPercentage": 80.0, + 
"FailIfGreaterThanPercentage": 90.0, + "ConsecutiveSamplesForStateTransition": 3 }, "container_cpu_utilization": { - "WarnThresholdPercentage": 80.0, - "FailThresholdPercentage": 90.0, + "WarnIfGreaterThanPercentage": 80.0, + "FailIfGreaterThanPercentage": 90.0, "StateThresholdPercentage": 90.0, - "ConsecutiveSamplesForStateTransition": 3, - "Operator": ">" + "ConsecutiveSamplesForStateTransition": 3 }, "container_memory_utilization": { - "WarnThresholdPercentage": 80.0, - "FailThresholdPercentage": 90.0, + "WarnIfGreaterThanPercentage": 80.0, + "FailIfGreaterThanPercentage": 90.0, "StateThresholdPercentage": 90.0, - "ConsecutiveSamplesForStateTransition": 3, - "Operator": ">" + "ConsecutiveSamplesForStateTransition": 3 }, "user_workload_pods_ready": { - "WarnThresholdPercentage": 100.0, - "FailThresholdPercentage": 90.0, - "ConsecutiveSamplesForStateTransition": 2, - "Operator": "<" + "WarnIfLesserThanPercentage": 100.0, + "FailIfLesserThanPercentage": 90.0, + "ConsecutiveSamplesForStateTransition": 2 }, "system_workload_pods_ready": { - "FailThresholdPercentage": 100.0, - "ConsecutiveSamplesForStateTransition": 2, - "Operator": "<" + "FailIfLesserThanPercentage": 100.0, + "ConsecutiveSamplesForStateTransition": 2 }, "node_condition": { "NodeConditionTypesForFailedState": "outofdisk,networkunavailable" diff --git a/installer/datafiles/base_container.data b/installer/datafiles/base_container.data index 981f51f4c..4ebc4f338 100644 --- a/installer/datafiles/base_container.data +++ b/installer/datafiles/base_container.data @@ -147,6 +147,7 @@ MAINTAINER: 'Microsoft Corporation' /opt/microsoft/omsagent/plugin/health/health_monitor_provider.rb; source/code/plugin/health/health_monitor_provider.rb; 644; root; root /opt/microsoft/omsagent/plugin/health/health_monitor_record.rb; source/code/plugin/health/health_monitor_record.rb; 644; root; root /opt/microsoft/omsagent/plugin/health/health_monitor_state.rb; source/code/plugin/health/health_monitor_state.rb; 644; root; root 
+/opt/microsoft/omsagent/plugin/health/health_monitor_telemetry.rb; source/code/plugin/health/health_monitor_telemetry.rb; 644; root; root /opt/microsoft/omsagent/plugin/health/health_monitor_helpers.rb; source/code/plugin/health/health_monitor_helpers.rb; 644; root; root /opt/microsoft/omsagent/plugin/health/health_monitor_utils.rb; source/code/plugin/health/health_monitor_utils.rb; 644; root; root /opt/microsoft/omsagent/plugin/health/health_signal_reducer.rb; source/code/plugin/health/health_signal_reducer.rb; 644; root; root diff --git a/source/code/plugin/filter_health_model_builder.rb b/source/code/plugin/filter_health_model_builder.rb index fa92038e6..afb514a73 100644 --- a/source/code/plugin/filter_health_model_builder.rb +++ b/source/code/plugin/filter_health_model_builder.rb @@ -16,7 +16,7 @@ class FilterHealthModelBuilder < Filter config_param :model_definition_path, :default => '/etc/opt/microsoft/docker-cimprov/health/health_model_definition.json' config_param :health_monitor_config_path, :default => '/etc/opt/microsoft/docker-cimprov/health/healthmonitorconfig.json' config_param :health_state_serialized_path, :default => '/mnt/azure/health_model_state.json' - attr_reader :buffer, :model_builder, :health_model_definition, :monitor_factory, :state_finalizers, :monitor_set, :model_builder, :hierarchy_builder, :resources, :kube_api_down_handler, :provider, :reducer, :state, :generator + attr_reader :buffer, :model_builder, :health_model_definition, :monitor_factory, :state_finalizers, :monitor_set, :model_builder, :hierarchy_builder, :resources, :kube_api_down_handler, :provider, :reducer, :state, :generator, :telemetry include HealthModel @@rewrite_tag = 'kubehealth.Signals' @@ -49,6 +49,7 @@ def initialize @cluster_old_state = 'none' @cluster_new_state = 'none' @container_cpu_memory_records = [] + @telemetry = HealthMonitorTelemetry.new rescue => e ApplicationInsightsUtility.sendExceptionTelemetry(e, {"FeatureArea" => "Health"}) end @@ -142,7 +143,9 @@ 
def filter_stream(tag, es) reduced_records = @reducer.reduce_signals(health_monitor_records, @resources) reduced_records.each{|record| @state.update_state(record, - @provider.get_config(record.monitor_id) + @provider.get_config(record.monitor_id), + false, + @telemetry ) # get the health state based on the monitor's operational state # update state calls updates the state of the monitor based on configuration and history of the the monitor records @@ -160,7 +163,7 @@ def filter_stream(tag, es) #update state for missing signals missing_signals.each{|signal| - @state.update_state(signal, @provider.get_config(signal.monitor_id)) + @state.update_state(signal, @provider.get_config(signal.monitor_id), false, @telemetry) @log.info "After Updating #{@state.get_state(signal.monitor_instance_id)} #{@state.get_state(signal.monitor_instance_id).new_state}" # for unknown/none records, update the "monitor state" to be the latest state (new_state) of the monitor instance from the state signal.state = @state.get_state(signal.monitor_instance_id).new_state @@ -185,7 +188,8 @@ def filter_stream(tag, es) if monitor.is_aggregate_monitor @state.update_state(monitor, @provider.get_config(monitor.monitor_id), - true + true, + @telemetry ) end @@ -242,7 +246,7 @@ def filter_stream(tag, es) #update cluster state custom resource @cluster_health_state.update_state(@state.to_h) - + @telemetry.send # return an empty event stream, else the match will throw a NoMethodError return MultiEventStream.new elsif tag.start_with?("kubehealth.Signals") diff --git a/source/code/plugin/health/health_container_cpu_memory_aggregator.rb b/source/code/plugin/health/health_container_cpu_memory_aggregator.rb index e98c288b3..6de146e3d 100644 --- a/source/code/plugin/health/health_container_cpu_memory_aggregator.rb +++ b/source/code/plugin/health/health_container_cpu_memory_aggregator.rb @@ -246,9 +246,9 @@ def calculate_monitor_state(v, config) def calculate_container_instance_state(counter_value, limit, config) 
percent_value = counter_value * 100 / limit - if percent_value > config['FailThresholdPercentage'] + if percent_value > config['FailIfGreaterThanPercentage'] return HealthMonitorStates::FAIL - elsif percent_value > config['WarnThresholdPercentage'] + elsif percent_value > config['WarnIfGreaterThanPercentage'] return HealthMonitorStates::WARN else return HealthMonitorStates::PASS diff --git a/source/code/plugin/health/health_monitor_helpers.rb b/source/code/plugin/health/health_monitor_helpers.rb index 4efd4c608..f784ae76e 100644 --- a/source/code/plugin/health/health_monitor_helpers.rb +++ b/source/code/plugin/health/health_monitor_helpers.rb @@ -43,11 +43,9 @@ def add_agentpool_node_label_if_not_present(records) if labels_keys.include?(HealthMonitorLabels::AGENTPOOL) @log.info "#{record.monitor_id} includes agentpool label. Value = #{record.labels[HealthMonitorLabels::AGENTPOOL]}" - @log.info "Labels present = #{labels_keys}" next else #@log.info "#{record} does not include agentpool label." 
- @log.info "Labels present = #{labels_keys}" role_name = 'unknown' if record.labels.include?(HealthMonitorLabels::ROLE) role_name = record.labels[HealthMonitorLabels::ROLE] diff --git a/source/code/plugin/health/health_monitor_state.rb b/source/code/plugin/health/health_monitor_state.rb index 7eb674f1e..cac66f26b 100644 --- a/source/code/plugin/health/health_monitor_state.rb +++ b/source/code/plugin/health/health_monitor_state.rb @@ -16,6 +16,7 @@ def initialize @@monitor_states = {} @@first_record_sent = {} @@health_signal_timeout = 240 + end def get_state(monitor_instance_id) @@ -46,7 +47,6 @@ def initialize_state(deserialized_state) state.should_send = health_monitor_instance_state_hash["should_send"] @@monitor_states[k] = state @@first_record_sent[k] = true - } end @@ -61,8 +61,11 @@ def initialize_state(deserialized_state) =end def update_state(monitor, #UnitMonitor/AggregateMonitor monitor_config, #Hash - is_aggregate_monitor = false) + is_aggregate_monitor = false, + telemetry = nil + ) samples_to_keep = 1 + monitor_id = monitor.monitor_id monitor_instance_id = monitor.monitor_instance_id log = HealthMonitorHelpers.get_log_handle current_time = Time.now.utc.iso8601 @@ -157,6 +160,11 @@ def update_state(monitor, #UnitMonitor/AggregateMonitor health_monitor_instance_state.state_change_time = current_time health_monitor_instance_state.prev_sent_record_time = current_time health_monitor_instance_state.should_send = true + if !is_aggregate_monitor + if !telemetry.nil? 
+ telemetry.add_monitor_to_telemetry(monitor_id, health_monitor_instance_state.old_state, health_monitor_instance_state.new_state) + end + end if !@@first_record_sent.key?(monitor_instance_id) @@first_record_sent[monitor_instance_id] = true end @@ -170,6 +178,11 @@ def update_state(monitor, #UnitMonitor/AggregateMonitor health_monitor_instance_state.state_change_time = current_time health_monitor_instance_state.prev_sent_record_time = current_time health_monitor_instance_state.should_send = true + if !is_aggregate_monitor + if !telemetry.nil? + telemetry.add_monitor_to_telemetry(monitor_id, health_monitor_instance_state.old_state, health_monitor_instance_state.new_state) + end + end if !@@first_record_sent.key?(monitor_instance_id) @@first_record_sent[monitor_instance_id] = true end @@ -190,6 +203,11 @@ def update_state(monitor, #UnitMonitor/AggregateMonitor health_monitor_instance_state.new_state = latest_record_state health_monitor_instance_state.prev_sent_record_time = current_time health_monitor_instance_state.state_change_time = current_time + if !is_aggregate_monitor + if !telemetry.nil? + telemetry.add_monitor_to_telemetry(monitor_id, health_monitor_instance_state.old_state, health_monitor_instance_state.new_state) + end + end set_state(monitor_instance_id, health_monitor_instance_state) @@ -223,10 +241,22 @@ def agg_monitor_details_changed?(is_aggregate_monitor, last_sent_details, latest if !is_aggregate_monitor return false end - if latest_details['details'] != last_sent_details['details'] - log.info "Last Sent Details #{JSON.pretty_generate(last_sent_details)} \n Latest Details: #{JSON.pretty_generate(latest_details)}" - return true - end + # Do a deep comparison of the keys under details, since a shallow comparison is hit or miss. 
+ # Actual bug was the array inside the keys were in random order and the previous equality comparison was failing + latest_details['details'].keys.each{|k| + if !last_sent_details['details'].key?(k) + return true + end + if latest_details['details'][k].size != last_sent_details['details'][k].size + return true + end + } + # Explanation: a = [1,2] b = [2,1] a & b = [1,2] , c = [2,3] d = [2] c & d = [2] c.size != (c&d).size + latest_details['details'].keys.each{|k| + if !(latest_details['details'][k].size == (last_sent_details['details'][k] & latest_details['details'][k]).size) + return true + end + } return false end end diff --git a/source/code/plugin/health/health_monitor_telemetry.rb b/source/code/plugin/health/health_monitor_telemetry.rb new file mode 100644 index 000000000..df4b98ac8 --- /dev/null +++ b/source/code/plugin/health/health_monitor_telemetry.rb @@ -0,0 +1,57 @@ +require_relative 'health_model_constants' +require 'socket' +if Socket.gethostname.start_with?('omsagent-rs') + require_relative '../ApplicationInsightsUtility' +end + + +module HealthModel + class HealthMonitorTelemetry + + attr_reader :monitor_records, :last_sent_time + @@TELEMETRY_SEND_INTERVAL = 60 + + def initialize + @last_sent_time = Time.now + end + + def send + if Time.now > @last_sent_time + @@TELEMETRY_SEND_INTERVAL * 60 + log = HealthMonitorHelpers.get_log_handle + log.info "Sending #{@monitor_records.size} state change events" + if @monitor_records.size > 0 + hash_to_send = {} + @monitor_records.each{|k,v| + v.each{|k1,v1| + hash_to_send["#{k}-#{k1}"] = v1 + } + } + ApplicationInsightsUtility.sendCustomEvent("HealthMonitorStateChangeEvent", hash_to_send) + end + @monitor_records = {} + @last_sent_time = Time.now + end + end + + def add_monitor_to_telemetry(monitor_id, old_state, new_state) + if @monitor_records.nil? || @monitor_records.empty? 
+ @monitor_records = {} + end + if @monitor_records.key?(monitor_id) + monitor_hash = @monitor_records[monitor_id] + if monitor_hash.key?("#{old_state}-#{new_state}") + count = monitor_hash["#{old_state}-#{new_state}"] + count = count + 1 + monitor_hash["#{old_state}-#{new_state}"] = count + else + monitor_hash["#{old_state}-#{new_state}"] = 1 + end + @monitor_records[monitor_id] = monitor_hash + else + monitor_hash = {} + monitor_hash["#{old_state}-#{new_state}"] = 1 + @monitor_records[monitor_id] = monitor_hash + end + end + end +end \ No newline at end of file diff --git a/source/code/plugin/health/health_monitor_utils.rb b/source/code/plugin/health/health_monitor_utils.rb index 27e9b9a6e..e21fdc83d 100644 --- a/source/code/plugin/health/health_monitor_utils.rb +++ b/source/code/plugin/health/health_monitor_utils.rb @@ -27,16 +27,17 @@ class HealthMonitorUtils class << self # compute the percentage state given a value and a monitor configuration + #TODO : Add Unit Tests for this method def compute_percentage_state(value, config) - - if config.nil? || config['WarnThresholdPercentage'].nil? + if config.nil? || ( config['WarnIfGreaterThanPercentage'].nil? && config['WarnIfLesserThanPercentage'].nil? ) warn_percentage = nil else - warn_percentage = config['WarnThresholdPercentage'].to_f + warn_percentage = !config['WarnIfGreaterThanPercentage'].nil? ? config['WarnIfGreaterThanPercentage'].to_f : config['WarnIfLesserThanPercentage'].to_f end - fail_percentage = config['FailThresholdPercentage'].to_f + fail_percentage = !config['FailIfGreaterThanPercentage'].nil? ? config['FailIfGreaterThanPercentage'].to_f : config['FailIfLesserThanPercentage'].to_f + is_less_than_comparer = config['FailIfGreaterThanPercentage'].nil? ? true : false # Fail percentage config always present for percentage computation monitors - if !config.nil? && !config['Operator'].nil? && config['Operator'] == '<' + if !config.nil? 
&& is_less_than_comparer if value < fail_percentage return HealthMonitorStates::FAIL elsif !warn_percentage.nil? && value < warn_percentage diff --git a/test/code/plugin/health/health_monitor_state_spec.rb b/test/code/plugin/health/health_monitor_state_spec.rb index 5fa8a6c6e..3d13d4150 100644 --- a/test/code/plugin/health/health_monitor_state_spec.rb +++ b/test/code/plugin/health/health_monitor_state_spec.rb @@ -65,8 +65,8 @@ def mock_monitor.transition_date_time; Time.now.utc.iso8601; end def mock_monitor.details; {"state" => "pass", "timestamp" => Time.now.utc.iso8601, "details" => {}}; end config = JSON.parse('{ - "WarnThresholdPercentage": 80.0, - "FailThresholdPercentage": 90.0, + "WarnIfGreaterThanPercentage": 80.0, + "FailIfGreaterThanPercentage": 90.0, "ConsecutiveSamplesForStateTransition": 3 }') #act @@ -96,8 +96,8 @@ def mock_monitor.transition_date_time; Time.now.utc.iso8601; end def mock_monitor.details; {"state" => "pass", "timestamp" => Time.now.utc.iso8601, "details" => {}}; end config = JSON.parse('{ - "WarnThresholdPercentage": 80.0, - "FailThresholdPercentage": 90.0, + "WarnIfGreaterThanPercentage": 80.0, + "FailIfGreaterThanPercentage": 90.0, "ConsecutiveSamplesForStateTransition": 3 }') #act @@ -136,8 +136,8 @@ def mock_monitor.transition_date_time; Time.now.utc.iso8601; end def mock_monitor.details; {"state" => "pass", "timestamp" => Time.now.utc.iso8601, "details" => {}}; end config = JSON.parse('{ - "WarnThresholdPercentage": 80.0, - "FailThresholdPercentage": 90.0, + "WarnIfGreaterThanPercentage": 80.0, + "FailIfGreaterThanPercentage": 90.0, "ConsecutiveSamplesForStateTransition": 3 }') #act From e8529b2fc3ab96603c00eda162715bf8518b74b9 Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Thu, 10 Oct 2019 10:51:55 -0700 Subject: [PATCH 131/160] Bug fix to remove unused variable (#281) * bug fixes for error * adding declaration to assignment * removing unused variable --- source/code/go/src/plugins/oms.go | 1 - 1 file changed, 1 
deletion(-) diff --git a/source/code/go/src/plugins/oms.go b/source/code/go/src/plugins/oms.go index 123aea197..5a323d7e0 100644 --- a/source/code/go/src/plugins/oms.go +++ b/source/code/go/src/plugins/oms.go @@ -412,7 +412,6 @@ func flushKubeMonAgentEventRecords() { if skipKubeMonEventsFlush != true { Log("In flushConfigErrorRecords\n") start := time.Now() - var resp *http.Response var elapsed time.Duration var laKubeMonAgentEventsRecords []laKubeMonAgentEvents telemetryDimensions := make(map[string]string) From 8a4147d7a9eaddc2023152d84ccfe06b27034a6b Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Fri, 11 Oct 2019 15:33:32 -0700 Subject: [PATCH 132/160] Fix the WARN<->WARNING typo (#282) --- .../plugin/health/health_container_cpu_memory_aggregator.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/code/plugin/health/health_container_cpu_memory_aggregator.rb b/source/code/plugin/health/health_container_cpu_memory_aggregator.rb index 6de146e3d..ef1016158 100644 --- a/source/code/plugin/health/health_container_cpu_memory_aggregator.rb +++ b/source/code/plugin/health/health_container_cpu_memory_aggregator.rb @@ -249,7 +249,7 @@ def calculate_container_instance_state(counter_value, limit, config) if percent_value > config['FailIfGreaterThanPercentage'] return HealthMonitorStates::FAIL elsif percent_value > config['WarnIfGreaterThanPercentage'] - return HealthMonitorStates::WARN + return HealthMonitorStates::WARNING else return HealthMonitorStates::PASS end From 4780c3e17fc98ccba9381d60c8945ed7c90c6301 Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Sun, 13 Oct 2019 19:16:02 -0700 Subject: [PATCH 133/160] Bug Fixes 1. telemetry send throwing exception if records not initialized 2. permissions error in on-prem clusters (#284) * Bug fixes 1. 
not writeable, telemetry error * Change to state_WS_dir --- installer/conf/container.conf | 2 +- source/code/plugin/health/health_monitor_telemetry.rb | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/installer/conf/container.conf b/installer/conf/container.conf index 5f08043c7..f9540bde8 100755 --- a/installer/conf/container.conf +++ b/installer/conf/container.conf @@ -88,7 +88,7 @@ @type file - path /var/opt/microsoft/docker-cimprov/log/fluent_forward_failed.log + path %STATE_DIR_WS%/fluent_forward_failed.buffer diff --git a/source/code/plugin/health/health_monitor_telemetry.rb b/source/code/plugin/health/health_monitor_telemetry.rb index df4b98ac8..4e80a5145 100644 --- a/source/code/plugin/health/health_monitor_telemetry.rb +++ b/source/code/plugin/health/health_monitor_telemetry.rb @@ -13,6 +13,7 @@ class HealthMonitorTelemetry def initialize @last_sent_time = Time.now + @monitor_records = {} end def send From 981018cafd6bc336cb463472e8c15b740c57b7cc Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Thu, 17 Oct 2019 21:49:22 -0700 Subject: [PATCH 134/160] Fix Require relative revert (#287) --- source/code/plugin/health/agg_monitor_id_labels.rb | 2 ++ 1 file changed, 2 insertions(+) diff --git a/source/code/plugin/health/agg_monitor_id_labels.rb b/source/code/plugin/health/agg_monitor_id_labels.rb index bb016adb4..d5c724a86 100644 --- a/source/code/plugin/health/agg_monitor_id_labels.rb +++ b/source/code/plugin/health/agg_monitor_id_labels.rb @@ -1,3 +1,5 @@ +require_relative 'health_model_constants' + module HealthModel class AggregateMonitorInstanceIdLabels @@id_labels_mapping = { From edaa963477a1bdea67a508457ff3ac5340d3727f Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Fri, 1 Nov 2019 16:16:50 -0700 Subject: [PATCH 135/160] Bug Fixes for exceptions in telemetry, remove limit set check (#289) * Bug Fixes 10222019 * Initialize container_cpu_memory_records in fhmb * Added telemetry to investigate health exceptions * Set 
frozen_string_literal to true * Send event once per container when lookup is empty, or limit is an array * Unit Tests, Use RS and POD to determine workload * Fixed Node Condition Bug, added exception handling to return get_rs_owner_ref --- .../plugin/filter_health_model_builder.rb | 31 +- .../plugin/health/agg_monitor_id_labels.rb | 1 + .../aggregate_monitor_state_finalizer.rb | 2 + .../plugin/health/cluster_health_state.rb | 2 + .../health_container_cpu_memory_aggregator.rb | 71 +- ...h_container_cpu_memory_record_formatter.rb | 2 + .../plugin/health/health_hierarchy_builder.rb | 2 + .../health/health_kube_api_down_handler.rb | 2 + .../health/health_kubernetes_resources.rb | 90 +- .../health/health_missing_signal_generator.rb | 2 + .../code/plugin/health/health_model_buffer.rb | 2 + .../plugin/health/health_model_builder.rb | 1 + .../plugin/health/health_model_constants.rb | 1 + .../health/health_model_definition_parser.rb | 1 + .../plugin/health/health_monitor_helpers.rb | 1 + .../plugin/health/health_monitor_optimizer.rb | 1 + .../plugin/health/health_monitor_provider.rb | 1 + .../plugin/health/health_monitor_record.rb | 1 + .../plugin/health/health_monitor_state.rb | 1 + .../plugin/health/health_monitor_telemetry.rb | 1 + .../plugin/health/health_monitor_utils.rb | 73 +- .../plugin/health/health_signal_reducer.rb | 1 + source/code/plugin/health/monitor_factory.rb | 1 + .../plugin/health/parent_monitor_provider.rb | 1 + source/code/plugin/health/unit_monitor.rb | 1 + source/code/plugin/in_kube_health.rb | 22 +- source/code/plugin/out_health_forward.rb | 1 + ...th_container_cpu_memory_aggregator_spec.rb | 8 +- .../health/health_kubernetes_resource_spec.rb | 26 +- .../health/health_model_builder_test.rb | 977 +++++++++--------- 30 files changed, 680 insertions(+), 647 deletions(-) diff --git a/source/code/plugin/filter_health_model_builder.rb b/source/code/plugin/filter_health_model_builder.rb index afb514a73..47ce7a631 100644 --- 
a/source/code/plugin/filter_health_model_builder.rb +++ b/source/code/plugin/filter_health_model_builder.rb @@ -39,17 +39,16 @@ def initialize @kube_api_down_handler = HealthKubeApiDownHandler.new @resources = HealthKubernetesResources.instance @reducer = HealthSignalReducer.new - @state = HealthMonitorState.new @generator = HealthMissingSignalGenerator.new - #TODO: cluster_labels needs to be initialized @provider = HealthMonitorProvider.new(@@cluster_id, HealthMonitorUtils.get_cluster_labels, @resources, @health_monitor_config_path) - deserialized_state_info = @cluster_health_state.get_state - @state = HealthMonitorState.new - @state.initialize_state(deserialized_state_info) @cluster_old_state = 'none' @cluster_new_state = 'none' @container_cpu_memory_records = [] @telemetry = HealthMonitorTelemetry.new + @state = HealthMonitorState.new + # move network calls to the end. This will ensure all the instance variables get initialized + deserialized_state_info = @cluster_health_state.get_state + @state.initialize_state(deserialized_state_info) rescue => e ApplicationInsightsUtility.sendExceptionTelemetry(e, {"FeatureArea" => "Health"}) end @@ -99,6 +98,10 @@ def filter_stream(tag, es) end container_records_aggregator = HealthContainerCpuMemoryAggregator.new(@resources, @provider) deduped_records = container_records_aggregator.dedupe_records(container_records) + if @container_cpu_memory_records.nil? + @log.info "@container_cpu_memory_records was not initialized" + @container_cpu_memory_records = [] #in some clusters, this is null, so initialize it again. 
+ end @container_cpu_memory_records.push(*deduped_records) # push the records for aggregation later return MultiEventStream.new elsif tag.start_with?("kubehealth.ReplicaSet") @@ -106,14 +109,16 @@ def filter_stream(tag, es) es.each{|time, record| records.push(record) } - @buffer.add_to_buffer(records) - - container_records_aggregator = HealthContainerCpuMemoryAggregator.new(@resources, @provider) - container_records_aggregator.aggregate(@container_cpu_memory_records) - container_records_aggregator.compute_state - aggregated_container_records = container_records_aggregator.get_records - @buffer.add_to_buffer(aggregated_container_records) - + @buffer.add_to_buffer(records) # in_kube_health records + + aggregated_container_records = [] + if !@container_cpu_memory_records.nil? && !@container_cpu_memory_records.empty? + container_records_aggregator = HealthContainerCpuMemoryAggregator.new(@resources, @provider) + container_records_aggregator.aggregate(@container_cpu_memory_records) + container_records_aggregator.compute_state + aggregated_container_records = container_records_aggregator.get_records + end + @buffer.add_to_buffer(aggregated_container_records) #container cpu/memory records records_to_process = @buffer.get_buffer @buffer.reset_buffer @container_cpu_memory_records = [] diff --git a/source/code/plugin/health/agg_monitor_id_labels.rb b/source/code/plugin/health/agg_monitor_id_labels.rb index d5c724a86..03680d054 100644 --- a/source/code/plugin/health/agg_monitor_id_labels.rb +++ b/source/code/plugin/health/agg_monitor_id_labels.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: true require_relative 'health_model_constants' module HealthModel diff --git a/source/code/plugin/health/aggregate_monitor_state_finalizer.rb b/source/code/plugin/health/aggregate_monitor_state_finalizer.rb index 74e780924..dd69c9c4d 100644 --- a/source/code/plugin/health/aggregate_monitor_state_finalizer.rb +++ b/source/code/plugin/health/aggregate_monitor_state_finalizer.rb @@ -1,3 +1,5 @@ 
+# frozen_string_literal: true + module HealthModel class AggregateMonitorStateFinalizer diff --git a/source/code/plugin/health/cluster_health_state.rb b/source/code/plugin/health/cluster_health_state.rb index 3b56dd243..fa9cb42b2 100644 --- a/source/code/plugin/health/cluster_health_state.rb +++ b/source/code/plugin/health/cluster_health_state.rb @@ -1,3 +1,5 @@ +# frozen_string_literal: true + require "net/http" require "net/https" require "uri" diff --git a/source/code/plugin/health/health_container_cpu_memory_aggregator.rb b/source/code/plugin/health/health_container_cpu_memory_aggregator.rb index ef1016158..f6b57e0ae 100644 --- a/source/code/plugin/health/health_container_cpu_memory_aggregator.rb +++ b/source/code/plugin/health/health_container_cpu_memory_aggregator.rb @@ -1,4 +1,12 @@ +# frozen_string_literal: true + require_relative 'health_model_constants' + +# Require only when running inside container. +# otherwise unit tests will fail due to ApplicationInsightsUtility dependency on base omsagent ruby files. If you have your dev machine starting with omsagent-rs, then GOOD LUCK! 
+if Socket.gethostname.start_with?('omsagent-rs') + require_relative '../ApplicationInsightsUtility' +end =begin @cpu_records/@memory_records [ @@ -37,6 +45,10 @@ class HealthContainerCpuMemoryAggregator @@memory_counter_name = 'memoryRssBytes' @@cpu_counter_name = 'cpuUsageNanoCores' + @@workload_container_count_empty_event_sent = {} + @@limit_is_array_event_sent = {} + @@WORKLOAD_CONTAINER_COUNT_EMPTY_EVENT = "WorkloadContainerCountEmptyEvent" + @@LIMIT_IS_ARRAY_EVENT = "ResourceLimitIsAnArrayEvent" def initialize(resources, provider) @pod_uid_lookup = resources.get_pod_uid_lookup @workload_container_count = resources.get_workload_container_count @@ -163,11 +175,30 @@ def get_records container_cpu_memory_records = [] @cpu_records.each{|resource_key, record| + + cpu_limit_mc = 1.0 + if record["limit"].is_a?(Numeric) + cpu_limit_mc = record["limit"]/1000000.to_f + else + @log.info "CPU Limit is not a number #{record['limit']}" + if !@@limit_is_array_event_sent.key?(resource_key) + custom_properties = {} + custom_properties['limit'] = record['limit'] + if record['limit'].is_a?(Array) + record['limit'].each_index{|i| + custom_properties[i] = record['limit'][i] + } + end + @@limit_is_array_event_sent[resource_key] = true + #send once per resource key + ApplicationInsightsUtility.sendCustomEvent(@@LIMIT_IS_ARRAY_EVENT, custom_properties) + end + end health_monitor_record = { "timestamp" => time_now, "state" => record["state"], "details" => { - "cpu_limit_millicores" => record["limit"]/1000000.to_f, + "cpu_limit_millicores" => cpu_limit_mc, "cpu_usage_instances" => record["records"].map{|r| r.each {|k,v| k == "counter_value" ? 
r[k] = r[k] / 1000000.to_f : r[k] }}, @@ -219,12 +250,10 @@ def get_records private def calculate_monitor_state(v, config) - if !v['limit_set'] && v['namespace'] != 'kube-system' - v["state"] = HealthMonitorStates::WARNING - else - # sort records by descending order of metric - v["records"] = v["records"].sort_by{|record| record["counter_value"]}.reverse - size = v["records"].size + # sort records by descending order of metric + v["records"] = v["records"].sort_by{|record| record["counter_value"]}.reverse + size = v["records"].size + if !v["record_count"].nil? if size < v["record_count"] unknown_count = v["record_count"] - size for i in unknown_count.downto(1) @@ -232,16 +261,30 @@ def calculate_monitor_state(v, config) v["records"].insert(0, {"counter_value" => -1, "container" => v["container"], "pod_name" => "???", "state" => HealthMonitorStates::UNKNOWN }) #insert -1 for unknown records end end + else + v["state"] = HealthMonitorStates::UNKNOWN + container_key = "#{v['workload_name']}~~#{v['container']}" + @log.info "ContainerKey: #{container_key} Records Size: #{size} Records: #{v['records']} Record Count: #{v['record_count']} #{@workload_container_count}" - if size == 1 - state_index = 0 - else - state_threshold = config['StateThresholdPercentage'].to_f - count = ((state_threshold*size)/100).ceil - state_index = size - count + if !@@workload_container_count_empty_event_sent.key?(container_key) + custom_properties = {} + custom_properties = custom_properties.merge(v) + custom_properties = custom_properties.merge(@workload_container_count) + @log.info "Custom Properties : #{custom_properties}" + @@workload_container_count_empty_event_sent[container_key] = true + ApplicationInsightsUtility.sendCustomEvent(@@WORKLOAD_CONTAINER_COUNT_EMPTY_EVENT, custom_properties) end - v["state"] = v["records"][state_index]["state"] + return #simply return the state as unknown here + end + + if size == 1 + state_index = 0 + else + state_threshold = 
config['StateThresholdPercentage'].to_f + count = ((state_threshold*size)/100).ceil + state_index = size - count end + v["state"] = v["records"][state_index]["state"] end def calculate_container_instance_state(counter_value, limit, config) diff --git a/source/code/plugin/health/health_container_cpu_memory_record_formatter.rb b/source/code/plugin/health/health_container_cpu_memory_record_formatter.rb index 5c7db82d9..0c3f061f1 100644 --- a/source/code/plugin/health/health_container_cpu_memory_record_formatter.rb +++ b/source/code/plugin/health/health_container_cpu_memory_record_formatter.rb @@ -1,3 +1,5 @@ +# frozen_string_literal: true + module HealthModel class HealthContainerCpuMemoryRecordFormatter diff --git a/source/code/plugin/health/health_hierarchy_builder.rb b/source/code/plugin/health/health_hierarchy_builder.rb index 2da0050db..bb48e083b 100644 --- a/source/code/plugin/health/health_hierarchy_builder.rb +++ b/source/code/plugin/health/health_hierarchy_builder.rb @@ -1,3 +1,5 @@ +# frozen_string_literal: true + require 'json' module HealthModel class HealthHierarchyBuilder diff --git a/source/code/plugin/health/health_kube_api_down_handler.rb b/source/code/plugin/health/health_kube_api_down_handler.rb index a87c43ef1..bb91f2e3b 100644 --- a/source/code/plugin/health/health_kube_api_down_handler.rb +++ b/source/code/plugin/health/health_kube_api_down_handler.rb @@ -1,3 +1,5 @@ +# frozen_string_literal: true + require_relative 'health_model_constants' module HealthModel class HealthKubeApiDownHandler diff --git a/source/code/plugin/health/health_kubernetes_resources.rb b/source/code/plugin/health/health_kubernetes_resources.rb index 30a9ac7ca..743dd8b94 100644 --- a/source/code/plugin/health/health_kubernetes_resources.rb +++ b/source/code/plugin/health/health_kubernetes_resources.rb @@ -1,3 +1,5 @@ +# frozen_string_literal: true + require 'singleton' require_relative 'health_model_constants' @@ -5,20 +7,20 @@ module HealthModel class 
HealthKubernetesResources include Singleton - attr_accessor :node_inventory, :pod_inventory, :deployment_inventory, :pod_uid_lookup, :workload_container_count + attr_accessor :node_inventory, :pod_inventory, :replicaset_inventory, :pod_uid_lookup, :workload_container_count attr_reader :nodes, :pods, :workloads, :deployment_lookup def initialize - @node_inventory = [] - @pod_inventory = [] - @deployment_inventory = [] + @node_inventory = {} + @pod_inventory = {} + @replicaset_inventory = {} @nodes = [] @pods = [] @workloads = [] @log = HealthMonitorHelpers.get_log_handle @pod_uid_lookup = {} - @deployment_lookup = {} @workload_container_count = {} + @workload_name_cache = {} end def get_node_inventory @@ -36,9 +38,8 @@ def get_nodes return @nodes end - def set_deployment_inventory(deployments) - @deployment_inventory = deployments - @deployment_lookup = {} + def set_replicaset_inventory(replicasets) + @replicaset_inventory = replicasets end def get_workload_names @@ -51,7 +52,12 @@ def get_workload_names end def build_pod_uid_lookup + if @pod_inventory.nil? || @pod_inventory['items'].nil? || @pod_inventory['items'].empty? || @pod_inventory['items'].size == 0 + @log.info "Not Clearing pod_uid_lookup and workload_container_count since pod inventory is nil" + return + end @workload_container_count = {} + @pod_uid_lookup = {} @pod_inventory['items'].each do |pod| begin namespace = pod['metadata']['namespace'] @@ -92,7 +98,7 @@ def build_pod_uid_lookup end end rescue => e - @log.info "Error in build_pod_uid_lookup #{pod} #{e.message}" + @log.info "Error in build_pod_uid_lookup for POD: #{pod_name} #{e.message} #{e.backtrace}" end end end @@ -105,19 +111,7 @@ def get_workload_container_count return @workload_container_count end - private def get_workload_name(pod) - - if @deployment_lookup.empty? 
- @deployment_inventory['items'].each do |deployment| - match_labels = deployment['spec']['selector']['matchLabels'].to_h - namespace = deployment['metadata']['namespace'] - match_labels.each{|k,v| - @deployment_lookup["#{namespace}-#{k}=#{v}"] = "#{deployment['metadata']['namespace']}~~#{deployment['metadata']['name']}" - } - end - end - begin has_owner = !pod['metadata']['ownerReferences'].nil? owner_kind = '' @@ -129,7 +123,6 @@ def get_workload_name(pod) controller_name = pod['metadata']['name'] end namespace = pod['metadata']['namespace'] - workload_name = '' if owner_kind.nil? owner_kind = 'Pod' @@ -139,41 +132,22 @@ def get_workload_name(pod) # we are excluding jobs return nil when 'replicaset' - # get the labels, and see if there is a match. If there is, it is the deployment. If not, use replica set name/controller name - labels = pod['metadata']['labels'].to_h - labels.each {|k,v| - lookup_key = "#{namespace}-#{k}=#{v}" - if @deployment_lookup.key?(lookup_key) - workload_name = @deployment_lookup[lookup_key] - break - end - } - if workload_name.empty? - workload_name = "#{namespace}~~#{controller_name}" - end + #TODO: + workload_name = get_replica_set_owner_ref(controller_name) + workload_name = "#{namespace}~~#{workload_name}" when 'daemonset' workload_name = "#{namespace}~~#{controller_name}" else - workload_name = "#{namespace}~~#{pod['metadata']['name']}" + workload_name = "#{namespace}~~#{controller_name}" end return workload_name rescue => e - @log.info "Error in get_workload_name(pod) #{e.message}" + @log.info "Error in get_workload_name(pod) #{e.message} #{e.backtrace}" return nil end end def get_workload_kind(pod) - if @deployment_lookup.empty? 
- @deployment_inventory['items'].each do |deployment| - match_labels = deployment['spec']['selector']['matchLabels'].to_h - namespace = deployment['metadata']['namespace'] - match_labels.each{|k,v| - @deployment_lookup["#{namespace}-#{k}=#{v}"] = "#{deployment['metadata']['namespace']}~~#{deployment['metadata']['name']}" - } - end - end - begin has_owner = !pod['metadata']['ownerReferences'].nil? owner_kind = '' @@ -193,6 +167,30 @@ def get_workload_kind(pod) end end + private + def get_replica_set_owner_ref(controller_name) + if @workload_name_cache.key?(controller_name) + return @workload_name_cache[controller_name] + end + begin + owner_ref = controller_name + @replicaset_inventory['items'].each{|rs| + rs_name = rs['metadata']['name'] + if controller_name.casecmp(rs_name) == 0 + if !rs['metadata']['ownerReferences'].nil? + owner_ref = rs['metadata']['ownerReferences'][0]['name'] if rs['metadata']['ownerReferences'][0]['name'] + end + break + end + } + @workload_name_cache[controller_name] = owner_ref + return owner_ref + rescue => e + @log.info "Error in get_replica_set_owner_ref(controller_name) #{e.message}" + return controller_name + end + end + def get_node_capacity(node_name, type) if node_name.nil? 
#unscheduled pods will not have a node name return -1 diff --git a/source/code/plugin/health/health_missing_signal_generator.rb b/source/code/plugin/health/health_missing_signal_generator.rb index 1827a0190..84af81ea7 100644 --- a/source/code/plugin/health/health_missing_signal_generator.rb +++ b/source/code/plugin/health/health_missing_signal_generator.rb @@ -1,3 +1,5 @@ +# frozen_string_literal: true + require_relative 'health_model_constants' require_relative 'health_monitor_record' diff --git a/source/code/plugin/health/health_model_buffer.rb b/source/code/plugin/health/health_model_buffer.rb index 1ccfe7349..1c3ec3332 100644 --- a/source/code/plugin/health/health_model_buffer.rb +++ b/source/code/plugin/health/health_model_buffer.rb @@ -1,3 +1,5 @@ +# frozen_string_literal: true + module HealthModel =begin diff --git a/source/code/plugin/health/health_model_builder.rb b/source/code/plugin/health/health_model_builder.rb index 13813c8d9..43ed30d05 100644 --- a/source/code/plugin/health/health_model_builder.rb +++ b/source/code/plugin/health/health_model_builder.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: true require 'time' module HealthModel diff --git a/source/code/plugin/health/health_model_constants.rb b/source/code/plugin/health/health_model_constants.rb index 0922c7ff2..c74f86f4d 100644 --- a/source/code/plugin/health/health_model_constants.rb +++ b/source/code/plugin/health/health_model_constants.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: true module HealthModel class MonitorState CRITICAL = "fail" diff --git a/source/code/plugin/health/health_model_definition_parser.rb b/source/code/plugin/health/health_model_definition_parser.rb index f6c7a781d..907bc1fd1 100644 --- a/source/code/plugin/health/health_model_definition_parser.rb +++ b/source/code/plugin/health/health_model_definition_parser.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: true =begin Class to parse the health model definition. 
The definition expresses the relationship between monitors, how to roll up to an aggregate monitor, and what labels to "pass on" to the parent monitor diff --git a/source/code/plugin/health/health_monitor_helpers.rb b/source/code/plugin/health/health_monitor_helpers.rb index f784ae76e..74aa35af0 100644 --- a/source/code/plugin/health/health_monitor_helpers.rb +++ b/source/code/plugin/health/health_monitor_helpers.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: true require 'logger' require 'digest' require_relative 'health_model_constants' diff --git a/source/code/plugin/health/health_monitor_optimizer.rb b/source/code/plugin/health/health_monitor_optimizer.rb index b33c8a986..a63d59abf 100644 --- a/source/code/plugin/health/health_monitor_optimizer.rb +++ b/source/code/plugin/health/health_monitor_optimizer.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: true module HealthModel class HealthMonitorOptimizer #ctor diff --git a/source/code/plugin/health/health_monitor_provider.rb b/source/code/plugin/health/health_monitor_provider.rb index e75824268..b36c46370 100644 --- a/source/code/plugin/health/health_monitor_provider.rb +++ b/source/code/plugin/health/health_monitor_provider.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: true require_relative 'health_model_constants' module HealthModel diff --git a/source/code/plugin/health/health_monitor_record.rb b/source/code/plugin/health/health_monitor_record.rb index 873736c3a..7df84ff53 100644 --- a/source/code/plugin/health/health_monitor_record.rb +++ b/source/code/plugin/health/health_monitor_record.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: true HealthMonitorRecord = Struct.new( :monitor_id, :monitor_instance_id, diff --git a/source/code/plugin/health/health_monitor_state.rb b/source/code/plugin/health/health_monitor_state.rb index 8e2294cc9..16f8bedc4 100644 --- a/source/code/plugin/health/health_monitor_state.rb +++ b/source/code/plugin/health/health_monitor_state.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: true 
require_relative 'health_model_constants' module HealthModel diff --git a/source/code/plugin/health/health_monitor_telemetry.rb b/source/code/plugin/health/health_monitor_telemetry.rb index 4e80a5145..1227e1f83 100644 --- a/source/code/plugin/health/health_monitor_telemetry.rb +++ b/source/code/plugin/health/health_monitor_telemetry.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: true require_relative 'health_model_constants' require 'socket' if Socket.gethostname.start_with?('omsagent-rs') diff --git a/source/code/plugin/health/health_monitor_utils.rb b/source/code/plugin/health/health_monitor_utils.rb index e21fdc83d..0d297d215 100644 --- a/source/code/plugin/health/health_monitor_utils.rb +++ b/source/code/plugin/health/health_monitor_utils.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: true require 'logger' require 'digest' require_relative 'health_model_constants' @@ -73,59 +74,17 @@ def is_cluster_health_model_enabled end end - def get_pods_ready_hash(pod_inventory, deployment_inventory) + def get_pods_ready_hash(resources) pods_ready_percentage_hash = {} - deployment_lookup = {} - deployment_inventory['items'].each do |deployment| - match_labels = deployment['spec']['selector']['matchLabels'].to_h - namespace = deployment['metadata']['namespace'] - match_labels.each{|k,v| - deployment_lookup["#{namespace}-#{k}=#{v}"] = "#{deployment['metadata']['namespace']}~~#{deployment['metadata']['name']}" - } - end - pod_inventory['items'].each do |pod| + resources.pod_inventory['items'].each do |pod| begin - has_owner = !pod['metadata']['ownerReferences'].nil? 
- owner_kind = '' - if has_owner - owner_kind = pod['metadata']['ownerReferences'][0]['kind'] - controller_name = pod['metadata']['ownerReferences'][0]['name'] - else - owner_kind = pod['kind'] - controller_name = pod['metadata']['name'] - #log.info "#{JSON.pretty_generate(pod)}" - end - + workload_name = resources.get_workload_name(pod) namespace = pod['metadata']['namespace'] status = pod['status']['phase'] - - workload_name = '' - if owner_kind.nil? - owner_kind = 'Pod' - end - case owner_kind.downcase - when 'job' - # we are excluding jobs + owner_kind = resources.get_workload_kind(pod) + if owner_kind.casecmp('job') == 0 next - when 'replicaset' - # get the labels, and see if there is a match. If there is, it is the deployment. If not, use replica set name/controller name - labels = pod['metadata']['labels'].to_h - labels.each {|k,v| - lookup_key = "#{namespace}-#{k}=#{v}" - if deployment_lookup.key?(lookup_key) - workload_name = deployment_lookup[lookup_key] - break - end - } - if workload_name.empty? 
- workload_name = "#{namespace}~~#{controller_name}" - end - when 'daemonset' - workload_name = "#{namespace}~~#{controller_name}" - else - workload_name = "#{namespace}~~#{pod['metadata']['name']}" end - if pods_ready_percentage_hash.key?(workload_name) total_pods = pods_ready_percentage_hash[workload_name]['totalPods'] pods_ready = pods_ready_percentage_hash[workload_name]['podsReady'] @@ -141,7 +100,7 @@ def get_pods_ready_hash(pod_inventory, deployment_inventory) pods_ready_percentage_hash[workload_name] = {'totalPods' => total_pods, 'podsReady' => pods_ready, 'namespace' => namespace, 'workload_name' => workload_name, 'kind' => owner_kind} rescue => e - log.info "Error when processing pod #{pod['metadata']['name']} #{e.message}" + @log.info "Error when processing pod #{pod['metadata']['name']} #{e.message}" end end return pods_ready_percentage_hash @@ -152,30 +111,30 @@ def get_node_state_from_node_conditions(monitor_config, node_conditions) failtypes = ['outofdisk', 'networkunavailable'].to_set #default fail types if !monitor_config.nil? && !monitor_config["NodeConditionTypesForFailedState"].nil? failtypes = monitor_config["NodeConditionTypesForFailedState"] - if !failtypes.nil? - failtypes = failtypes.split(',').map{|x| x.downcase}.map{|x| x.gsub(" ","")}.to_set - end + if !failtypes.nil? + failtypes = failtypes.split(',').map{|x| x.downcase}.map{|x| x.gsub(" ","")}.to_set + end end - log = get_log_handle - #log.info "Fail Types #{failtypes.inspect}" + log = get_log_handle + #log.info "Fail Types #{failtypes.inspect}" node_conditions.each do |condition| type = condition['type'] status = condition['status'] #for each condition in the configuration, check if the type is not false. 
If yes, update state to fail if (failtypes.include?(type.downcase) && (status == 'True' || status == 'Unknown')) - return "fail" + return HealthMonitorStates::FAIL elsif ((type == "DiskPressure" || type == "MemoryPressure" || type == "PIDPressure") && (status == 'True' || status == 'Unknown')) - return "warn" + return HealthMonitorStates::WARNING elsif type == "Ready" && status == 'True' pass = true end end if pass - return "pass" + return HealthMonitorStates::PASS else - return "fail" + return HealthMonitorStates::FAIL end end diff --git a/source/code/plugin/health/health_signal_reducer.rb b/source/code/plugin/health/health_signal_reducer.rb index f92f24ac3..4708c4ee5 100644 --- a/source/code/plugin/health/health_signal_reducer.rb +++ b/source/code/plugin/health/health_signal_reducer.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: true require_relative 'health_model_constants' module HealthModel diff --git a/source/code/plugin/health/monitor_factory.rb b/source/code/plugin/health/monitor_factory.rb index 5f2c3945c..1e4f6f5b8 100644 --- a/source/code/plugin/health/monitor_factory.rb +++ b/source/code/plugin/health/monitor_factory.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: true require_relative 'aggregate_monitor' require_relative 'unit_monitor' diff --git a/source/code/plugin/health/parent_monitor_provider.rb b/source/code/plugin/health/parent_monitor_provider.rb index 4ab6e6297..e5766ea1b 100644 --- a/source/code/plugin/health/parent_monitor_provider.rb +++ b/source/code/plugin/health/parent_monitor_provider.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: true require_relative 'health_model_constants' module HealthModel class ParentMonitorProvider diff --git a/source/code/plugin/health/unit_monitor.rb b/source/code/plugin/health/unit_monitor.rb index 9af599321..6454007b6 100644 --- a/source/code/plugin/health/unit_monitor.rb +++ b/source/code/plugin/health/unit_monitor.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: true require_relative 'health_model_constants' require 
'json' diff --git a/source/code/plugin/in_kube_health.rb b/source/code/plugin/in_kube_health.rb index 9a1b8f9a9..affbdd275 100644 --- a/source/code/plugin/in_kube_health.rb +++ b/source/code/plugin/in_kube_health.rb @@ -86,11 +86,11 @@ def enumerate node_inventory = JSON.parse(node_inventory_response.body) pod_inventory_response = KubernetesApiClient.getKubeResourceInfo("pods") pod_inventory = JSON.parse(pod_inventory_response.body) - deployment_inventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo("deployments", api_version: "extensions/v1beta1").body) + replicaset_inventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo("replicasets", api_version: "extensions/v1beta1").body) @resources.node_inventory = node_inventory @resources.pod_inventory = pod_inventory - @resources.set_deployment_inventory(deployment_inventory) + @resources.set_replicaset_inventory(replicaset_inventory) @resources.build_pod_uid_lookup if node_inventory_response.code.to_i != 200 @@ -106,7 +106,7 @@ def enumerate health_monitor_records.push(record) if record record = process_memory_oversubscribed_monitor(pod_inventory, node_inventory) health_monitor_records.push(record) if record - pods_ready_hash = HealthMonitorUtils.get_pods_ready_hash(pod_inventory, deployment_inventory) + pods_ready_hash = HealthMonitorUtils.get_pods_ready_hash(@resources) system_pods = pods_ready_hash.select{|k,v| v['namespace'] == 'kube-system'} workload_pods = pods_ready_hash.select{|k,v| v['namespace'] != 'kube-system'} @@ -121,7 +121,7 @@ def enumerate health_monitor_records.push(record) if record end else - hmlog.info "POD INVENTORY IS NIL" + @@hmlog.info "POD INVENTORY IS NIL" end if !node_inventory.nil? 
@@ -130,7 +130,7 @@ def enumerate health_monitor_records.push(record) if record end else - hmlog.info "NODE INVENTORY IS NIL" + @@hmlog.info "NODE INVENTORY IS NIL" end health_monitor_records.each do |record| @@ -260,14 +260,14 @@ def process_node_condition_monitor(node_inventory) node_inventory['items'].each do |node| node_name = node['metadata']['name'] conditions = node['status']['conditions'] - state = HealthMonitorUtils.get_node_state_from_node_conditions(monitor_config, conditions) + node_state = HealthMonitorUtils.get_node_state_from_node_conditions(monitor_config, conditions) details = {} conditions.each do |condition| - state = !(condition['status'].downcase == 'true' && condition['type'].downcase != 'ready') ? HealthMonitorStates::PASS : HealthMonitorStates::FAIL - details[condition['type']] = {"Reason" => condition['reason'], "Message" => condition['message'], "State" => state} + condition_state = !(condition['status'].downcase == 'true' && condition['type'].downcase != 'ready') ? 
HealthMonitorStates::PASS : HealthMonitorStates::FAIL + details[condition['type']] = {"Reason" => condition['reason'], "Message" => condition['message'], "State" => condition_state} #@@hmlog.info "Node Condition details: #{JSON.pretty_generate(details)}" end - health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => details} + health_monitor_record = {"timestamp" => timestamp, "state" => node_state, "details" => details} monitor_instance_id = HealthMonitorUtils.get_monitor_instance_id(monitor_id, [@@cluster_id, node_name]) health_record = {} time_now = Time.now.utc.iso8601 @@ -291,11 +291,11 @@ def initialize_inventory node_inventory = JSON.parse(node_inventory_response.body) pod_inventory_response = KubernetesApiClient.getKubeResourceInfo("pods") pod_inventory = JSON.parse(pod_inventory_response.body) - deployment_inventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo("deployments", api_version: "extensions/v1beta1").body) + replicaset_inventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo("replicasets", api_version: "extensions/v1beta1").body) @resources.node_inventory = node_inventory @resources.pod_inventory = pod_inventory - @resources.set_deployment_inventory(deployment_inventory) + @resources.set_replicaset_inventory(replicaset_inventory) @resources.build_pod_uid_lookup end diff --git a/source/code/plugin/out_health_forward.rb b/source/code/plugin/out_health_forward.rb index 18664a22a..6fcfe368b 100644 --- a/source/code/plugin/out_health_forward.rb +++ b/source/code/plugin/out_health_forward.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: true # # Fluentd # diff --git a/test/code/plugin/health/health_container_cpu_memory_aggregator_spec.rb b/test/code/plugin/health/health_container_cpu_memory_aggregator_spec.rb index 074878fe2..6972916bf 100644 --- a/test/code/plugin/health/health_container_cpu_memory_aggregator_spec.rb +++ b/test/code/plugin/health/health_container_cpu_memory_aggregator_spec.rb @@ -25,7 +25,7 @@ 
resources.pod_inventory = pods resources.node_inventory = nodes - resources.set_deployment_inventory(deployments) + resources.set_replicaset_inventory(deployments) resources.build_pod_uid_lookup #call this in in_kube_health every min cluster_labels = { @@ -60,7 +60,7 @@ resources.pod_inventory = pods resources.node_inventory = nodes - resources.set_deployment_inventory(deployments) + resources.set_replicaset_inventory(deployments) resources.build_pod_uid_lookup #call this in in_kube_health every min cluster_labels = { @@ -113,7 +113,7 @@ resources.pod_inventory = pods resources.node_inventory = nodes - resources.set_deployment_inventory(deployments) + resources.set_replicaset_inventory(deployments) resources.build_pod_uid_lookup #call this in in_kube_health every min cluster_labels = { @@ -163,7 +163,7 @@ resources.pod_inventory = pods resources.node_inventory = nodes - resources.set_deployment_inventory(deployments) + resources.set_replicaset_inventory(deployments) resources.build_pod_uid_lookup #call this in in_kube_health every min cluster_labels = { diff --git a/test/code/plugin/health/health_kubernetes_resource_spec.rb b/test/code/plugin/health/health_kubernetes_resource_spec.rb index dbeec4858..f4daedace 100644 --- a/test/code/plugin/health/health_kubernetes_resource_spec.rb +++ b/test/code/plugin/health/health_kubernetes_resource_spec.rb @@ -207,7 +207,7 @@ resources = HealthKubernetesResources.instance resources.node_inventory = nodes resources.pod_inventory = pods - resources.set_deployment_inventory(deployments) + resources.set_replicaset_inventory(deployments) #act parsed_nodes = resources.get_nodes parsed_workloads = resources.get_workload_names @@ -217,28 +217,6 @@ assert_equal parsed_workloads.size, 3 assert_equal parsed_nodes, ['aks-nodepool1-19574989-0', 'aks-nodepool1-19574989-1'] - parsed_workloads.sort.must_equal ['default~~diliprdeploymentnodeapps', 'default~~rss-site', 'kube-system~~kube-proxy'].sort + parsed_workloads.sort.must_equal 
['default~~diliprdeploymentnodeapps-c4fdfb446', 'default~~rss-site', 'kube-system~~kube-proxy'].sort end - - # it 'builds the pod_uid lookup correctly' do - # #arrange - # f = File.read('C:/Users/dilipr/desktop/health/container_cpu_memory/nodes.json') - # nodes = JSON.parse(f) - # f = File.read('C:/Users/dilipr/desktop/health/container_cpu_memory/pods.json') - # pods = JSON.parse(f) - # f = File.read('C:/Users/dilipr/desktop/health/container_cpu_memory/deployments.json') - # deployments = JSON.parse(f) - - # resources = HealthKubernetesResources.instance - - # resources.node_inventory = nodes - # resources.pod_inventory = pods - # resources.set_deployment_inventory(deployments) #resets deployment_lookup -- this was causing Unit test failures - - # resources.build_pod_uid_lookup - - # resources.pod_uid_lookup - # resources.workload_container_count - - # end end \ No newline at end of file diff --git a/test/code/plugin/health/health_model_builder_test.rb b/test/code/plugin/health/health_model_builder_test.rb index a7c5e0927..3015ae55f 100644 --- a/test/code/plugin/health/health_model_builder_test.rb +++ b/test/code/plugin/health/health_model_builder_test.rb @@ -7,489 +7,510 @@ class FilterHealthModelBuilderTest < Test::Unit::TestCase include HealthModel - def test_event_stream - #setup - health_definition_path = File.join(__dir__, '../../../../installer/conf/health_model_definition.json') - health_model_definition = ParentMonitorProvider.new(HealthModelDefinitionParser.new(health_definition_path).parse_file) - monitor_factory = MonitorFactory.new - hierarchy_builder = HealthHierarchyBuilder.new(health_model_definition, monitor_factory) - # TODO: Figure out if we need to add NodeMonitorHierarchyReducer to the list of finalizers. 
For now, dont compress/optimize, since it becomes impossible to construct the model on the UX side - state_finalizers = [AggregateMonitorStateFinalizer.new] - monitor_set = MonitorSet.new - model_builder = HealthModelBuilder.new(hierarchy_builder, state_finalizers, monitor_set) - - nodes_file_map = { - #"extra" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/extra_nodes.json", - "first" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", - #"first-nosecondnode" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", - "second" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", - "third" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", - #"fourth" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", - #"missing" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", - #"kube_api_down" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", - } - - pods_file_map = { - #"extra" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/extra_pods.json", - "first" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", - #"first-nosecondnode" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", - "second" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", - "third" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", - #"fourth" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", - #"missing" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", - #"kube_api_down" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", - } - - cluster_labels = { - 'container.azm.ms/cluster-region' => 'eastus', - 'container.azm.ms/cluster-subscription-id' => '72c8e8ca-dc16-47dc-b65c-6b5875eb600a', - 'container.azm.ms/cluster-resource-group' => 
'dilipr-health-test', - 'container.azm.ms/cluster-name' => 'dilipr-health-test' - } - - cluster_id = 'fake_cluster_id' - - #test - state = HealthMonitorState.new() - generator = HealthMissingSignalGenerator.new - - for scenario in ["first", "second", "third"] - mock_data_path = File.join(__dir__, "../../../../health_records/#{scenario}_daemon_set_signals.json") - file = File.read(mock_data_path) - records = JSON.parse(file) - - node_inventory = JSON.parse(File.read(nodes_file_map[scenario])) - pod_inventory = JSON.parse(File.read(pods_file_map[scenario])) - deployment_inventory = JSON.parse(File.read(File.join(__dir__, "../../../../inventory/deployments.json"))) - resources = HealthKubernetesResources.instance - resources.node_inventory = node_inventory - resources.pod_inventory = pod_inventory - resources.set_deployment_inventory(deployment_inventory) - - workload_names = resources.get_workload_names - provider = HealthMonitorProvider.new(cluster_id, cluster_labels, resources, File.join(__dir__, "../../../../installer/conf/healthmonitorconfig.json")) - - health_monitor_records = [] - records.each do |record| - monitor_instance_id = record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] - monitor_id = record[HealthMonitorRecordFields::MONITOR_ID] - health_monitor_record = HealthMonitorRecord.new( - record[HealthMonitorRecordFields::MONITOR_ID], - record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID], - record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED], - record[HealthMonitorRecordFields::DETAILS]["state"], - provider.get_labels(record), - provider.get_config(monitor_id), - record[HealthMonitorRecordFields::DETAILS] - ) - - state.update_state(health_monitor_record, - provider.get_config(health_monitor_record.monitor_id) - ) - - # get the health state based on the monitor's operational state - # update state calls updates the state of the monitor based on configuration and history of the the monitor records - health_monitor_record.state = 
state.get_state(monitor_instance_id).new_state - health_monitor_records.push(health_monitor_record) - instance_state = state.get_state(monitor_instance_id) - #puts "#{monitor_instance_id} #{instance_state.new_state} #{instance_state.old_state} #{instance_state.should_send}" - end - - - #handle kube api down - kube_api_down_handler = HealthKubeApiDownHandler.new - health_monitor_records = kube_api_down_handler.handle_kube_api_down(health_monitor_records) - - # Dedupe daemonset signals - # Remove unit monitor signals for “gone” objects - reducer = HealthSignalReducer.new() - reduced_records = reducer.reduce_signals(health_monitor_records, resources) - - cluster_id = 'fake_cluster_id' - - #get the list of 'none' and 'unknown' signals - missing_signals = generator.get_missing_signals(cluster_id, reduced_records, resources, provider) - #update state for missing signals - missing_signals.each{|signal| - state.update_state(signal, - provider.get_config(signal.monitor_id) - ) - } - generator.update_last_received_records(reduced_records) - reduced_records.push(*missing_signals) - - # build the health model - all_records = reduced_records - model_builder.process_records(all_records) - all_monitors = model_builder.finalize_model - - # update the state for aggregate monitors (unit monitors are updated above) - all_monitors.each{|monitor_instance_id, monitor| - if monitor.is_aggregate_monitor - state.update_state(monitor, - provider.get_config(monitor.monitor_id) - ) - end - - instance_state = state.get_state(monitor_instance_id) - #puts "#{monitor_instance_id} #{instance_state.new_state} #{instance_state.old_state} #{instance_state.should_send}" - should_send = instance_state.should_send - - # always send cluster monitor as a heartbeat - if !should_send && monitor_instance_id != MonitorId::CLUSTER - all_monitors.delete(monitor_instance_id) - end - } - - records_to_send = [] - all_monitors.keys.each{|key| - record = provider.get_record(all_monitors[key], state) - #puts 
"#{record["MonitorInstanceId"]} #{record["OldState"]} #{record["NewState"]}" - } - - if scenario == "first" - assert_equal 50, all_monitors.size - elsif scenario == "second" - assert_equal 34, all_monitors.size - elsif scenario == "third" - assert_equal 5, all_monitors.size - end - # for each key in monitor.keys, - # get the state from health_monitor_state - # generate the record to send - serializer = HealthStateSerializer.new(File.join(__dir__, '../../../../health_records\health_model_state.json')) - serializer.serialize(state) - - deserializer = HealthStateDeserializer.new(File.join(__dir__, '../../../../health_records\health_model_state.json')) - deserialized_state = deserializer.deserialize - - after_state = HealthMonitorState.new - after_state.initialize_state(deserialized_state) - end - end - - def test_event_stream_aks_engine - - #setup - health_definition_path = File.join(__dir__, '../../../../installer\conf\health_model_definition.json') - health_model_definition = ParentMonitorProvider.new(HealthModelDefinitionParser.new(health_definition_path).parse_file) - monitor_factory = MonitorFactory.new - hierarchy_builder = HealthHierarchyBuilder.new(health_model_definition, monitor_factory) - state_finalizers = [AggregateMonitorStateFinalizer.new] - monitor_set = MonitorSet.new - model_builder = HealthModelBuilder.new(hierarchy_builder, state_finalizers, monitor_set) - - nodes_file_map = { - #"extra" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/extra_nodes.json", - #"first" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", - #"first-nosecondnode" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", - #"second" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", - #"third" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", - #"fourth" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", - #"missing" => 
"C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", - #"kube_api_down" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", - "aks-engine-1" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/aks-engine/nodes.json", - "aks-engine-2" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/aks-engine/nodes.json", - "aks-engine-3" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/aks-engine/nodes.json", - } - - pods_file_map = { - #"extra" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/extra_pods.json", - #"first" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", - #"first-nosecondnode" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", - #"second" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", - #"third" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", - #"fourth" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", - #"missing" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", - #"kube_api_down" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", - "aks-engine-1" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/aks-engine/pods.json", - "aks-engine-2" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/aks-engine/pods.json", - "aks-engine-3" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/aks-engine/pods.json", - } - - cluster_labels = { - 'container.azm.ms/cluster-region' => 'eastus', - 'container.azm.ms/cluster-subscription-id' => '72c8e8ca-dc16-47dc-b65c-6b5875eb600a', - 'container.azm.ms/cluster-resource-group' => 'aks-engine-health', - 'container.azm.ms/cluster-name' => 'aks-engine-health' - } - - cluster_id = 'fake_cluster_id' - - #test - state = HealthMonitorState.new() - generator = HealthMissingSignalGenerator.new - - for scenario in 1..3 - 
mock_data_path = File.join(__dir__, "../../../../health_records/aks-engine/aks-engine-#{scenario}.json") - file = File.read(mock_data_path) - records = JSON.parse(file) - - node_inventory = JSON.parse(File.read(nodes_file_map["aks-engine-#{scenario}"])) - pod_inventory = JSON.parse(File.read(pods_file_map["aks-engine-#{scenario}"])) - deployment_inventory = JSON.parse(File.read(File.join(__dir__, "../../../../inventory/aks-engine/deployments.json"))) - resources = HealthKubernetesResources.instance - resources.node_inventory = node_inventory - resources.pod_inventory = pod_inventory - resources.deployment_inventory = deployment_inventory - - workload_names = resources.get_workload_names - provider = HealthMonitorProvider.new(cluster_id, cluster_labels, resources, File.join(__dir__, "../../../../installer/conf/healthmonitorconfig.json")) - - health_monitor_records = [] - records.each do |record| - monitor_instance_id = record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] - monitor_id = record[HealthMonitorRecordFields::MONITOR_ID] - health_monitor_record = HealthMonitorRecord.new( - record[HealthMonitorRecordFields::MONITOR_ID], - record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID], - record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED], - record[HealthMonitorRecordFields::DETAILS]["state"], - provider.get_labels(record), - provider.get_config(monitor_id), - record[HealthMonitorRecordFields::DETAILS] - ) - - state.update_state(health_monitor_record, - provider.get_config(health_monitor_record.monitor_id) - ) - - # get the health state based on the monitor's operational state - # update state calls updates the state of the monitor based on configuration and history of the the monitor records - health_monitor_record.state = state.get_state(monitor_instance_id).new_state - health_monitor_records.push(health_monitor_record) - instance_state = state.get_state(monitor_instance_id) - #puts "#{monitor_instance_id} #{instance_state.new_state} 
#{instance_state.old_state} #{instance_state.should_send}" - end - - - #handle kube api down - kube_api_down_handler = HealthKubeApiDownHandler.new - health_monitor_records = kube_api_down_handler.handle_kube_api_down(health_monitor_records) - - # Dedupe daemonset signals - # Remove unit monitor signals for “gone” objects - reducer = HealthSignalReducer.new() - reduced_records = reducer.reduce_signals(health_monitor_records, resources) - - cluster_id = 'fake_cluster_id' - - #get the list of 'none' and 'unknown' signals - missing_signals = generator.get_missing_signals(cluster_id, reduced_records, resources, provider) - #update state for missing signals - missing_signals.each{|signal| - state.update_state(signal, - provider.get_config(signal.monitor_id) - ) - } - generator.update_last_received_records(reduced_records) - reduced_records.push(*missing_signals) - - # build the health model - all_records = reduced_records - model_builder.process_records(all_records) - all_monitors = model_builder.finalize_model - - # update the state for aggregate monitors (unit monitors are updated above) - all_monitors.each{|monitor_instance_id, monitor| - if monitor.is_aggregate_monitor - state.update_state(monitor, - provider.get_config(monitor.monitor_id) - ) - end - - instance_state = state.get_state(monitor_instance_id) - #puts "#{monitor_instance_id} #{instance_state.new_state} #{instance_state.old_state} #{instance_state.should_send}" - should_send = instance_state.should_send - - # always send cluster monitor as a heartbeat - if !should_send && monitor_instance_id != MonitorId::CLUSTER - all_monitors.delete(monitor_instance_id) - end - } - - records_to_send = [] - all_monitors.keys.each{|key| - record = provider.get_record(all_monitors[key], state) - #puts "#{record["MonitorInstanceId"]} #{record["OldState"]} #{record["NewState"]}" - } - - if scenario == 1 - assert_equal 58, all_monitors.size - elsif scenario == 2 - assert_equal 37, all_monitors.size - elsif scenario == 3 - 
assert_equal 6, all_monitors.size - end - # for each key in monitor.keys, - # get the state from health_monitor_state - # generate the record to send - serializer = HealthStateSerializer.new(File.join(__dir__, '../../../../health_records\health_model_state_aks-engine.json')) - serializer.serialize(state) - - deserializer = HealthStateDeserializer.new(File.join(__dir__, '../../../../health_records\health_model_state_aks-engine.json')) - deserialized_state = deserializer.deserialize - - after_state = HealthMonitorState.new - after_state.initialize_state(deserialized_state) - end - end - - def test_container_memory_cpu_with_model - health_definition_path = File.join(__dir__, '../../../../installer/conf/health_model_definition.json') - health_model_definition = ParentMonitorProvider.new(HealthModelDefinitionParser.new(health_definition_path).parse_file) - monitor_factory = MonitorFactory.new - hierarchy_builder = HealthHierarchyBuilder.new(health_model_definition, monitor_factory) - # TODO: Figure out if we need to add NodeMonitorHierarchyReducer to the list of finalizers. 
For now, dont compress/optimize, since it becomes impossible to construct the model on the UX side - state_finalizers = [AggregateMonitorStateFinalizer.new] - monitor_set = MonitorSet.new - model_builder = HealthModelBuilder.new(hierarchy_builder, state_finalizers, monitor_set) - - nodes_file_map = { - "first" => "C:/Users/dilipr/desktop/health/container_cpu_memory/nodes.json", - "second" => "C:/Users/dilipr/desktop/health/container_cpu_memory/nodes.json", - "third" => "C:/Users/dilipr/desktop/health/container_cpu_memory/nodes.json", - } - - pods_file_map = { - "first" => "C:/Users/dilipr/desktop/health/container_cpu_memory/pods.json", - "second" => "C:/Users/dilipr/desktop/health/container_cpu_memory/pods.json", - "third" => "C:/Users/dilipr/desktop/health/container_cpu_memory/pods.json", - } - - cluster_labels = { - 'container.azm.ms/cluster-region' => 'eastus', - 'container.azm.ms/cluster-subscription-id' => '72c8e8ca-dc16-47dc-b65c-6b5875eb600a', - 'container.azm.ms/cluster-resource-group' => 'dilipr-health-test', - 'container.azm.ms/cluster-name' => 'dilipr-health-test' - } - - cluster_id = 'fake_cluster_id' - - #test - state = HealthMonitorState.new() - generator = HealthMissingSignalGenerator.new - - mock_data_path = "C:/Users/dilipr/desktop/health/container_cpu_memory/daemonset.json" - file = File.read(mock_data_path) - records = JSON.parse(file) - - node_inventory = JSON.parse(File.read("C:/Users/dilipr/desktop/health/container_cpu_memory/nodes.json")) - pod_inventory = JSON.parse(File.read("C:/Users/dilipr/desktop/health/container_cpu_memory/pods.json")) - deployment_inventory = JSON.parse(File.read("C:/Users/dilipr/desktop/health/container_cpu_memory/deployments.json")) + # def test_event_stream + # #setup + # health_definition_path = File.join(__dir__, '../../../../installer/conf/health_model_definition.json') + # health_model_definition = ParentMonitorProvider.new(HealthModelDefinitionParser.new(health_definition_path).parse_file) + # monitor_factory = 
MonitorFactory.new + # hierarchy_builder = HealthHierarchyBuilder.new(health_model_definition, monitor_factory) + # # TODO: Figure out if we need to add NodeMonitorHierarchyReducer to the list of finalizers. For now, dont compress/optimize, since it becomes impossible to construct the model on the UX side + # state_finalizers = [AggregateMonitorStateFinalizer.new] + # monitor_set = MonitorSet.new + # model_builder = HealthModelBuilder.new(hierarchy_builder, state_finalizers, monitor_set) + + # nodes_file_map = { + # #"extra" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/extra_nodes.json", + # "first" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + # #"first-nosecondnode" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + # "second" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + # "third" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + # #"fourth" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + # #"missing" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + # #"kube_api_down" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + # } + + # pods_file_map = { + # #"extra" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/extra_pods.json", + # "first" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + # #"first-nosecondnode" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + # "second" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + # "third" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + # #"fourth" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + # #"missing" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + # #"kube_api_down" => 
"C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + # } + + # cluster_labels = { + # 'container.azm.ms/cluster-region' => 'eastus', + # 'container.azm.ms/cluster-subscription-id' => '72c8e8ca-dc16-47dc-b65c-6b5875eb600a', + # 'container.azm.ms/cluster-resource-group' => 'dilipr-health-test', + # 'container.azm.ms/cluster-name' => 'dilipr-health-test' + # } + + # cluster_id = 'fake_cluster_id' + + # #test + # state = HealthMonitorState.new() + # generator = HealthMissingSignalGenerator.new + + # for scenario in ["first", "second", "third"] + # mock_data_path = File.join(__dir__, "../../../../health_records/#{scenario}_daemon_set_signals.json") + # file = File.read(mock_data_path) + # records = JSON.parse(file) + + # node_inventory = JSON.parse(File.read(nodes_file_map[scenario])) + # pod_inventory = JSON.parse(File.read(pods_file_map[scenario])) + # deployment_inventory = JSON.parse(File.read(File.join(__dir__, "../../../../inventory/deployments.json"))) + # resources = HealthKubernetesResources.instance + # resources.node_inventory = node_inventory + # resources.pod_inventory = pod_inventory + # resources.set_replicaset_inventory(deployment_inventory) + + # workload_names = resources.get_workload_names + # provider = HealthMonitorProvider.new(cluster_id, cluster_labels, resources, File.join(__dir__, "../../../../installer/conf/healthmonitorconfig.json")) + + # health_monitor_records = [] + # records.each do |record| + # monitor_instance_id = record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] + # monitor_id = record[HealthMonitorRecordFields::MONITOR_ID] + # health_monitor_record = HealthMonitorRecord.new( + # record[HealthMonitorRecordFields::MONITOR_ID], + # record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID], + # record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED], + # record[HealthMonitorRecordFields::DETAILS]["state"], + # provider.get_labels(record), + # provider.get_config(monitor_id), + # 
record[HealthMonitorRecordFields::DETAILS] + # ) + + # state.update_state(health_monitor_record, + # provider.get_config(health_monitor_record.monitor_id) + # ) + + # # get the health state based on the monitor's operational state + # # update state calls updates the state of the monitor based on configuration and history of the the monitor records + # health_monitor_record.state = state.get_state(monitor_instance_id).new_state + # health_monitor_records.push(health_monitor_record) + # instance_state = state.get_state(monitor_instance_id) + # #puts "#{monitor_instance_id} #{instance_state.new_state} #{instance_state.old_state} #{instance_state.should_send}" + # end + + + # #handle kube api down + # kube_api_down_handler = HealthKubeApiDownHandler.new + # health_monitor_records = kube_api_down_handler.handle_kube_api_down(health_monitor_records) + + # # Dedupe daemonset signals + # # Remove unit monitor signals for “gone” objects + # reducer = HealthSignalReducer.new() + # reduced_records = reducer.reduce_signals(health_monitor_records, resources) + + # cluster_id = 'fake_cluster_id' + + # #get the list of 'none' and 'unknown' signals + # missing_signals = generator.get_missing_signals(cluster_id, reduced_records, resources, provider) + # #update state for missing signals + # missing_signals.each{|signal| + # state.update_state(signal, + # provider.get_config(signal.monitor_id) + # ) + # } + # generator.update_last_received_records(reduced_records) + # reduced_records.push(*missing_signals) + + # # build the health model + # all_records = reduced_records + # model_builder.process_records(all_records) + # all_monitors = model_builder.finalize_model + + # # update the state for aggregate monitors (unit monitors are updated above) + # all_monitors.each{|monitor_instance_id, monitor| + # if monitor.is_aggregate_monitor + # state.update_state(monitor, + # provider.get_config(monitor.monitor_id) + # ) + # end + + # instance_state = state.get_state(monitor_instance_id) + # 
#puts "#{monitor_instance_id} #{instance_state.new_state} #{instance_state.old_state} #{instance_state.should_send}" + # should_send = instance_state.should_send + + # # always send cluster monitor as a heartbeat + # if !should_send && monitor_instance_id != MonitorId::CLUSTER + # all_monitors.delete(monitor_instance_id) + # end + # } + + # records_to_send = [] + # all_monitors.keys.each{|key| + # record = provider.get_record(all_monitors[key], state) + # #puts "#{record["MonitorInstanceId"]} #{record["OldState"]} #{record["NewState"]}" + # } + + # if scenario == "first" + # assert_equal 50, all_monitors.size + # elsif scenario == "second" + # assert_equal 34, all_monitors.size + # elsif scenario == "third" + # assert_equal 5, all_monitors.size + # end + # # for each key in monitor.keys, + # # get the state from health_monitor_state + # # generate the record to send + # serializer = HealthStateSerializer.new(File.join(__dir__, '../../../../health_records\health_model_state.json')) + # serializer.serialize(state) + + # deserializer = HealthStateDeserializer.new(File.join(__dir__, '../../../../health_records\health_model_state.json')) + # deserialized_state = deserializer.deserialize + + # after_state = HealthMonitorState.new + # after_state.initialize_state(deserialized_state) + # end + # end + + # def test_event_stream_aks_engine + + # #setup + # health_definition_path = File.join(__dir__, '../../../../installer\conf\health_model_definition.json') + # health_model_definition = ParentMonitorProvider.new(HealthModelDefinitionParser.new(health_definition_path).parse_file) + # monitor_factory = MonitorFactory.new + # hierarchy_builder = HealthHierarchyBuilder.new(health_model_definition, monitor_factory) + # state_finalizers = [AggregateMonitorStateFinalizer.new] + # monitor_set = MonitorSet.new + # model_builder = HealthModelBuilder.new(hierarchy_builder, state_finalizers, monitor_set) + + # nodes_file_map = { + # #"extra" => 
"C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/extra_nodes.json", + # #"first" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + # #"first-nosecondnode" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + # #"second" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + # #"third" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + # #"fourth" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + # #"missing" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + # #"kube_api_down" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/nodes.json", + # "aks-engine-1" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/aks-engine/nodes.json", + # "aks-engine-2" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/aks-engine/nodes.json", + # "aks-engine-3" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/aks-engine/nodes.json", + # } + + # pods_file_map = { + # #"extra" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/extra_pods.json", + # #"first" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + # #"first-nosecondnode" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + # #"second" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + # #"third" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + # #"fourth" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + # #"missing" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + # #"kube_api_down" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/pods.json", + # "aks-engine-1" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/aks-engine/pods.json", + # "aks-engine-2" => 
"C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/aks-engine/pods.json", + # "aks-engine-3" => "C:/AzureMonitor/ContainerInsights/Docker-Provider/inventory/aks-engine/pods.json", + # } + + # cluster_labels = { + # 'container.azm.ms/cluster-region' => 'eastus', + # 'container.azm.ms/cluster-subscription-id' => '72c8e8ca-dc16-47dc-b65c-6b5875eb600a', + # 'container.azm.ms/cluster-resource-group' => 'aks-engine-health', + # 'container.azm.ms/cluster-name' => 'aks-engine-health' + # } + + # cluster_id = 'fake_cluster_id' + + # #test + # state = HealthMonitorState.new() + # generator = HealthMissingSignalGenerator.new + + # for scenario in 1..3 + # mock_data_path = File.join(__dir__, "../../../../health_records/aks-engine/aks-engine-#{scenario}.json") + # file = File.read(mock_data_path) + # records = JSON.parse(file) + + # node_inventory = JSON.parse(File.read(nodes_file_map["aks-engine-#{scenario}"])) + # pod_inventory = JSON.parse(File.read(pods_file_map["aks-engine-#{scenario}"])) + # deployment_inventory = JSON.parse(File.read(File.join(__dir__, "../../../../inventory/aks-engine/deployments.json"))) + # resources = HealthKubernetesResources.instance + # resources.node_inventory = node_inventory + # resources.pod_inventory = pod_inventory + # resources.deployment_inventory = deployment_inventory + + # workload_names = resources.get_workload_names + # provider = HealthMonitorProvider.new(cluster_id, cluster_labels, resources, File.join(__dir__, "../../../../installer/conf/healthmonitorconfig.json")) + + # health_monitor_records = [] + # records.each do |record| + # monitor_instance_id = record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] + # monitor_id = record[HealthMonitorRecordFields::MONITOR_ID] + # health_monitor_record = HealthMonitorRecord.new( + # record[HealthMonitorRecordFields::MONITOR_ID], + # record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID], + # record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED], + # 
record[HealthMonitorRecordFields::DETAILS]["state"], + # provider.get_labels(record), + # provider.get_config(monitor_id), + # record[HealthMonitorRecordFields::DETAILS] + # ) + + # state.update_state(health_monitor_record, + # provider.get_config(health_monitor_record.monitor_id) + # ) + + # # get the health state based on the monitor's operational state + # # update state calls updates the state of the monitor based on configuration and history of the the monitor records + # health_monitor_record.state = state.get_state(monitor_instance_id).new_state + # health_monitor_records.push(health_monitor_record) + # instance_state = state.get_state(monitor_instance_id) + # #puts "#{monitor_instance_id} #{instance_state.new_state} #{instance_state.old_state} #{instance_state.should_send}" + # end + + + # #handle kube api down + # kube_api_down_handler = HealthKubeApiDownHandler.new + # health_monitor_records = kube_api_down_handler.handle_kube_api_down(health_monitor_records) + + # # Dedupe daemonset signals + # # Remove unit monitor signals for “gone” objects + # reducer = HealthSignalReducer.new() + # reduced_records = reducer.reduce_signals(health_monitor_records, resources) + + # cluster_id = 'fake_cluster_id' + + # #get the list of 'none' and 'unknown' signals + # missing_signals = generator.get_missing_signals(cluster_id, reduced_records, resources, provider) + # #update state for missing signals + # missing_signals.each{|signal| + # state.update_state(signal, + # provider.get_config(signal.monitor_id) + # ) + # } + # generator.update_last_received_records(reduced_records) + # reduced_records.push(*missing_signals) + + # # build the health model + # all_records = reduced_records + # model_builder.process_records(all_records) + # all_monitors = model_builder.finalize_model + + # # update the state for aggregate monitors (unit monitors are updated above) + # all_monitors.each{|monitor_instance_id, monitor| + # if monitor.is_aggregate_monitor + # 
state.update_state(monitor, + # provider.get_config(monitor.monitor_id) + # ) + # end + + # instance_state = state.get_state(monitor_instance_id) + # #puts "#{monitor_instance_id} #{instance_state.new_state} #{instance_state.old_state} #{instance_state.should_send}" + # should_send = instance_state.should_send + + # # always send cluster monitor as a heartbeat + # if !should_send && monitor_instance_id != MonitorId::CLUSTER + # all_monitors.delete(monitor_instance_id) + # end + # } + + # records_to_send = [] + # all_monitors.keys.each{|key| + # record = provider.get_record(all_monitors[key], state) + # #puts "#{record["MonitorInstanceId"]} #{record["OldState"]} #{record["NewState"]}" + # } + + # if scenario == 1 + # assert_equal 58, all_monitors.size + # elsif scenario == 2 + # assert_equal 37, all_monitors.size + # elsif scenario == 3 + # assert_equal 6, all_monitors.size + # end + # # for each key in monitor.keys, + # # get the state from health_monitor_state + # # generate the record to send + # serializer = HealthStateSerializer.new(File.join(__dir__, '../../../../health_records\health_model_state_aks-engine.json')) + # serializer.serialize(state) + + # deserializer = HealthStateDeserializer.new(File.join(__dir__, '../../../../health_records\health_model_state_aks-engine.json')) + # deserialized_state = deserializer.deserialize + + # after_state = HealthMonitorState.new + # after_state.initialize_state(deserialized_state) + # end + # end + + # def test_container_memory_cpu_with_model + # health_definition_path = File.join(__dir__, '../../../../installer/conf/health_model_definition.json') + # health_model_definition = ParentMonitorProvider.new(HealthModelDefinitionParser.new(health_definition_path).parse_file) + # monitor_factory = MonitorFactory.new + # hierarchy_builder = HealthHierarchyBuilder.new(health_model_definition, monitor_factory) + # # TODO: Figure out if we need to add NodeMonitorHierarchyReducer to the list of finalizers. 
For now, dont compress/optimize, since it becomes impossible to construct the model on the UX side + # state_finalizers = [AggregateMonitorStateFinalizer.new] + # monitor_set = MonitorSet.new + # model_builder = HealthModelBuilder.new(hierarchy_builder, state_finalizers, monitor_set) + + # nodes_file_map = { + # "first" => "C:/Users/dilipr/desktop/health/container_cpu_memory/nodes.json", + # "second" => "C:/Users/dilipr/desktop/health/container_cpu_memory/nodes.json", + # "third" => "C:/Users/dilipr/desktop/health/container_cpu_memory/nodes.json", + # } + + # pods_file_map = { + # "first" => "C:/Users/dilipr/desktop/health/container_cpu_memory/pods.json", + # "second" => "C:/Users/dilipr/desktop/health/container_cpu_memory/pods.json", + # "third" => "C:/Users/dilipr/desktop/health/container_cpu_memory/pods.json", + # } + + # cluster_labels = { + # 'container.azm.ms/cluster-region' => 'eastus', + # 'container.azm.ms/cluster-subscription-id' => '72c8e8ca-dc16-47dc-b65c-6b5875eb600a', + # 'container.azm.ms/cluster-resource-group' => 'dilipr-health-test', + # 'container.azm.ms/cluster-name' => 'dilipr-health-test' + # } + + # cluster_id = 'fake_cluster_id' + + # #test + # state = HealthMonitorState.new() + # generator = HealthMissingSignalGenerator.new + + # mock_data_path = "C:/Users/dilipr/desktop/health/container_cpu_memory/daemonset.json" + # file = File.read(mock_data_path) + # records = JSON.parse(file) + + # node_inventory = JSON.parse(File.read("C:/Users/dilipr/desktop/health/container_cpu_memory/nodes.json")) + # pod_inventory = JSON.parse(File.read("C:/Users/dilipr/desktop/health/container_cpu_memory/pods.json")) + # deployment_inventory = JSON.parse(File.read("C:/Users/dilipr/desktop/health/container_cpu_memory/deployments.json")) + # resources = HealthKubernetesResources.instance + # resources.node_inventory = node_inventory + # resources.pod_inventory = pod_inventory + # resources.set_replicaset_inventory(deployment_inventory) + + # workload_names = 
resources.get_workload_names + # provider = HealthMonitorProvider.new(cluster_id, cluster_labels, resources, File.join(__dir__, "../../../../installer/conf/healthmonitorconfig.json")) + + + # #container memory cpu records + # file = File.read('C:/Users/dilipr/desktop/health/container_cpu_memory/cadvisor_perf.json') + # cadvisor_records = JSON.parse(file) + # cadvisor_records = cadvisor_records.select{|record| record['DataItems'][0]['ObjectName'] == 'K8SContainer'} + # formatted_records = [] + # formatter = HealthContainerCpuMemoryRecordFormatter.new + # cadvisor_records.each{|record| + # formatted_record = formatter.get_record_from_cadvisor_record(record) + # formatted_records.push(formatted_record) + # } + + # resources.build_pod_uid_lookup #call this in in_kube_health every min + + # cluster_labels = { + # 'container.azm.ms/cluster-region' => 'eastus', + # 'container.azm.ms/cluster-subscription-id' => '72c8e8ca-dc16-47dc-b65c-6b5875eb600a', + # 'container.azm.ms/cluster-resource-group' => 'dilipr-health-test', + # 'container.azm.ms/cluster-name' => 'dilipr-health-test' + # } + + # cluster_id = 'fake_cluster_id' + + # aggregator = HealthContainerCpuMemoryAggregator.new(resources, provider) + # deduped_records = aggregator.dedupe_records(formatted_records) + # aggregator.aggregate(deduped_records) + # aggregator.compute_state + # container_cpu_memory_records = aggregator.get_records + + # records.concat(container_cpu_memory_records) + + # health_monitor_records = [] + # records.each do |record| + # monitor_instance_id = record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] + # monitor_id = record[HealthMonitorRecordFields::MONITOR_ID] + # health_monitor_record = HealthMonitorRecord.new( + # record[HealthMonitorRecordFields::MONITOR_ID], + # record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID], + # record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED], + # record[HealthMonitorRecordFields::DETAILS]["state"], + # provider.get_labels(record), + # 
provider.get_config(monitor_id), + # record[HealthMonitorRecordFields::DETAILS] + # ) + + # state.update_state(health_monitor_record, + # provider.get_config(health_monitor_record.monitor_id) + # ) + + # # get the health state based on the monitor's operational state + # # update state calls updates the state of the monitor based on configuration and history of the the monitor records + # health_monitor_record.state = state.get_state(monitor_instance_id).new_state + # health_monitor_records.push(health_monitor_record) + # #puts "#{monitor_instance_id} #{instance_state.new_state} #{instance_state.old_state} #{instance_state.should_send}" + # end + + # #handle kube api down + # kube_api_down_handler = HealthKubeApiDownHandler.new + # health_monitor_records = kube_api_down_handler.handle_kube_api_down(health_monitor_records) + + # # Dedupe daemonset signals + # # Remove unit monitor signals for “gone” objects + # reducer = HealthSignalReducer.new() + # reduced_records = reducer.reduce_signals(health_monitor_records, resources) + + # cluster_id = 'fake_cluster_id' + + # #get the list of 'none' and 'unknown' signals + # missing_signals = generator.get_missing_signals(cluster_id, reduced_records, resources, provider) + # #update state for missing signals + # missing_signals.each{|signal| + # state.update_state(signal, + # provider.get_config(signal.monitor_id) + # ) + # } + # generator.update_last_received_records(reduced_records) + # reduced_records.push(*missing_signals) + + # # build the health model + # all_records = reduced_records + # model_builder.process_records(all_records) + # all_monitors = model_builder.finalize_model + + # # update the state for aggregate monitors (unit monitors are updated above) + # all_monitors.each{|monitor_instance_id, monitor| + # if monitor.is_aggregate_monitor + # state.update_state(monitor, + # provider.get_config(monitor.monitor_id) + # ) + # end + + # instance_state = state.get_state(monitor_instance_id) + # #puts 
"#{monitor_instance_id} #{instance_state.new_state} #{instance_state.old_state} #{instance_state.should_send}" + # should_send = instance_state.should_send + + # # always send cluster monitor as a heartbeat + # if !should_send && monitor_instance_id != MonitorId::CLUSTER + # all_monitors.delete(monitor_instance_id) + # end + # } + + # records_to_send = [] + # all_monitors.keys.each{|key| + # record = provider.get_record(all_monitors[key], state) + # #puts "#{record["MonitorInstanceId"]} #{record["OldState"]} #{record["NewState"]}" + # } + # end + + def test_get_workload_name + # node_inventory = JSON.parse(File.read("C:/AzureMonitor/ContainerInsights/Docker-Provider/test/code/plugin/health/dilipr-health-test-nodes.json")) + # pod_inventory = JSON.parse(File.read('C:/AzureMonitor/ContainerInsights/Docker-Provider/test/code/plugin/health/dilipr-health-test-pods.json')) + # replicaset_inventory = JSON.parse(File.read('C:/AzureMonitor/ContainerInsights/Docker-Provider/test/code/plugin/health/dilipr-health-test-rs.json')) + node_inventory = JSON.parse(File.read("C:/AzureMonitor/ContainerInsights/Docker-Provider/test/code/plugin/health/jobyaks2-nodes.json")) + pod_inventory = JSON.parse(File.read('C:/AzureMonitor/ContainerInsights/Docker-Provider/test/code/plugin/health/jobyaks2-pods.json')) + replicaset_inventory = JSON.parse(File.read('C:/AzureMonitor/ContainerInsights/Docker-Provider/test/code/plugin/health/jobyaks2-rs.json')) resources = HealthKubernetesResources.instance resources.node_inventory = node_inventory resources.pod_inventory = pod_inventory - resources.set_deployment_inventory(deployment_inventory) - - workload_names = resources.get_workload_names - provider = HealthMonitorProvider.new(cluster_id, cluster_labels, resources, File.join(__dir__, "../../../../installer/conf/healthmonitorconfig.json")) - - - #container memory cpu records - file = File.read('C:/Users/dilipr/desktop/health/container_cpu_memory/cadvisor_perf.json') - cadvisor_records = 
JSON.parse(file) - cadvisor_records = cadvisor_records.select{|record| record['DataItems'][0]['ObjectName'] == 'K8SContainer'} - formatted_records = [] - formatter = HealthContainerCpuMemoryRecordFormatter.new - cadvisor_records.each{|record| - formatted_record = formatter.get_record_from_cadvisor_record(record) - formatted_records.push(formatted_record) + resources.set_replicaset_inventory(replicaset_inventory) + pod_inventory['items'].each{|pod| + workload_name = resources.get_workload_name(pod) + puts "POD #{pod['metadata']['name']} Workload Name #{workload_name}" } - resources.build_pod_uid_lookup #call this in in_kube_health every min + pods_ready_hash = HealthMonitorUtils.get_pods_ready_hash(resources) - cluster_labels = { - 'container.azm.ms/cluster-region' => 'eastus', - 'container.azm.ms/cluster-subscription-id' => '72c8e8ca-dc16-47dc-b65c-6b5875eb600a', - 'container.azm.ms/cluster-resource-group' => 'dilipr-health-test', - 'container.azm.ms/cluster-name' => 'dilipr-health-test' - } - - cluster_id = 'fake_cluster_id' - - aggregator = HealthContainerCpuMemoryAggregator.new(resources, provider) - deduped_records = aggregator.dedupe_records(formatted_records) - aggregator.aggregate(deduped_records) - aggregator.compute_state - container_cpu_memory_records = aggregator.get_records - - records.concat(container_cpu_memory_records) - - health_monitor_records = [] - records.each do |record| - monitor_instance_id = record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] - monitor_id = record[HealthMonitorRecordFields::MONITOR_ID] - health_monitor_record = HealthMonitorRecord.new( - record[HealthMonitorRecordFields::MONITOR_ID], - record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID], - record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED], - record[HealthMonitorRecordFields::DETAILS]["state"], - provider.get_labels(record), - provider.get_config(monitor_id), - record[HealthMonitorRecordFields::DETAILS] - ) - - state.update_state(health_monitor_record, - 
provider.get_config(health_monitor_record.monitor_id) - ) - - # get the health state based on the monitor's operational state - # update state calls updates the state of the monitor based on configuration and history of the the monitor records - health_monitor_record.state = state.get_state(monitor_instance_id).new_state - health_monitor_records.push(health_monitor_record) - #puts "#{monitor_instance_id} #{instance_state.new_state} #{instance_state.old_state} #{instance_state.should_send}" - end - - #handle kube api down - kube_api_down_handler = HealthKubeApiDownHandler.new - health_monitor_records = kube_api_down_handler.handle_kube_api_down(health_monitor_records) - - # Dedupe daemonset signals - # Remove unit monitor signals for “gone” objects - reducer = HealthSignalReducer.new() - reduced_records = reducer.reduce_signals(health_monitor_records, resources) - - cluster_id = 'fake_cluster_id' - - #get the list of 'none' and 'unknown' signals - missing_signals = generator.get_missing_signals(cluster_id, reduced_records, resources, provider) - #update state for missing signals - missing_signals.each{|signal| - state.update_state(signal, - provider.get_config(signal.monitor_id) - ) - } - generator.update_last_received_records(reduced_records) - reduced_records.push(*missing_signals) - - # build the health model - all_records = reduced_records - model_builder.process_records(all_records) - all_monitors = model_builder.finalize_model - - # update the state for aggregate monitors (unit monitors are updated above) - all_monitors.each{|monitor_instance_id, monitor| - if monitor.is_aggregate_monitor - state.update_state(monitor, - provider.get_config(monitor.monitor_id) - ) - end - - instance_state = state.get_state(monitor_instance_id) - #puts "#{monitor_instance_id} #{instance_state.new_state} #{instance_state.old_state} #{instance_state.should_send}" - should_send = instance_state.should_send - - # always send cluster monitor as a heartbeat - if !should_send && 
monitor_instance_id != MonitorId::CLUSTER - all_monitors.delete(monitor_instance_id) - end - } - - records_to_send = [] - all_monitors.keys.each{|key| - record = provider.get_record(all_monitors[key], state) - #puts "#{record["MonitorInstanceId"]} #{record["OldState"]} #{record["NewState"]}" - } + puts JSON.pretty_generate(pods_ready_hash) end end \ No newline at end of file From 22bd43da20fadd862ec52e279dc250abeec161d5 Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Mon, 4 Nov 2019 18:22:43 -0800 Subject: [PATCH 136/160] Fix the bug where if a warning condition appears before fail condition, the node condition is reported as warning instead of fail. Also fix the node conditions state to consider unknown as a failure state (#292) --- source/code/plugin/health/health_monitor_utils.rb | 14 +++++++++----- source/code/plugin/in_kube_health.rb | 12 ++++++++++-- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/source/code/plugin/health/health_monitor_utils.rb b/source/code/plugin/health/health_monitor_utils.rb index 0d297d215..2fa2d3a52 100644 --- a/source/code/plugin/health/health_monitor_utils.rb +++ b/source/code/plugin/health/health_monitor_utils.rb @@ -108,6 +108,8 @@ def get_pods_ready_hash(resources) def get_node_state_from_node_conditions(monitor_config, node_conditions) pass = false + warn = false + fail = false failtypes = ['outofdisk', 'networkunavailable'].to_set #default fail types if !monitor_config.nil? && !monitor_config["NodeConditionTypesForFailedState"].nil? failtypes = monitor_config["NodeConditionTypesForFailedState"] @@ -123,18 +125,20 @@ def get_node_state_from_node_conditions(monitor_config, node_conditions) #for each condition in the configuration, check if the type is not false. 
If yes, update state to fail if (failtypes.include?(type.downcase) && (status == 'True' || status == 'Unknown')) - return HealthMonitorStates::FAIL + fail = true elsif ((type == "DiskPressure" || type == "MemoryPressure" || type == "PIDPressure") && (status == 'True' || status == 'Unknown')) - return HealthMonitorStates::WARNING + warn = true elsif type == "Ready" && status == 'True' pass = true end end - if pass - return HealthMonitorStates::PASS - else + if fail return HealthMonitorStates::FAIL + elsif warn + return HealthMonitorStates::WARNING + else + return HealthMonitorStates::PASS end end diff --git a/source/code/plugin/in_kube_health.rb b/source/code/plugin/in_kube_health.rb index affbdd275..51ffa86d5 100644 --- a/source/code/plugin/in_kube_health.rb +++ b/source/code/plugin/in_kube_health.rb @@ -263,9 +263,17 @@ def process_node_condition_monitor(node_inventory) node_state = HealthMonitorUtils.get_node_state_from_node_conditions(monitor_config, conditions) details = {} conditions.each do |condition| - condition_state = !(condition['status'].downcase == 'true' && condition['type'].downcase != 'ready') ? 
HealthMonitorStates::PASS : HealthMonitorStates::FAIL + condition_state = HealthMonitorStates::PASS + if condition['type'].downcase != 'ready' + if (condition['status'].downcase == 'true' || condition['status'].downcase == 'unknown') + condition_state = HealthMonitorStates::FAIL + end + else #Condition == READY + if condition['status'].downcase != 'true' + condition_state = HealthMonitorStates::FAIL + end + end details[condition['type']] = {"Reason" => condition['reason'], "Message" => condition['message'], "State" => condition_state} - #@@hmlog.info "Node Condition details: #{JSON.pretty_generate(details)}" end health_monitor_record = {"timestamp" => timestamp, "state" => node_state, "details" => details} monitor_instance_id = HealthMonitorUtils.get_monitor_instance_id(monitor_id, [@@cluster_id, node_name]) From 40f47a9b4f16ca049857243c00ee2a455904601f Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Tue, 5 Nov 2019 15:07:09 -0800 Subject: [PATCH 137/160] Fix for Nodes Aspect not showing up in draft cluster (#294) --- source/code/plugin/health/health_model_definition_parser.rb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/source/code/plugin/health/health_model_definition_parser.rb b/source/code/plugin/health/health_model_definition_parser.rb index 907bc1fd1..91f8cd24f 100644 --- a/source/code/plugin/health/health_model_definition_parser.rb +++ b/source/code/plugin/health/health_model_definition_parser.rb @@ -29,6 +29,7 @@ def parse_file labels = entry['labels'] if entry['labels'] aggregation_algorithm = entry['aggregation_algorithm'] if entry['aggregation_algorithm'] aggregation_algorithm_params = entry['aggregation_algorithm_params'] if entry['aggregation_algorithm_params'] + default_parent_monitor_id = entry['default_parent_monitor_id'] if entry['default_parent_monitor_id'] if parent_monitor_id.is_a?(Array) conditions = [] parent_monitor_id.each{|condition| @@ -38,7 +39,7 @@ def parse_file parent_id = condition['id'] 
conditions.push({"key" => key, "operator" => operator, "value" => value, "parent_id" => parent_id}) } - @health_model_definition[monitor_id] = {"conditions" => conditions, "labels" => labels, "aggregation_algorithm" => aggregation_algorithm, "aggregation_algorithm_params" =>aggregation_algorithm_params} + @health_model_definition[monitor_id] = {"conditions" => conditions, "labels" => labels, "aggregation_algorithm" => aggregation_algorithm, "aggregation_algorithm_params" =>aggregation_algorithm_params, "default_parent_monitor_id" => default_parent_monitor_id} elsif parent_monitor_id.is_a?(String) @health_model_definition[monitor_id] = {"parent_monitor_id" => parent_monitor_id, "labels" => labels, "aggregation_algorithm" => aggregation_algorithm, "aggregation_algorithm_params" =>aggregation_algorithm_params} elsif parent_monitor_id.nil? From 16055bed2d7ac755301f5023bb9be28318690ed3 Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Tue, 5 Nov 2019 17:16:02 -0800 Subject: [PATCH 138/160] Fix the issue where the health tree is inconsistent if a deployment is deleted (#295) --- .../health_container_cpu_memory_aggregator.rb | 92 ++++++++++++++++++- 1 file changed, 88 insertions(+), 4 deletions(-) diff --git a/source/code/plugin/health/health_container_cpu_memory_aggregator.rb b/source/code/plugin/health/health_container_cpu_memory_aggregator.rb index f6b57e0ae..6d69e0213 100644 --- a/source/code/plugin/health/health_container_cpu_memory_aggregator.rb +++ b/source/code/plugin/health/health_container_cpu_memory_aggregator.rb @@ -49,6 +49,9 @@ class HealthContainerCpuMemoryAggregator @@limit_is_array_event_sent = {} @@WORKLOAD_CONTAINER_COUNT_EMPTY_EVENT = "WorkloadContainerCountEmptyEvent" @@LIMIT_IS_ARRAY_EVENT = "ResourceLimitIsAnArrayEvent" + @@cpu_last_sent_monitors = {} + @@memory_last_sent_monitors = {} + def initialize(resources, provider) @pod_uid_lookup = resources.get_pod_uid_lookup @workload_container_count = resources.get_workload_container_count @@ -137,7 
+140,6 @@ def aggregate(container_records) end container_instance_record = {} - pod_name = @pod_uid_lookup[lookup_key]["pod_name"] #append the record to the hash # append only if the record is not a duplicate record @@ -160,13 +162,14 @@ def compute_state() # if limits not set, set state to warning # if all records present, sort in descending order of metric, compute index based on StateThresholdPercentage, get the state (pass/fail/warn) based on monitor state (Using [Fail/Warn]ThresholdPercentage, and set the state) @memory_records.each{|k,v| + @@memory_last_sent_monitors.delete(k) #remove from last sent list if the record is present in the current set of signals calculate_monitor_state(v, @provider.get_config(MonitorId::CONTAINER_MEMORY_MONITOR_ID)) } @cpu_records.each{|k,v| + @@cpu_last_sent_monitors.delete(k) #remove from last sent list if the record is present in the current set of signals calculate_monitor_state(v, @provider.get_config(MonitorId::CONTAINER_CPU_MONITOR_ID)) } - @log.info "Finished computing state" end @@ -175,7 +178,6 @@ def get_records container_cpu_memory_records = [] @cpu_records.each{|resource_key, record| - cpu_limit_mc = 1.0 if record["limit"].is_a?(Numeric) cpu_limit_mc = record["limit"]/1000000.to_f @@ -221,6 +223,42 @@ def get_records container_cpu_memory_records.push(health_record) } + # If all records that were sent previously are present in current set, this will not be executed + if @@cpu_last_sent_monitors.keys.size != 0 + @@cpu_last_sent_monitors.keys.each{|key| + begin + @log.info "Container CPU monitor #{key} not present in current set. 
Sending none state transition" + tokens = key.split('_') + namespace = tokens[0] + workload_name = "#{tokens[0]}~~#{tokens[1]}" + container = tokens[2] + health_monitor_record = { + "timestamp" => time_now, + "state" => HealthMonitorStates::NONE, + "details" => { + "reason" => "No record received for workload #{workload_name}", + "workload_name" => workload_name, + "namespace" => namespace, + "container" => container + } + } + + monitor_instance_id = HealthMonitorHelpers.get_monitor_instance_id(MonitorId::CONTAINER_CPU_MONITOR_ID, key.split('_')) #container_cpu_utilization-namespace-workload-container + + health_record = {} + health_record[HealthMonitorRecordFields::MONITOR_ID] = MonitorId::CONTAINER_CPU_MONITOR_ID + health_record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] = monitor_instance_id + health_record[HealthMonitorRecordFields::DETAILS] = health_monitor_record + health_record[HealthMonitorRecordFields::TIME_GENERATED] = time_now + health_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_now + container_cpu_memory_records.push(health_record) + rescue => e + @log.info "Error when trying to create NONE State transition signal for #{key} for monitor #{monitor_instance_id} #{e.message}" + next + end + } + end + @memory_records.each{|resource_key, record| health_monitor_record = { "timestamp" => time_now, @@ -245,6 +283,52 @@ def get_records health_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_now container_cpu_memory_records.push(health_record) } + + # If all records that were sent previously are present in current set, this will not be executed + if @@memory_last_sent_monitors.keys.size != 0 + @@memory_last_sent_monitors.keys.each{|key| + begin + @log.info "Container Memory monitor #{key} not present in current set. 
Sending none state transition" + tokens = key.split('_') + namespace = tokens[0] + workload_name = "#{tokens[0]}~~#{tokens[1]}" + container = tokens[2] + health_monitor_record = { + "timestamp" => time_now, + "state" => HealthMonitorStates::NONE, + "details" => { + "reason" => "No record received for workload #{workload_name}", + "workload_name" => workload_name, + "namespace" => namespace, + "container" => container + } + } + monitor_instance_id = HealthMonitorHelpers.get_monitor_instance_id(MonitorId::CONTAINER_MEMORY_MONITOR_ID, key.split('_')) #container_cpu_utilization-namespace-workload-container + health_record = {} + health_record[HealthMonitorRecordFields::MONITOR_ID] = MonitorId::CONTAINER_MEMORY_MONITOR_ID + health_record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] = monitor_instance_id + health_record[HealthMonitorRecordFields::DETAILS] = health_monitor_record + health_record[HealthMonitorRecordFields::TIME_GENERATED] = time_now + health_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_now + container_cpu_memory_records.push(health_record) + rescue => e + @log.info "Error when trying to create NONE State transition signal for #{key} for monitor #{monitor_instance_id} #{e.message}" + next + end + } + end + + #reset the last sent monitors list + @@memory_last_sent_monitors = {} + @@cpu_last_sent_monitors = {} + + # add the current set of signals for comparison in next iteration + @cpu_records.keys.each{|k| + @@cpu_last_sent_monitors[k] = true + } + @memory_records.keys.each{|k| + @@memory_last_sent_monitors[k] = true + } return container_cpu_memory_records end @@ -298,4 +382,4 @@ def calculate_container_instance_state(counter_value, limit, config) end end end -end \ No newline at end of file +end From 2d861cccf20891f3150d325d3916f62883643126 Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Tue, 12 Nov 2019 10:39:01 -0800 Subject: [PATCH 139/160] Rashmi/1 16 test (#297) * health deployment update * apps v1 changes for deployment * 
changes * changes to use relicasets and api groups --- source/code/plugin/KubernetesApiClient.rb | 21 +++--- source/code/plugin/in_kube_health.rb | 88 +++++++++++------------ 2 files changed, 55 insertions(+), 54 deletions(-) diff --git a/source/code/plugin/KubernetesApiClient.rb b/source/code/plugin/KubernetesApiClient.rb index be1a51791..7b5a1cd24 100644 --- a/source/code/plugin/KubernetesApiClient.rb +++ b/source/code/plugin/KubernetesApiClient.rb @@ -12,6 +12,8 @@ class KubernetesApiClient require_relative "oms_common" @@ApiVersion = "v1" + @@ApiVersionApps = "v1" + @@ApiGroupApps = "apps" @@CaFile = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" @@ClusterName = nil @@ClusterId = nil @@ -30,13 +32,12 @@ def initialize end class << self - def getKubeResourceInfo(resource, api_version: nil) + def getKubeResourceInfo(resource, api_group: nil) headers = {} response = nil - @Log.info "Getting Kube resource api_version #{api_version}" - @Log.info resource + @Log.info "Getting Kube resource: #{resource}" begin - resourceUri = getResourceUri(resource, api_version: api_version) + resourceUri = getResourceUri(resource, api_group) if !resourceUri.nil? uri = URI.parse(resourceUri) http = Net::HTTP.new(uri.host, uri.port) @@ -85,14 +86,14 @@ def getClusterRegion end end - def getResourceUri(resource, api_version: nil) + def getResourceUri(resource, api_group) begin if ENV["KUBERNETES_SERVICE_HOST"] && ENV["KUBERNETES_PORT_443_TCP_PORT"] - if !api_version.nil? - return "https://#{ENV["KUBERNETES_SERVICE_HOST"]}:#{ENV["KUBERNETES_PORT_443_TCP_PORT"]}/apis/" + api_version + "/" + resource - end - api_version = @@ApiVersion - return "https://#{ENV["KUBERNETES_SERVICE_HOST"]}:#{ENV["KUBERNETES_PORT_443_TCP_PORT"]}/api/" + api_version + "/" + resource + if api_group.nil? 
+ return "https://#{ENV["KUBERNETES_SERVICE_HOST"]}:#{ENV["KUBERNETES_PORT_443_TCP_PORT"]}/api/" + @@ApiVersion + "/" + resource + elsif api_group == @@ApiGroupApps + return "https://#{ENV["KUBERNETES_SERVICE_HOST"]}:#{ENV["KUBERNETES_PORT_443_TCP_PORT"]}/apis/apps/" + @@ApiVersionApps + "/" + resource + end else @Log.warn ("Kubernetes environment variable not set KUBERNETES_SERVICE_HOST: #{ENV["KUBERNETES_SERVICE_HOST"]} KUBERNETES_PORT_443_TCP_PORT: #{ENV["KUBERNETES_PORT_443_TCP_PORT"]}. Unable to form resourceUri") return nil diff --git a/source/code/plugin/in_kube_health.rb b/source/code/plugin/in_kube_health.rb index 51ffa86d5..57ca07f64 100644 --- a/source/code/plugin/in_kube_health.rb +++ b/source/code/plugin/in_kube_health.rb @@ -7,12 +7,12 @@ require_relative "ApplicationInsightsUtility" module Fluent + Dir[File.join(__dir__, "./health", "*.rb")].each { |file| require file } - Dir[File.join(__dir__, './health', '*.rb')].each { |file| require file } class KubeHealthInput < Input Plugin.register_input("kubehealth", self) - config_param :health_monitor_config_path, :default => '/etc/opt/microsoft/docker-cimprov/health/healthmonitorconfig.json' + config_param :health_monitor_config_path, :default => "/etc/opt/microsoft/docker-cimprov/health/healthmonitorconfig.json" @@clusterCpuCapacity = 0.0 @@clusterMemoryCapacity = 0.0 @@ -26,6 +26,7 @@ def initialize @@cluster_id = KubernetesApiClient.getClusterId @resources = HealthKubernetesResources.instance @provider = HealthMonitorProvider.new(@@cluster_id, HealthMonitorUtils.get_cluster_labels, @resources, @health_monitor_config_path) + @@ApiGroupApps = "apps" rescue => e ApplicationInsightsUtility.sendExceptionTelemetry(e, {"FeatureArea" => "Health"}) end @@ -40,25 +41,25 @@ def configure(conf) end def start - begin - if @run_interval - @finished = false - @condition = ConditionVariable.new - @mutex = Mutex.new - @thread = Thread.new(&method(:run_periodic)) - - @@hmlog = HealthMonitorUtils.get_log_handle - 
@@clusterName = KubernetesApiClient.getClusterName - @@clusterRegion = KubernetesApiClient.getClusterRegion - cluster_capacity = HealthMonitorUtils.get_cluster_cpu_memory_capacity(@@hmlog) - @@clusterCpuCapacity = cluster_capacity[0] - @@clusterMemoryCapacity = cluster_capacity[1] - @@hmlog.info "Cluster CPU Capacity: #{@@clusterCpuCapacity} Memory Capacity: #{@@clusterMemoryCapacity}" - initialize_inventory - end - rescue => e - ApplicationInsightsUtility.sendExceptionTelemetry(e, {"FeatureArea" => "Health"}) + begin + if @run_interval + @finished = false + @condition = ConditionVariable.new + @mutex = Mutex.new + @thread = Thread.new(&method(:run_periodic)) + + @@hmlog = HealthMonitorUtils.get_log_handle + @@clusterName = KubernetesApiClient.getClusterName + @@clusterRegion = KubernetesApiClient.getClusterRegion + cluster_capacity = HealthMonitorUtils.get_cluster_cpu_memory_capacity(@@hmlog) + @@clusterCpuCapacity = cluster_capacity[0] + @@clusterMemoryCapacity = cluster_capacity[1] + @@hmlog.info "Cluster CPU Capacity: #{@@clusterCpuCapacity} Memory Capacity: #{@@clusterMemoryCapacity}" + initialize_inventory end + rescue => e + ApplicationInsightsUtility.sendExceptionTelemetry(e, {"FeatureArea" => "Health"}) + end end def shutdown @@ -73,7 +74,6 @@ def shutdown def enumerate begin - currentTime = Time.now emitTime = currentTime.to_f batchTime = currentTime.utc.iso8601 @@ -86,7 +86,7 @@ def enumerate node_inventory = JSON.parse(node_inventory_response.body) pod_inventory_response = KubernetesApiClient.getKubeResourceInfo("pods") pod_inventory = JSON.parse(pod_inventory_response.body) - replicaset_inventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo("replicasets", api_version: "extensions/v1beta1").body) + replicaset_inventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo("replicasets", api_group: @@ApiGroupApps).body) @resources.node_inventory = node_inventory @resources.pod_inventory = pod_inventory @@ -108,8 +108,8 @@ def enumerate 
health_monitor_records.push(record) if record pods_ready_hash = HealthMonitorUtils.get_pods_ready_hash(@resources) - system_pods = pods_ready_hash.select{|k,v| v['namespace'] == 'kube-system'} - workload_pods = pods_ready_hash.select{|k,v| v['namespace'] != 'kube-system'} + system_pods = pods_ready_hash.select { |k, v| v["namespace"] == "kube-system" } + workload_pods = pods_ready_hash.select { |k, v| v["namespace"] != "kube-system" } system_pods_ready_percentage_records = process_pods_ready_percentage(system_pods, MonitorId::SYSTEM_WORKLOAD_PODS_READY_MONITOR_ID) system_pods_ready_percentage_records.each do |record| @@ -147,13 +147,13 @@ def enumerate def process_cpu_oversubscribed_monitor(pod_inventory, node_inventory) timestamp = Time.now.utc.iso8601 @@clusterCpuCapacity = HealthMonitorUtils.get_cluster_cpu_memory_capacity(@@hmlog, node_inventory: node_inventory)[0] - subscription = HealthMonitorUtils.get_resource_subscription(pod_inventory,"cpu", @@clusterCpuCapacity) + subscription = HealthMonitorUtils.get_resource_subscription(pod_inventory, "cpu", @@clusterCpuCapacity) @@hmlog.info "Refreshed Cluster CPU Capacity #{@@clusterCpuCapacity}" - state = subscription > @@clusterCpuCapacity ? "fail" : "pass" + state = subscription > @@clusterCpuCapacity ? 
"fail" : "pass" #CPU monitor_id = MonitorId::WORKLOAD_CPU_OVERSUBSCRIBED_MONITOR_ID - health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"clusterCpuCapacity" => @@clusterCpuCapacity/1000000.to_f, "clusterCpuRequests" => subscription/1000000.to_f}} + health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"clusterCpuCapacity" => @@clusterCpuCapacity / 1000000.to_f, "clusterCpuRequests" => subscription / 1000000.to_f}} # @@hmlog.info health_monitor_record monitor_instance_id = HealthMonitorUtils.get_monitor_instance_id(monitor_id, [@@cluster_id]) @@ -163,8 +163,8 @@ def process_cpu_oversubscribed_monitor(pod_inventory, node_inventory) health_record[HealthMonitorRecordFields::MONITOR_ID] = monitor_id health_record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] = monitor_instance_id health_record[HealthMonitorRecordFields::DETAILS] = health_monitor_record - health_record[HealthMonitorRecordFields::TIME_GENERATED] = time_now - health_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_now + health_record[HealthMonitorRecordFields::TIME_GENERATED] = time_now + health_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_now health_record[HealthMonitorRecordFields::CLUSTER_ID] = @@cluster_id #@@hmlog.info "Successfully processed process_cpu_oversubscribed_monitor" return health_record @@ -172,10 +172,10 @@ def process_cpu_oversubscribed_monitor(pod_inventory, node_inventory) def process_memory_oversubscribed_monitor(pod_inventory, node_inventory) timestamp = Time.now.utc.iso8601 - @@clusterMemoryCapacity = HealthMonitorUtils.get_cluster_cpu_memory_capacity(@@hmlog,node_inventory: node_inventory)[1] + @@clusterMemoryCapacity = HealthMonitorUtils.get_cluster_cpu_memory_capacity(@@hmlog, node_inventory: node_inventory)[1] @@hmlog.info "Refreshed Cluster Memory Capacity #{@@clusterMemoryCapacity}" - subscription = HealthMonitorUtils.get_resource_subscription(pod_inventory,"memory", 
@@clusterMemoryCapacity) - state = subscription > @@clusterMemoryCapacity ? "fail" : "pass" + subscription = HealthMonitorUtils.get_resource_subscription(pod_inventory, "memory", @@clusterMemoryCapacity) + state = subscription > @@clusterMemoryCapacity ? "fail" : "pass" #@@hmlog.debug "Memory Oversubscribed Monitor State : #{state}" #CPU @@ -189,8 +189,8 @@ def process_memory_oversubscribed_monitor(pod_inventory, node_inventory) health_record[HealthMonitorRecordFields::MONITOR_ID] = monitor_id health_record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] = monitor_instance_id health_record[HealthMonitorRecordFields::DETAILS] = health_monitor_record - health_record[HealthMonitorRecordFields::TIME_GENERATED] = time_now - health_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_now + health_record[HealthMonitorRecordFields::TIME_GENERATED] = time_now + health_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_now health_record[HealthMonitorRecordFields::CLUSTER_ID] = @@cluster_id #@@hmlog.info "Successfully processed process_memory_oversubscribed_monitor" return health_record @@ -201,7 +201,7 @@ def process_kube_api_up_monitor(state, response) monitor_id = MonitorId::KUBE_API_STATUS details = response.each_header.to_h - details['ResponseCode'] = response.code + details["ResponseCode"] = response.code health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => details} hmlog = HealthMonitorUtils.get_log_handle #hmlog.info health_monitor_record @@ -213,8 +213,8 @@ def process_kube_api_up_monitor(state, response) health_record[HealthMonitorRecordFields::MONITOR_ID] = monitor_id health_record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] = monitor_instance_id health_record[HealthMonitorRecordFields::DETAILS] = health_monitor_record - health_record[HealthMonitorRecordFields::TIME_GENERATED] = time_now - health_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_now + 
health_record[HealthMonitorRecordFields::TIME_GENERATED] = time_now + health_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_now health_record[HealthMonitorRecordFields::CLUSTER_ID] = @@cluster_id #@@hmlog.info "Successfully processed process_kube_api_up_monitor" return health_record @@ -227,10 +227,10 @@ def process_pods_ready_percentage(pods_hash, config_monitor_id) records = [] pods_hash.keys.each do |key| workload_name = key - total_pods = pods_hash[workload_name]['totalPods'] - pods_ready = pods_hash[workload_name]['podsReady'] - namespace = pods_hash[workload_name]['namespace'] - workload_kind = pods_hash[workload_name]['kind'] + total_pods = pods_hash[workload_name]["totalPods"] + pods_ready = pods_hash[workload_name]["podsReady"] + namespace = pods_hash[workload_name]["namespace"] + workload_kind = pods_hash[workload_name]["kind"] percent = pods_ready / total_pods * 100 timestamp = Time.now.utc.iso8601 @@ -242,8 +242,8 @@ def process_pods_ready_percentage(pods_hash, config_monitor_id) health_record[HealthMonitorRecordFields::MONITOR_ID] = config_monitor_id health_record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] = monitor_instance_id health_record[HealthMonitorRecordFields::DETAILS] = health_monitor_record - health_record[HealthMonitorRecordFields::TIME_GENERATED] = time_now - health_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_now + health_record[HealthMonitorRecordFields::TIME_GENERATED] = time_now + health_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_now health_record[HealthMonitorRecordFields::CLUSTER_ID] = @@cluster_id records.push(health_record) end @@ -299,7 +299,7 @@ def initialize_inventory node_inventory = JSON.parse(node_inventory_response.body) pod_inventory_response = KubernetesApiClient.getKubeResourceInfo("pods") pod_inventory = JSON.parse(pod_inventory_response.body) - replicaset_inventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo("replicasets", api_version: 
"extensions/v1beta1").body) + replicaset_inventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo("replicasets", api_group: @@ApiGroupApps).body) @resources.node_inventory = node_inventory @resources.pod_inventory = pod_inventory From 844afbdd4bb940902e0d90717a5af7d381c30c88 Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Tue, 12 Nov 2019 15:16:48 -0800 Subject: [PATCH 140/160] Fix duplicate records in container memory/cpu samples (#298) --- source/code/plugin/filter_health_model_builder.rb | 6 +++--- .../plugin/health/health_container_cpu_memory_aggregator.rb | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/source/code/plugin/filter_health_model_builder.rb b/source/code/plugin/filter_health_model_builder.rb index 47ce7a631..1724065fe 100644 --- a/source/code/plugin/filter_health_model_builder.rb +++ b/source/code/plugin/filter_health_model_builder.rb @@ -97,12 +97,11 @@ def filter_stream(tag, es) } end container_records_aggregator = HealthContainerCpuMemoryAggregator.new(@resources, @provider) - deduped_records = container_records_aggregator.dedupe_records(container_records) if @container_cpu_memory_records.nil? @log.info "@container_cpu_memory_records was not initialized" @container_cpu_memory_records = [] #in some clusters, this is null, so initialize it again. end - @container_cpu_memory_records.push(*deduped_records) # push the records for aggregation later + @container_cpu_memory_records.push(*container_records) # push the records for aggregation later return MultiEventStream.new elsif tag.start_with?("kubehealth.ReplicaSet") records = [] @@ -114,7 +113,8 @@ def filter_stream(tag, es) aggregated_container_records = [] if !@container_cpu_memory_records.nil? && !@container_cpu_memory_records.empty? 
container_records_aggregator = HealthContainerCpuMemoryAggregator.new(@resources, @provider) - container_records_aggregator.aggregate(@container_cpu_memory_records) + deduped_records = container_records_aggregator.dedupe_records(@container_cpu_memory_records) + container_records_aggregator.aggregate(deduped_records) container_records_aggregator.compute_state aggregated_container_records = container_records_aggregator.get_records end diff --git a/source/code/plugin/health/health_container_cpu_memory_aggregator.rb b/source/code/plugin/health/health_container_cpu_memory_aggregator.rb index 6d69e0213..29ac91bde 100644 --- a/source/code/plugin/health/health_container_cpu_memory_aggregator.rb +++ b/source/code/plugin/health/health_container_cpu_memory_aggregator.rb @@ -84,12 +84,13 @@ def dedupe_records(container_records) else r = resource_instances[instance_name] if record["Timestamp"] > r["Timestamp"] - @log.info "Dropping older record" + @log.info "Dropping older record for instance #{instance_name} new: #{record["Timestamp"]} old: #{r["Timestamp"]}" resource_instances[instance_name] = record end end rescue => e @log.info "Exception when deduping record #{record}" + next end end return cpu_deduped_instances.values.concat(memory_deduped_instances.values) From 9a8f0f8b58d28aee68cf680bebf8094c8e1b8ea6 Mon Sep 17 00:00:00 2001 From: bragi92 Date: Thu, 14 Nov 2019 10:42:50 -0800 Subject: [PATCH 141/160] Update MDM region list to include francecentral, japaneast and australiaeast --- installer/conf/container.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/installer/conf/container.conf b/installer/conf/container.conf index f9540bde8..696ffdb6b 100755 --- a/installer/conf/container.conf +++ b/installer/conf/container.conf @@ -36,7 +36,7 @@ #custom_metrics_mdm filter plugin type filter_cadvisor2mdm - custom_metrics_azure_regions 
eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral + custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,memoryRssBytes log_level info From 597b2fb3dd9a4e9a7f4f4ec8cef3a855526abbe0 Mon Sep 17 00:00:00 2001 From: bragi92 Date: Thu, 14 Nov 2019 10:48:48 -0800 Subject: [PATCH 142/160] Update MDM region list to include francecentral, japaneast and australiaeast --- installer/conf/kube.conf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/installer/conf/kube.conf b/installer/conf/kube.conf index 40f4ac880..49d0bf62e 100644 --- a/installer/conf/kube.conf +++ b/installer/conf/kube.conf @@ -70,14 +70,14 @@ type filter_inventory2mdm - custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral + custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast log_level info #custom_metrics_mdm filter plugin for perf data from windows nodes type filter_cadvisor2mdm - custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral + custom_metrics_azure_regions 
eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes log_level info @@ -247,4 +247,4 @@ retry_limit 10 retry_wait 30s max_retry_wait 9m - \ No newline at end of file + From cd1a37b72b1911eb657012668319763e0b3770da Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Thu, 14 Nov 2019 18:18:54 -0800 Subject: [PATCH 143/160] Send telemetry when there is error in calculation of state in percentage aggregation, and send state as unknown (#300) --- .../code/plugin/health/aggregate_monitor.rb | 38 ++++++++++++++++++- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/source/code/plugin/health/aggregate_monitor.rb b/source/code/plugin/health/aggregate_monitor.rb index 794f716ce..10dbdc705 100644 --- a/source/code/plugin/health/aggregate_monitor.rb +++ b/source/code/plugin/health/aggregate_monitor.rb @@ -3,6 +3,12 @@ require_relative 'health_model_constants' require 'json' +# Require only when running inside container. +# otherwise unit tests will fail due to ApplicationInsightsUtility dependency on base omsagent ruby files. If you have your dev machine starting with omsagent-rs, then GOOD LUCK! +if Socket.gethostname.start_with?('omsagent-rs') + require_relative '../ApplicationInsightsUtility' +end + module HealthModel class AggregateMonitor attr_accessor :monitor_id, :monitor_instance_id, :state, :transition_date_time, :aggregation_algorithm, :aggregation_algorithm_params, :labels, :is_aggregate_monitor, :details @@ -16,6 +22,8 @@ class AggregateMonitor MonitorState::NONE => 5 } + @@telemetry_sent_hash = {} + # constructor def initialize( monitor_id, @@ -127,17 +135,43 @@ def calculate_percentage_state(monitor_set) #sort #TODO: What if sorted_filtered is empty? is that even possible? 
+ log = HealthMonitorHelpers.get_log_handle sorted_filtered = sort_filter_member_monitors(monitor_set) state_threshold = @aggregation_algorithm_params['state_threshold'].to_f - size = sorted_filtered.size + if sorted_filtered.nil? + size = 0 + else + size = sorted_filtered.size + end + if size == 1 @state = sorted_filtered[0].state else count = ((state_threshold*size)/100).ceil index = size - count - @state = sorted_filtered[index].state + if sorted_filtered.nil? || sorted_filtered[index].nil? + @state = HealthMonitorStates::UNKNOWN + if !@@telemetry_sent_hash.key?(@monitor_instance_id) + log.debug "Adding to telemetry sent hash #{@monitor_instance_id}" + @@telemetry_sent_hash[@monitor_instance_id] = true + log.info "Index: #{index} size: #{size} Count: #{count}" + custom_error_event_map = {} + custom_error_event_map["count"] = count + custom_error_event_map["index"] = index + custom_error_event_map["size"] = size + if !sorted_filtered.nil? + sorted_filtered.each_index{|i| + custom_error_event_map[i] = sorted_filtered[i].state + } + end + ApplicationInsightsUtility.sendCustomEvent("PercentageStateCalculationErrorEvent", custom_error_event_map) + end + else + @state = sorted_filtered[index].state + end + @state end end From d6ea1896ae3f63307434fc9e37315f2c16db37d0 Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Mon, 25 Nov 2019 17:26:30 -0800 Subject: [PATCH 144/160] fix exceptions (#306) --- source/code/plugin/KubernetesApiClient.rb | 6 ++++-- source/code/plugin/in_kube_podinventory.rb | 10 ++++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/source/code/plugin/KubernetesApiClient.rb b/source/code/plugin/KubernetesApiClient.rb index 7b5a1cd24..6bfdc06f1 100644 --- a/source/code/plugin/KubernetesApiClient.rb +++ b/source/code/plugin/KubernetesApiClient.rb @@ -58,8 +58,10 @@ def getKubeResourceInfo(resource, api_group: nil) rescue => error @Log.warn("kubernetes api request failed: #{error} for #{resource} @ #{Time.now.utc.iso8601}") end 
- if (!response.nil? && !response.body.nil? && response.body.empty?) - @Log.warn("KubernetesAPIClient::getKubeResourceInfo : Got empty response from Kube API for #{resource} @ #{Time.now.utc.iso8601}") + if (!response.nil?) + if (!response.body.nil? && response.body.empty?) + @Log.warn("KubernetesAPIClient::getKubeResourceInfo : Got empty response from Kube API for #{resource} @ #{Time.now.utc.iso8601}") + end end return response end diff --git a/source/code/plugin/in_kube_podinventory.rb b/source/code/plugin/in_kube_podinventory.rb index 766831a66..1dd029b22 100644 --- a/source/code/plugin/in_kube_podinventory.rb +++ b/source/code/plugin/in_kube_podinventory.rb @@ -58,10 +58,16 @@ def enumerate(podList = nil) end begin - if (!podInventory.empty? && podInventory.key?("items") && !podInventory["items"].empty?) + if (!podInventory.nil? && !podInventory.empty? && podInventory.key?("items") && !podInventory["items"].empty?) #get pod inventory & services $log.info("in_kube_podinventory::enumerate : Getting services from Kube API @ #{Time.now.utc.iso8601}") - serviceList = JSON.parse(KubernetesApiClient.getKubeResourceInfo("services").body) + serviceList = nil + serviceInfo = KubernetesApiClient.getKubeResourceInfo("services") + + if !serviceInfo.nil? 
+ serviceList = JSON.parse(serviceInfo.body) + end + $log.info("in_kube_podinventory::enumerate : Done getting services from Kube API @ #{Time.now.utc.iso8601}") parse_and_emit_records(podInventory, serviceList) else From 3df0ab6567e4c39b686cfff31b5baf18013929cf Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Tue, 3 Dec 2019 16:20:40 -0800 Subject: [PATCH 145/160] Merge Branch morgan into ci_feature (#308) * Fixes : 1) Disable health (for time being) - in DS & RS 2) Disable MDM (for time being) - in DS & RS 3) Merge kubeperf into kubenode & kubepod 4) Made scheduling predictable for kubenode & kubepod 5) Enable containerlog enrichment fields (timeofcommand, containername & containerimage) as a configurable setting (default = true/ON) - Also add telemetry for it 6) Filter OUT type!=Normal events for k8s events 7) AppInsights telemetry async 8) Fix double calling bug in in_win_cadvisor_perf 9) Add connect timeout (20secs) & read timeout (40 secs) for all cadvisor api calls & also for all kubernetes api server calls 10) Fix batchTime for kubepods to be one before making api server call (rather than after making the call, which will make it fluctuate based on api server latency for the call) * fix setting issue for the new enrichcontainerlog setting * fix compilation issue * fix another compilation issue * fix emit issues * fix a nil issue * fix mising tag * * Fix all input plugins for scheduling issue * Merge kubeservices with kubepodinventory (reduce RS to API server by one more) * Remove Kubelogs (not used) * Fix liveness probe * Disable enrichment by default for container logs * Move to yajl json parser across the board for docker provier code * Remove unused files * fix removed files * fix timeofcommand and remove a duplicate entry for a health file. 
* Rashmi/http leak fixes (#301) * changes for http connection close * close socket in ensure * adding nil check * Rashmi/http leak fixes (#303) * changes for http connection close * close socket in ensure * adding nil check * adding missing end * use yajl for events & nodes parsing. * Rashmi/http leak fixes (#304) * changes for http connection close * close socket in ensure * adding nil check * Update MDM region list to include francecentral, japaneast and australiaeast * Update MDM region list to include francecentral, japaneast and australiaeast * adding missing end * Send telemetry when there is error in calculation of state in percentage aggregation, and send state as unknown (#300) * changes for chunking * telemetry changes * some fixes * bug fix * changing to have morgan changes only * add new line * use polltime for metrics and disable out_forward for health * enable mdm & health * few optimizations * do not remove time of command make kube.conf same as scale tested config * remove comments from container.conf * remove flush comment for ai telemetry * remove commented code lines * fix config * remove timeofcommand when enrichment==false * fix config * enable mdm filter * Rashmi/api chunk (#307) * changes * changes * refactor changes * changes * changes * changes * changes * node changes * changes * changes * changes * changes * adding open and read timeouts for api client * removing comments * updating chunk size --- installer/conf/container.conf | 33 +- installer/conf/kube.conf | 466 ++++++++---------- installer/datafiles/base_container.data | 5 - installer/scripts/livenessprobe.sh | 2 +- installer/scripts/tomlparser.rb | 12 + source/code/go/src/plugins/oms.go | 52 +- .../code/plugin/ApplicationInsightsUtility.rb | 44 +- .../code/plugin/CAdvisorMetricsAPIClient.rb | 66 +-- source/code/plugin/ContainerInventoryState.rb | 2 +- source/code/plugin/DockerApiClient.rb | 7 +- source/code/plugin/KubernetesApiClient.rb | 51 +- 
source/code/plugin/filter_cadvisor2mdm.rb | 2 +- .../filter_cadvisor_health_container.rb | 2 +- .../plugin/filter_cadvisor_health_node.rb | 2 +- source/code/plugin/filter_docker_log.rb | 1 + .../plugin/filter_health_model_builder.rb | 2 +- source/code/plugin/filter_inventory2mdm.rb | 2 +- .../code/plugin/health/aggregate_monitor.rb | 4 +- .../plugin/health/cluster_health_state.rb | 1 + .../health_container_cpu_memory_aggregator.rb | 6 +- ...h_container_cpu_memory_record_formatter.rb | 2 + .../plugin/health/health_hierarchy_builder.rb | 2 +- .../health/health_model_definition_parser.rb | 2 +- .../plugin/health/health_monitor_optimizer.rb | 1 + .../plugin/health/health_monitor_provider.rb | 1 + .../plugin/health/health_monitor_state.rb | 1 + .../plugin/health/health_monitor_utils.rb | 1 + source/code/plugin/health/unit_monitor.rb | 2 +- source/code/plugin/in_cadvisor_perf.rb | 26 +- source/code/plugin/in_containerinventory.rb | 20 +- .../code/plugin/in_containerlog_sudo_tail.rb | 189 ------- source/code/plugin/in_kube_events.rb | 150 +++--- source/code/plugin/in_kube_health.rb | 88 ++-- source/code/plugin/in_kube_logs.rb | 181 ------- source/code/plugin/in_kube_nodes.rb | 307 +++++++----- source/code/plugin/in_kube_perf.rb | 120 ----- source/code/plugin/in_kube_podinventory.rb | 220 ++++++--- source/code/plugin/in_kube_services.rb | 110 ----- source/code/plugin/in_win_cadvisor_perf.rb | 24 +- .../channel/contracts/json_serializable.rb | 2 +- .../channel/sender_base.rb | 2 +- source/code/plugin/out_mdm.rb | 2 +- 42 files changed, 967 insertions(+), 1248 deletions(-) mode change 100755 => 100644 installer/conf/container.conf delete mode 100644 source/code/plugin/in_containerlog_sudo_tail.rb delete mode 100644 source/code/plugin/in_kube_logs.rb delete mode 100644 source/code/plugin/in_kube_perf.rb delete mode 100644 source/code/plugin/in_kube_services.rb diff --git a/installer/conf/container.conf b/installer/conf/container.conf old mode 100755 new mode 100644 index 
696ffdb6b..93c250fbb --- a/installer/conf/container.conf +++ b/installer/conf/container.conf @@ -11,7 +11,7 @@ type containerinventory tag oms.containerinsights.containerinventory - run_interval 60s + run_interval 60 log_level debug @@ -19,7 +19,7 @@ type cadvisorperf tag oms.api.cadvisorperf - run_interval 60s + run_interval 60 log_level debug @@ -45,30 +45,28 @@ type out_oms log_level debug num_threads 5 - buffer_chunk_limit 20m buffer_type file buffer_path %STATE_DIR_WS%/out_oms_containerinventory*.buffer - buffer_queue_limit 20 buffer_queue_full_action drop_oldest_chunk + buffer_chunk_limit 4m flush_interval 20s retry_limit 10 - retry_wait 30s - max_retry_wait 9m + retry_wait 5s + max_retry_wait 5m type out_oms log_level debug num_threads 5 - buffer_chunk_limit 20m buffer_type file buffer_path %STATE_DIR_WS%/out_oms_cadvisorperf*.buffer - buffer_queue_limit 20 buffer_queue_full_action drop_oldest_chunk + buffer_chunk_limit 4m flush_interval 20s retry_limit 10 - retry_wait 30s - max_retry_wait 9m + retry_wait 5s + max_retry_wait 5m @@ -80,6 +78,14 @@ heartbeat_type tcp skip_network_error_at_init true expire_dns_cache 600s + buffer_queue_full_action drop_oldest_chunk + buffer_type file + buffer_path %STATE_DIR_WS%/out_health_forward*.buffer + buffer_chunk_limit 3m + flush_interval 20s + retry_limit 10 + retry_wait 5s + max_retry_wait 5m host "#{ENV['HEALTHMODEL_REPLICASET_SERVICE_SERVICE_HOST']}" @@ -96,14 +102,13 @@ type out_mdm log_level debug num_threads 5 - buffer_chunk_limit 20m buffer_type file buffer_path %STATE_DIR_WS%/out_mdm_cdvisorperf*.buffer - buffer_queue_limit 20 buffer_queue_full_action drop_oldest_chunk + buffer_chunk_limit 4m flush_interval 20s retry_limit 10 - retry_wait 30s - max_retry_wait 9m + retry_wait 5s + max_retry_wait 5m retry_mdm_post_wait_minutes 60 diff --git a/installer/conf/kube.conf b/installer/conf/kube.conf index 49d0bf62e..207780442 100644 --- a/installer/conf/kube.conf +++ b/installer/conf/kube.conf @@ -1,250 +1,218 @@ # 
Fluentd config file for OMS Docker - cluster components (kubeAPI) - - type forward - port "#{ENV['HEALTHMODEL_REPLICASET_SERVICE_SERVICE_PORT']}" - bind 0.0.0.0 - - -#Kubernetes pod inventory - - type kubepodinventory - tag oms.containerinsights.KubePodInventory - run_interval 60s - log_level debug - - -#Kubernetes events - - type kubeevents - tag oms.containerinsights.KubeEvents - run_interval 60s - log_level debug - - -#Kubernetes logs - - type kubelogs - tag oms.api.KubeLogs - run_interval 60s - - -#Kubernetes services - - type kubeservices - tag oms.containerinsights.KubeServices - run_interval 60s - log_level debug - - -#Kubernetes Nodes - - type kubenodeinventory - tag oms.containerinsights.KubeNodeInventory - run_interval 60s - log_level debug - - -#Kubernetes perf - - type kubeperf - tag oms.api.KubePerf - run_interval 60s - log_level debug - - -#Kubernetes health - - type kubehealth - tag kubehealth.ReplicaSet - run_interval 60s - log_level debug - - -#cadvisor perf- Windows nodes - - type wincadvisorperf - tag oms.api.wincadvisorperf - run_interval 60s - log_level debug - - - - type filter_inventory2mdm - custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast - log_level info - - -#custom_metrics_mdm filter plugin for perf data from windows nodes - - type filter_cadvisor2mdm - custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast - metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes - log_level info - - - - type filter_health_model_builder - - - type out_mdm - log_level debug - num_threads 5 - buffer_chunk_limit 20m - buffer_type file - buffer_path 
%STATE_DIR_WS%/out_mdm_cdvisorperf*.buffer - buffer_queue_limit 20 - buffer_queue_full_action drop_oldest_chunk - flush_interval 20s - retry_limit 10 - retry_wait 30s - max_retry_wait 9m - retry_mdm_post_wait_minutes 60 - - - - type out_oms - log_level debug - num_threads 5 - buffer_chunk_limit 20m - buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_kubepods*.buffer - buffer_queue_limit 20 - buffer_queue_full_action drop_oldest_chunk - flush_interval 20s - retry_limit 10 - retry_wait 30s - max_retry_wait 9m - - - - type out_oms - log_level debug - num_threads 5 - buffer_chunk_limit 5m - buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_kubeevents*.buffer - buffer_queue_limit 10 - buffer_queue_full_action drop_oldest_chunk - flush_interval 20s - retry_limit 10 - retry_wait 30s - max_retry_wait 9m - - - - type out_oms_api - log_level debug - buffer_chunk_limit 10m - buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_api_kubernetes_logs*.buffer - buffer_queue_limit 10 - flush_interval 20s - retry_limit 10 - retry_wait 30s - - - - - - type out_oms - log_level debug - num_threads 5 - buffer_chunk_limit 20m - buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_kubeservices*.buffer - buffer_queue_limit 20 - buffer_queue_full_action drop_oldest_chunk - flush_interval 20s - retry_limit 10 - retry_wait 30s - max_retry_wait 9m - - - - type out_oms - log_level debug - num_threads 5 - buffer_chunk_limit 20m - buffer_type file - buffer_path %STATE_DIR_WS%/state/out_oms_kubenodes*.buffer - buffer_queue_limit 20 - buffer_queue_full_action drop_oldest_chunk - flush_interval 20s - retry_limit 10 - retry_wait 30s - max_retry_wait 9m - - - - type out_oms - log_level debug - buffer_chunk_limit 20m - buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_containernodeinventory*.buffer - buffer_queue_limit 20 - flush_interval 20s - retry_limit 10 - retry_wait 15s - max_retry_wait 9m - - - - type out_oms - log_level debug - num_threads 5 - buffer_chunk_limit 20m - buffer_type 
file - buffer_path %STATE_DIR_WS%/out_oms_kubeperf*.buffer - buffer_queue_limit 20 - buffer_queue_full_action drop_oldest_chunk - flush_interval 20s - retry_limit 10 - retry_wait 30s - max_retry_wait 9m - - - - type out_mdm - log_level debug - num_threads 5 - buffer_chunk_limit 20m - buffer_type file - buffer_path %STATE_DIR_WS%/out_mdm_*.buffer - buffer_queue_limit 20 - buffer_queue_full_action drop_oldest_chunk - flush_interval 20s - retry_limit 10 - retry_wait 30s - max_retry_wait 9m - retry_mdm_post_wait_minutes 60 - - - - type out_oms - log_level debug - num_threads 5 - buffer_chunk_limit 20m - buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_api_wincadvisorperf*.buffer - buffer_queue_limit 20 - buffer_queue_full_action drop_oldest_chunk - flush_interval 20s - retry_limit 10 - retry_wait 30s - max_retry_wait 9m - - - - type out_oms - log_level debug - num_threads 5 - buffer_chunk_limit 20m - buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_kubehealth*.buffer - buffer_queue_limit 20 - buffer_queue_full_action drop_oldest_chunk - flush_interval 20s - retry_limit 10 - retry_wait 30s - max_retry_wait 9m - + #fluent forward plugin + + type forward + port "#{ENV['HEALTHMODEL_REPLICASET_SERVICE_SERVICE_PORT']}" + bind 0.0.0.0 + chunk_size_limit 4m + + + #Kubernetes pod inventory + + type kubepodinventory + tag oms.containerinsights.KubePodInventory + run_interval 60 + log_level debug + + + #Kubernetes events + + type kubeevents + tag oms.containerinsights.KubeEvents + run_interval 60 + log_level debug + + + #Kubernetes Nodes + + type kubenodeinventory + tag oms.containerinsights.KubeNodeInventory + run_interval 60 + log_level debug + + + #Kubernetes health + + type kubehealth + tag kubehealth.ReplicaSet + run_interval 60 + log_level debug + + + #cadvisor perf- Windows nodes + + type wincadvisorperf + tag oms.api.wincadvisorperf + run_interval 60 + log_level debug + + + + type filter_inventory2mdm + custom_metrics_azure_regions 
eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast + log_level info + + + #custom_metrics_mdm filter plugin for perf data from windows nodes + + type filter_cadvisor2mdm + custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast + metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes + log_level info + + + #health model aggregation filter + + type filter_health_model_builder + + + + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 4m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_kubepods*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 5s + max_retry_wait 5m + + + + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 4m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_kubeevents*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 5s + max_retry_wait 5m + + + + type out_oms + log_level debug + num_threads 2 + buffer_chunk_limit 4m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_kubeservices*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 5s + max_retry_wait 5m + + + + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 4m + buffer_type file + buffer_path %STATE_DIR_WS%/state/out_oms_kubenodes*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 5s + max_retry_wait 5m + + + + type out_oms + log_level debug + 
num_threads 3 + buffer_chunk_limit 4m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_containernodeinventory*.buffer + buffer_queue_limit 20 + flush_interval 20s + retry_limit 10 + retry_wait 5s + max_retry_wait 5m + + + + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 4m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_kubeperf*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 5s + max_retry_wait 5m + + + + type out_mdm + log_level debug + num_threads 5 + buffer_chunk_limit 4m + buffer_type file + buffer_path %STATE_DIR_WS%/out_mdm_*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 5s + max_retry_wait 5m + retry_mdm_post_wait_minutes 60 + + + + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 4m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_api_wincadvisorperf*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 5s + max_retry_wait 5m + + + + type out_mdm + log_level debug + num_threads 5 + buffer_chunk_limit 4m + buffer_type file + buffer_path %STATE_DIR_WS%/out_mdm_cdvisorperf*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 5s + max_retry_wait 5m + retry_mdm_post_wait_minutes 60 + + + + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 4m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_kubehealth*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 5s + max_retry_wait 5m + \ No newline at end of file diff --git a/installer/datafiles/base_container.data b/installer/datafiles/base_container.data index 4ebc4f338..60de5af18 100644 --- a/installer/datafiles/base_container.data +++ 
b/installer/datafiles/base_container.data @@ -26,16 +26,13 @@ MAINTAINER: 'Microsoft Corporation' /opt/microsoft/omsagent/plugin/in_kube_podinventory.rb; source/code/plugin/in_kube_podinventory.rb; 644; root; root /opt/microsoft/omsagent/plugin/in_kube_events.rb; source/code/plugin/in_kube_events.rb; 644; root; root -/opt/microsoft/omsagent/plugin/in_kube_logs.rb; source/code/plugin/in_kube_logs.rb; 644; root; root /opt/microsoft/omsagent/plugin/KubernetesApiClient.rb; source/code/plugin/KubernetesApiClient.rb; 644; root; root /etc/opt/microsoft/docker-cimprov/container.conf; installer/conf/container.conf; 644; root; root /opt/microsoft/omsagent/plugin/CAdvisorMetricsAPIClient.rb; source/code/plugin/CAdvisorMetricsAPIClient.rb; 644; root; root -/opt/microsoft/omsagent/plugin/in_kube_perf.rb; source/code/plugin/in_kube_perf.rb; 644; root; root /opt/microsoft/omsagent/plugin/in_cadvisor_perf.rb; source/code/plugin/in_cadvisor_perf.rb; 644; root; root /opt/microsoft/omsagent/plugin/in_win_cadvisor_perf.rb; source/code/plugin/in_win_cadvisor_perf.rb; 644; root; root -/opt/microsoft/omsagent/plugin/in_kube_services.rb; source/code/plugin/in_kube_services.rb; 644; root; root /opt/microsoft/omsagent/plugin/in_kube_nodes.rb; source/code/plugin/in_kube_nodes.rb; 644; root; root /opt/microsoft/omsagent/plugin/filter_inventory2mdm.rb; source/code/plugin/filter_inventory2mdm.rb; 644; root; root /opt/microsoft/omsagent/plugin/CustomMetricsUtils.rb; source/code/plugin/CustomMetricsUtils.rb; 644; root; root @@ -143,12 +140,10 @@ MAINTAINER: 'Microsoft Corporation' /opt/microsoft/omsagent/plugin/health/health_model_definition_parser.rb; source/code/plugin/health/health_model_definition_parser.rb; 644; root; root /opt/microsoft/omsagent/plugin/health/health_monitor_helpers.rb; source/code/plugin/health/health_monitor_helpers.rb; 644; root; root /opt/microsoft/omsagent/plugin/health/health_monitor_optimizer.rb; source/code/plugin/health/health_monitor_optimizer.rb; 644; root; root 
-/opt/microsoft/omsagent/plugin/health/health_monitor_helpers.rb; source/code/plugin/health/health_monitor_helpers.rb; 644; root; root /opt/microsoft/omsagent/plugin/health/health_monitor_provider.rb; source/code/plugin/health/health_monitor_provider.rb; 644; root; root /opt/microsoft/omsagent/plugin/health/health_monitor_record.rb; source/code/plugin/health/health_monitor_record.rb; 644; root; root /opt/microsoft/omsagent/plugin/health/health_monitor_state.rb; source/code/plugin/health/health_monitor_state.rb; 644; root; root /opt/microsoft/omsagent/plugin/health/health_monitor_telemetry.rb; source/code/plugin/health/health_monitor_telemetry.rb; 644; root; root -/opt/microsoft/omsagent/plugin/health/health_monitor_helpers.rb; source/code/plugin/health/health_monitor_helpers.rb; 644; root; root /opt/microsoft/omsagent/plugin/health/health_monitor_utils.rb; source/code/plugin/health/health_monitor_utils.rb; 644; root; root /opt/microsoft/omsagent/plugin/health/health_signal_reducer.rb; source/code/plugin/health/health_signal_reducer.rb; 644; root; root /opt/microsoft/omsagent/plugin/health/monitor_factory.rb; source/code/plugin/health/monitor_factory.rb; 644; root; root diff --git a/installer/scripts/livenessprobe.sh b/installer/scripts/livenessprobe.sh index cb7e8a0ba..e957b4bdf 100644 --- a/installer/scripts/livenessprobe.sh +++ b/installer/scripts/livenessprobe.sh @@ -1,7 +1,7 @@ #!/bin/bash #test to exit non zero value -(ps -ef | grep omsagent | grep -v "grep") && (ps -ef | grep td-agent-bit | grep -v "grep") +(ps -ef | grep omsagent- | grep -v "grep") && (ps -ef | grep td-agent-bit | grep -v "grep") if [ $? -eq 0 ] && [ ! 
-s "inotifyoutput.txt" ] then # inotifyoutput file is empty and the grep commands for omsagent and td-agent-bit succeeded diff --git a/installer/scripts/tomlparser.rb b/installer/scripts/tomlparser.rb index cd16cbf9b..ba67d023a 100644 --- a/installer/scripts/tomlparser.rb +++ b/installer/scripts/tomlparser.rb @@ -15,6 +15,7 @@ @logTailPath = "/var/log/containers/*.log" @logExclusionRegexPattern = "(^((?!stdout|stderr).)*$)" @excludePath = "*.csv2" #some invalid path +@enrichContainerLogs = false # Use parser to parse the configmap toml file to a ruby structure def parseConfigMap @@ -117,6 +118,16 @@ def populateSettingValuesFromConfigMap(parsedConfig) rescue => errorStr ConfigParseErrorLogger.logError("Exception while reading config map settings for cluster level environment variable collection - #{errorStr}, using defaults, please check config map for errors") end + + #Get container log enrichment setting + begin + if !parsedConfig[:log_collection_settings][:enrich_container_logs].nil? && !parsedConfig[:log_collection_settings][:enrich_container_logs][:enabled].nil? 
+ @enrichContainerLogs = parsedConfig[:log_collection_settings][:enrich_container_logs][:enabled] + puts "config::Using config map setting for cluster level container log enrichment" + end + rescue => errorStr + ConfigParseErrorLogger.logError("Exception while reading config map settings for cluster level container log enrichment - #{errorStr}, using defaults, please check config map for errors") + end end end @@ -156,6 +167,7 @@ def populateSettingValuesFromConfigMap(parsedConfig) file.write("export AZMON_STDERR_EXCLUDED_NAMESPACES=#{@stderrExcludeNamespaces}\n") file.write("export AZMON_CLUSTER_COLLECT_ENV_VAR=#{@collectClusterEnvVariables}\n") file.write("export AZMON_CLUSTER_LOG_TAIL_EXCLUDE_PATH=#{@excludePath}\n") + file.write("export AZMON_CLUSTER_CONTAINER_LOG_ENRICH=#{@enrichContainerLogs}\n") # Close file after writing all environment variables file.close puts "Both stdout & stderr log collection are turned off for namespaces: '#{@excludePath}' " diff --git a/source/code/go/src/plugins/oms.go b/source/code/go/src/plugins/oms.go index 5a323d7e0..834726c93 100644 --- a/source/code/go/src/plugins/oms.go +++ b/source/code/go/src/plugins/oms.go @@ -92,6 +92,8 @@ var ( ResourceName string //KubeMonAgentEvents skip first flush skipKubeMonEventsFlush bool + // enrich container logs (when true this will add the fields - timeofcommand, containername & containerimage) + enrichContainerLogs bool ) var ( @@ -746,16 +748,30 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { stringMap["Name"] = val } - dataItem := DataItem{ - ID: stringMap["Id"], - LogEntry: stringMap["LogEntry"], - LogEntrySource: stringMap["LogEntrySource"], - LogEntryTimeStamp: stringMap["LogEntryTimeStamp"], - LogEntryTimeOfCommand: start.Format(time.RFC3339), - SourceSystem: stringMap["SourceSystem"], - Computer: Computer, - Image: stringMap["Image"], - Name: stringMap["Name"], + var dataItem DataItem + if enrichContainerLogs == true { + dataItem = DataItem{ + ID: 
stringMap["Id"], + LogEntry: stringMap["LogEntry"], + LogEntrySource: stringMap["LogEntrySource"], + LogEntryTimeStamp: stringMap["LogEntryTimeStamp"], + LogEntryTimeOfCommand: start.Format(time.RFC3339), + SourceSystem: stringMap["SourceSystem"], + Computer: Computer, + Image: stringMap["Image"], + Name: stringMap["Name"], + } + } else { // dont collect timeofcommand field as its part of container log enrivhment + dataItem = DataItem{ + ID: stringMap["Id"], + LogEntry: stringMap["LogEntry"], + LogEntrySource: stringMap["LogEntrySource"], + LogEntryTimeStamp: stringMap["LogEntryTimeStamp"], + SourceSystem: stringMap["SourceSystem"], + Computer: Computer, + Image: stringMap["Image"], + Name: stringMap["Name"], + } } FlushedRecordsSize += float64(len(stringMap["LogEntry"])) @@ -892,6 +908,15 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { // Initilizing this to true to skip the first kubemonagentevent flush since the errors are not populated at this time skipKubeMonEventsFlush = true + enrichContainerLogsSetting := os.Getenv("AZMON_CLUSTER_CONTAINER_LOG_ENRICH") + if (strings.Compare(enrichContainerLogsSetting, "true") == 0) { + enrichContainerLogs = true + Log("ContainerLogEnrichment=true \n") + } else { + enrichContainerLogs = false + Log("ContainerLogEnrichment=false \n") + } + pluginConfig, err := ReadConfiguration(pluginConfPath) if err != nil { message := fmt.Sprintf("Error Reading plugin config path : %s \n", err.Error()) @@ -989,7 +1014,12 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { if strings.Compare(strings.ToLower(os.Getenv("CONTROLLER_TYPE")), "daemonset") == 0 { populateExcludedStdoutNamespaces() populateExcludedStderrNamespaces() - go updateContainerImageNameMaps() + if enrichContainerLogs == true { + Log("ContainerLogEnrichment=true; starting goroutine to update containerimagenamemaps \n") + go updateContainerImageNameMaps() + } else { + Log("ContainerLogEnrichment=false \n") + } // Flush config error 
records every hour go flushKubeMonAgentEventRecords() diff --git a/source/code/plugin/ApplicationInsightsUtility.rb b/source/code/plugin/ApplicationInsightsUtility.rb index 85b424e69..f7bd806a0 100644 --- a/source/code/plugin/ApplicationInsightsUtility.rb +++ b/source/code/plugin/ApplicationInsightsUtility.rb @@ -6,7 +6,7 @@ class ApplicationInsightsUtility require_relative "omslog" require_relative "DockerApiClient" require_relative "oms_common" - require "json" + require 'yajl/json_gem' require "base64" @@HeartBeat = "HeartBeatEvent" @@ -73,16 +73,37 @@ def initializeUtility() @@Tc = ApplicationInsights::TelemetryClient.new elsif !encodedAppInsightsKey.nil? decodedAppInsightsKey = Base64.decode64(encodedAppInsightsKey) + #override ai endpoint if its available otherwise use default. if appInsightsEndpoint && !appInsightsEndpoint.nil? && !appInsightsEndpoint.empty? $log.info("AppInsightsUtility: Telemetry client uses overrided endpoint url : #{appInsightsEndpoint}") - telemetrySynchronousSender = ApplicationInsights::Channel::SynchronousSender.new appInsightsEndpoint - telemetrySynchronousQueue = ApplicationInsights::Channel::SynchronousQueue.new(telemetrySynchronousSender) - telemetryChannel = ApplicationInsights::Channel::TelemetryChannel.new nil, telemetrySynchronousQueue + #telemetrySynchronousSender = ApplicationInsights::Channel::SynchronousSender.new appInsightsEndpoint + #telemetrySynchronousQueue = ApplicationInsights::Channel::SynchronousQueue.new(telemetrySynchronousSender) + #telemetryChannel = ApplicationInsights::Channel::TelemetryChannel.new nil, telemetrySynchronousQueue + sender = ApplicationInsights::Channel::AsynchronousSender.new appInsightsEndpoint + queue = ApplicationInsights::Channel::AsynchronousQueue.new sender + channel = ApplicationInsights::Channel::TelemetryChannel.new nil, queue @@Tc = ApplicationInsights::TelemetryClient.new decodedAppInsightsKey, telemetryChannel else - @@Tc = ApplicationInsights::TelemetryClient.new 
decodedAppInsightsKey + sender = ApplicationInsights::Channel::AsynchronousSender.new + queue = ApplicationInsights::Channel::AsynchronousQueue.new sender + channel = ApplicationInsights::Channel::TelemetryChannel.new nil, queue + @@Tc = ApplicationInsights::TelemetryClient.new decodedAppInsightsKey, channel end + # The below are default recommended values. If you change these, ensure you test telemetry flow fully + + # flush telemetry if we have 10 or more telemetry items in our queue + #@@Tc.channel.queue.max_queue_length = 10 + + # send telemetry to the service in batches of 5 + #@@Tc.channel.sender.send_buffer_size = 5 + + # the background worker thread will be active for 5 seconds before it shuts down. if + # during this time items are picked up from the queue, the timer is reset. + #@@Tc.channel.sender.send_time = 5 + + # the background worker thread will poll the queue every 0.5 seconds for new items + #@@Tc.channel.sender.send_interval = 0.5 end rescue => errorStr $log.warn("Exception in AppInsightsUtility: initilizeUtility - error: #{errorStr}") @@ -102,8 +123,7 @@ def sendHeartBeatEvent(pluginName) eventName = pluginName + @@HeartBeat if !(@@Tc.nil?) 
@@Tc.track_event eventName, :properties => @@CustomProperties - @@Tc.flush - $log.info("AppInsights Heartbeat Telemetry sent successfully") + $log.info("AppInsights Heartbeat Telemetry put successfully into the queue") end rescue => errorStr $log.warn("Exception in AppInsightsUtility: sendHeartBeatEvent - error: #{errorStr}") @@ -116,8 +136,7 @@ def sendLastProcessedContainerInventoryCountMetric(pluginName, properties) @@Tc.track_metric "LastProcessedContainerInventoryCount", properties["ContainerCount"], :kind => ApplicationInsights::Channel::Contracts::DataPointType::MEASUREMENT, :properties => @@CustomProperties - @@Tc.flush - $log.info("AppInsights Container Count Telemetry sent successfully") + $log.info("AppInsights Container Count Telemetry sput successfully into the queue") end rescue => errorStr $log.warn("Exception in AppInsightsUtility: sendCustomMetric - error: #{errorStr}") @@ -138,7 +157,6 @@ def sendCustomEvent(eventName, properties) end if !(@@Tc.nil?) @@Tc.track_event eventName, :properties => telemetryProps - @@Tc.flush $log.info("AppInsights Custom Event #{eventName} sent successfully") end rescue => errorStr @@ -162,8 +180,7 @@ def sendExceptionTelemetry(errorStr, properties = nil) end if !(@@Tc.nil?) 
@@Tc.track_exception errorStr, :properties => telemetryProps - @@Tc.flush - $log.info("AppInsights Exception Telemetry sent successfully") + $log.info("AppInsights Exception Telemetry put successfully into the queue") end rescue => errorStr $log.warn("Exception in AppInsightsUtility: sendExceptionTelemetry - error: #{errorStr}") @@ -209,8 +226,7 @@ def sendMetricTelemetry(metricName, metricValue, properties) @@Tc.track_metric metricName, metricValue, :kind => ApplicationInsights::Channel::Contracts::DataPointType::MEASUREMENT, :properties => telemetryProps - @@Tc.flush - $log.info("AppInsights metric Telemetry #{metricName} sent successfully") + $log.info("AppInsights metric Telemetry #{metricName} put successfully into the queue") end rescue => errorStr $log.warn("Exception in AppInsightsUtility: sendMetricTelemetry - error: #{errorStr}") diff --git a/source/code/plugin/CAdvisorMetricsAPIClient.rb b/source/code/plugin/CAdvisorMetricsAPIClient.rb index 09499b4cf..be61b8b8f 100644 --- a/source/code/plugin/CAdvisorMetricsAPIClient.rb +++ b/source/code/plugin/CAdvisorMetricsAPIClient.rb @@ -2,12 +2,13 @@ # frozen_string_literal: true class CAdvisorMetricsAPIClient - require "json" + require 'yajl/json_gem' require "logger" require "net/http" require "net/https" require "uri" require "date" + require "time" require_relative "oms_common" require_relative "KubernetesApiClient" @@ -21,6 +22,7 @@ class CAdvisorMetricsAPIClient @clusterLogTailExcludPath = ENV["AZMON_CLUSTER_LOG_TAIL_EXCLUDE_PATH"] @clusterLogTailPath = ENV["AZMON_LOG_TAIL_PATH"] @clusterAgentSchemaVersion = ENV["AZMON_AGENT_CFG_SCHEMA_VERSION"] + @clusterContainerLogEnrich = ENV["AZMON_CLUSTER_CONTAINER_LOG_ENRICH"] @dsPromInterval = ENV["TELEMETRY_DS_PROM_INTERVAL"] @dsPromFieldPassCount = ENV["TELEMETRY_DS_PROM_FIELDPASS_LENGTH"] @@ -64,12 +66,11 @@ def getSummaryStatsFromCAdvisor(winNode) cAdvisorUri = getCAdvisorUri(winNode) if !cAdvisorUri.nil? 
uri = URI.parse(cAdvisorUri) - http = Net::HTTP.new(uri.host, uri.port) - http.use_ssl = false - - cAdvisorApiRequest = Net::HTTP::Get.new(uri.request_uri) - response = http.request(cAdvisorApiRequest) - @Log.info "Got response code #{response.code} from #{uri.request_uri}" + Net::HTTP.start(uri.host, uri.port, :use_ssl => false, :open_timeout => 20, :read_timeout => 40 ) do |http| + cAdvisorApiRequest = Net::HTTP::Get.new(uri.request_uri) + response = http.request(cAdvisorApiRequest) + @Log.info "Got response code #{response.code} from #{uri.request_uri}" + end end rescue => error @Log.warn("CAdvisor api request failed: #{error}") @@ -103,7 +104,7 @@ def getCAdvisorUri(winNode) end end - def getMetrics(winNode = nil) + def getMetrics(winNode: nil, metricTime: Time.now.utc.iso8601 ) metricDataItems = [] begin cAdvisorStats = getSummaryStatsFromCAdvisor(winNode) @@ -122,27 +123,27 @@ def getMetrics(winNode = nil) operatingSystem = "Linux" end if !metricInfo.nil? - metricDataItems.concat(getContainerMemoryMetricItems(metricInfo, hostName, "workingSetBytes", "memoryWorkingSetBytes")) - metricDataItems.concat(getContainerStartTimeMetricItems(metricInfo, hostName, "restartTimeEpoch")) + metricDataItems.concat(getContainerMemoryMetricItems(metricInfo, hostName, "workingSetBytes", "memoryWorkingSetBytes", metricTime)) + metricDataItems.concat(getContainerStartTimeMetricItems(metricInfo, hostName, "restartTimeEpoch", metricTime)) if operatingSystem == "Linux" - metricDataItems.concat(getContainerCpuMetricItems(metricInfo, hostName, "usageNanoCores", "cpuUsageNanoCores")) - metricDataItems.concat(getContainerMemoryMetricItems(metricInfo, hostName, "rssBytes", "memoryRssBytes")) - metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "memory", "rssBytes", "memoryRssBytes")) + metricDataItems.concat(getContainerCpuMetricItems(metricInfo, hostName, "usageNanoCores", "cpuUsageNanoCores", metricTime)) + metricDataItems.concat(getContainerMemoryMetricItems(metricInfo, 
hostName, "rssBytes", "memoryRssBytes", metricTime)) + metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "memory", "rssBytes", "memoryRssBytes", metricTime)) elsif operatingSystem == "Windows" - containerCpuUsageNanoSecondsRate = getContainerCpuMetricItemRate(metricInfo, hostName, "usageCoreNanoSeconds", "cpuUsageNanoCores") + containerCpuUsageNanoSecondsRate = getContainerCpuMetricItemRate(metricInfo, hostName, "usageCoreNanoSeconds", "cpuUsageNanoCores", metricTime) if containerCpuUsageNanoSecondsRate && !containerCpuUsageNanoSecondsRate.empty? && !containerCpuUsageNanoSecondsRate.nil? metricDataItems.concat(containerCpuUsageNanoSecondsRate) end end - cpuUsageNanoSecondsRate = getNodeMetricItemRate(metricInfo, hostName, "cpu", "usageCoreNanoSeconds", "cpuUsageNanoCores", operatingSystem) + cpuUsageNanoSecondsRate = getNodeMetricItemRate(metricInfo, hostName, "cpu", "usageCoreNanoSeconds", "cpuUsageNanoCores", operatingSystem, metricTime) if cpuUsageNanoSecondsRate && !cpuUsageNanoSecondsRate.empty? && !cpuUsageNanoSecondsRate.nil? metricDataItems.push(cpuUsageNanoSecondsRate) end - metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "memory", "workingSetBytes", "memoryWorkingSetBytes")) + metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "memory", "workingSetBytes", "memoryWorkingSetBytes", metricTime)) - metricDataItems.push(getNodeLastRebootTimeMetric(metricInfo, hostName, "restartTimeEpoch")) + metricDataItems.push(getNodeLastRebootTimeMetric(metricInfo, hostName, "restartTimeEpoch", metricTime)) # Disabling networkRxRate and networkTxRate since we dont use it as of now. 
#metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "network", "rxBytes", "networkRxBytes")) @@ -165,7 +166,7 @@ def getMetrics(winNode = nil) return metricDataItems end - def getContainerCpuMetricItems(metricJSON, hostName, cpuMetricNameToCollect, metricNametoReturn) + def getContainerCpuMetricItems(metricJSON, hostName, cpuMetricNameToCollect, metricNametoReturn, metricPollTime) metricItems = [] clusterId = KubernetesApiClient.getClusterId timeDifference = (DateTime.now.to_time.to_i - @@telemetryCpuMetricTimeTracker).abs @@ -182,7 +183,7 @@ def getContainerCpuMetricItems(metricJSON, hostName, cpuMetricNameToCollect, met #cpu metric containerName = container["name"] metricValue = container["cpu"][cpuMetricNameToCollect] - metricTime = container["cpu"]["time"] + metricTime = metricPollTime #container["cpu"]["time"] metricItem = {} metricItem["DataItems"] = [] @@ -219,6 +220,7 @@ def getContainerCpuMetricItems(metricJSON, hostName, cpuMetricNameToCollect, met telemetryProps["clusterlogtailexcludepath"] = @clusterLogTailExcludPath telemetryProps["clusterLogTailPath"] = @clusterLogTailPath telemetryProps["clusterAgentSchemaVersion"] = @clusterAgentSchemaVersion + telemetryProps["clusterCLEnrich"] = @clusterContainerLogEnrich end #telemetry about prometheus metric collections settings for daemonset if (File.file?(@promConfigMountPath)) @@ -272,7 +274,7 @@ def resetWinContainerIdCache end # usageNanoCores doesnt exist for windows nodes. 
Hence need to compute this from usageCoreNanoSeconds - def getContainerCpuMetricItemRate(metricJSON, hostName, cpuMetricNameToCollect, metricNametoReturn) + def getContainerCpuMetricItemRate(metricJSON, hostName, cpuMetricNameToCollect, metricNametoReturn, metricPollTime) metricItems = [] clusterId = KubernetesApiClient.getClusterId timeDifference = (DateTime.now.to_time.to_i - @@telemetryCpuMetricTimeTracker).abs @@ -292,7 +294,7 @@ def getContainerCpuMetricItemRate(metricJSON, hostName, cpuMetricNameToCollect, containerCount += 1 containerName = container["name"] metricValue = container["cpu"][cpuMetricNameToCollect] - metricTime = container["cpu"]["time"] + metricTime = metricPollTime #container["cpu"]["time"] metricItem = {} metricItem["DataItems"] = [] @@ -366,7 +368,7 @@ def getContainerCpuMetricItemRate(metricJSON, hostName, cpuMetricNameToCollect, return metricItems end - def getContainerMemoryMetricItems(metricJSON, hostName, memoryMetricNameToCollect, metricNametoReturn) + def getContainerMemoryMetricItems(metricJSON, hostName, memoryMetricNameToCollect, metricNametoReturn, metricPollTime) metricItems = [] clusterId = KubernetesApiClient.getClusterId timeDifference = (DateTime.now.to_time.to_i - @@telemetryMemoryMetricTimeTracker).abs @@ -381,7 +383,7 @@ def getContainerMemoryMetricItems(metricJSON, hostName, memoryMetricNameToCollec pod["containers"].each do |container| containerName = container["name"] metricValue = container["memory"][memoryMetricNameToCollect] - metricTime = container["memory"]["time"] + metricTime = metricPollTime #container["memory"]["time"] metricItem = {} metricItem["DataItems"] = [] @@ -431,7 +433,7 @@ def getContainerMemoryMetricItems(metricJSON, hostName, memoryMetricNameToCollec return metricItems end - def getNodeMetricItem(metricJSON, hostName, metricCategory, metricNameToCollect, metricNametoReturn) + def getNodeMetricItem(metricJSON, hostName, metricCategory, metricNameToCollect, metricNametoReturn, metricPollTime) 
metricItem = {} clusterId = KubernetesApiClient.getClusterId begin @@ -441,7 +443,7 @@ def getNodeMetricItem(metricJSON, hostName, metricCategory, metricNameToCollect, if !node[metricCategory].nil? metricValue = node[metricCategory][metricNameToCollect] - metricTime = node[metricCategory]["time"] + metricTime = metricPollTime #node[metricCategory]["time"] metricItem["DataItems"] = [] @@ -467,7 +469,7 @@ def getNodeMetricItem(metricJSON, hostName, metricCategory, metricNameToCollect, return metricItem end - def getNodeMetricItemRate(metricJSON, hostName, metricCategory, metricNameToCollect, metricNametoReturn, operatingSystem) + def getNodeMetricItemRate(metricJSON, hostName, metricCategory, metricNameToCollect, metricNametoReturn, operatingSystem, metricPollTime) metricItem = {} clusterId = KubernetesApiClient.getClusterId begin @@ -477,7 +479,7 @@ def getNodeMetricItemRate(metricJSON, hostName, metricCategory, metricNameToColl if !node[metricCategory].nil? metricValue = node[metricCategory][metricNameToCollect] - metricTime = node[metricCategory]["time"] + metricTime = metricPollTime #node[metricCategory]["time"] # if !(metricNameToCollect == "rxBytes" || metricNameToCollect == "txBytes" || metricNameToCollect == "usageCoreNanoSeconds") # @Log.warn("getNodeMetricItemRate : rateMetric is supported only for rxBytes, txBytes & usageCoreNanoSeconds and not for #{metricNameToCollect}") @@ -584,7 +586,7 @@ def getNodeMetricItemRate(metricJSON, hostName, metricCategory, metricNameToColl return metricItem end - def getNodeLastRebootTimeMetric(metricJSON, hostName, metricNametoReturn) + def getNodeLastRebootTimeMetric(metricJSON, hostName, metricNametoReturn, metricPollTime) metricItem = {} clusterId = KubernetesApiClient.getClusterId @@ -594,7 +596,7 @@ def getNodeLastRebootTimeMetric(metricJSON, hostName, metricNametoReturn) nodeName = node["nodeName"] metricValue = node["startTime"] - metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z + metricTime = metricPollTime 
#Time.now.utc.iso8601 #2018-01-30T19:36:14Z metricItem["DataItems"] = [] @@ -620,10 +622,10 @@ def getNodeLastRebootTimeMetric(metricJSON, hostName, metricNametoReturn) return metricItem end - def getContainerStartTimeMetricItems(metricJSON, hostName, metricNametoReturn) + def getContainerStartTimeMetricItems(metricJSON, hostName, metricNametoReturn, metricPollTime) metricItems = [] clusterId = KubernetesApiClient.getClusterId - currentTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z + #currentTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z begin metricInfo = metricJSON metricInfo["pods"].each do |pod| @@ -632,7 +634,7 @@ def getContainerStartTimeMetricItems(metricJSON, hostName, metricNametoReturn) pod["containers"].each do |container| containerName = container["name"] metricValue = container["startTime"] - metricTime = currentTime + metricTime = metricPollTime #currentTime metricItem = {} metricItem["DataItems"] = [] diff --git a/source/code/plugin/ContainerInventoryState.rb b/source/code/plugin/ContainerInventoryState.rb index 7e5ca18e8..170fa65e3 100644 --- a/source/code/plugin/ContainerInventoryState.rb +++ b/source/code/plugin/ContainerInventoryState.rb @@ -2,7 +2,7 @@ # frozen_string_literal: true class ContainerInventoryState - require 'json' + require 'yajl/json_gem' require_relative 'omslog' @@InventoryDirectory = "/var/opt/microsoft/docker-cimprov/state/ContainerInventory/" diff --git a/source/code/plugin/DockerApiClient.rb b/source/code/plugin/DockerApiClient.rb index ee2742dd4..f2828b357 100644 --- a/source/code/plugin/DockerApiClient.rb +++ b/source/code/plugin/DockerApiClient.rb @@ -3,7 +3,7 @@ class DockerApiClient require "socket" - require "json" + require "yajl/json_gem" require "timeout" require_relative "omslog" require_relative "DockerApiRestHelper" @@ -40,7 +40,6 @@ def getResponse(request, isMultiJson, isVersion) end break if (isVersion) ? (responseChunk.length < @@ChunkSize) : (responseChunk.end_with? 
"0\r\n\r\n") end - socket.close return (isTimeOut) ? nil : parseResponse(dockerResponse, isMultiJson) rescue => errorStr $log.warn("Socket call failed for request: #{request} error: #{errorStr} , isMultiJson: #{isMultiJson} @ #{Time.now.utc.iso8601}") @@ -49,6 +48,10 @@ def getResponse(request, isMultiJson, isVersion) ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) end return nil + ensure + if !socket.nil? + socket.close + end end end diff --git a/source/code/plugin/KubernetesApiClient.rb b/source/code/plugin/KubernetesApiClient.rb index 6bfdc06f1..e52c77884 100644 --- a/source/code/plugin/KubernetesApiClient.rb +++ b/source/code/plugin/KubernetesApiClient.rb @@ -2,7 +2,7 @@ # frozen_string_literal: true class KubernetesApiClient - require "json" + require "yajl/json_gem" require "logger" require "net/http" require "net/https" @@ -40,20 +40,17 @@ def getKubeResourceInfo(resource, api_group: nil) resourceUri = getResourceUri(resource, api_group) if !resourceUri.nil? uri = URI.parse(resourceUri) - http = Net::HTTP.new(uri.host, uri.port) - http.use_ssl = true if !File.exist?(@@CaFile) raise "#{@@CaFile} doesnt exist" else - http.ca_file = @@CaFile if File.exist?(@@CaFile) + Net::HTTP.start(uri.host, uri.port, :use_ssl => true, :ca_file => @@CaFile, :verify_mode => OpenSSL::SSL::VERIFY_PEER, :open_timeout => 20, :read_timeout => 40) do |http| + kubeApiRequest = Net::HTTP::Get.new(uri.request_uri) + kubeApiRequest["Authorization"] = "Bearer " + getTokenStr + @Log.info "KubernetesAPIClient::getKubeResourceInfo : Making request to #{uri.request_uri} @ #{Time.now.utc.iso8601}" + response = http.request(kubeApiRequest) + @Log.info "KubernetesAPIClient::getKubeResourceInfo : Got response of #{response.code} for #{uri.request_uri} @ #{Time.now.utc.iso8601}" + end end - http.verify_mode = OpenSSL::SSL::VERIFY_PEER - - kubeApiRequest = Net::HTTP::Get.new(uri.request_uri) - kubeApiRequest["Authorization"] = "Bearer " + getTokenStr - @Log.info 
"KubernetesAPIClient::getKubeResourceInfo : Making request to #{uri.request_uri} @ #{Time.now.utc.iso8601}" - response = http.request(kubeApiRequest) - @Log.info "KubernetesAPIClient::getKubeResourceInfo : Got response of #{response.code} for #{uri.request_uri} @ #{Time.now.utc.iso8601}" end rescue => error @Log.warn("kubernetes api request failed: #{error} for #{resource} @ #{Time.now.utc.iso8601}") @@ -338,7 +335,7 @@ def getContainerLogsSinceTime(namespace, pod, container, since, showTimeStamp) return containerLogs end - def getContainerResourceRequestsAndLimits(metricJSON, metricCategory, metricNameToCollect, metricNametoReturn) + def getContainerResourceRequestsAndLimits(metricJSON, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601) metricItems = [] begin clusterId = getClusterId @@ -373,7 +370,7 @@ def getContainerResourceRequestsAndLimits(metricJSON, metricCategory, metricName nodeName = pod["spec"]["nodeName"] podContainers.each do |container| containerName = container["name"] - metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z + #metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z if (!container["resources"].nil? && !container["resources"].empty? && !container["resources"][metricCategory].nil? && !container["resources"][metricCategory][metricNameToCollect].nil?) 
metricValue = getMetricNumericValue(metricNameToCollect, container["resources"][metricCategory][metricNameToCollect]) @@ -433,14 +430,14 @@ def getContainerResourceRequestsAndLimits(metricJSON, metricCategory, metricName return metricItems end #getContainerResourceRequestAndLimits - def parseNodeLimits(metricJSON, metricCategory, metricNameToCollect, metricNametoReturn) + def parseNodeLimits(metricJSON, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601) metricItems = [] begin metricInfo = metricJSON clusterId = getClusterId #Since we are getting all node data at the same time and kubernetes doesnt specify a timestamp for the capacity and allocation metrics, #if we are coming up with the time it should be same for all nodes - metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z + #metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z metricInfo["items"].each do |node| if (!node["status"][metricCategory].nil?) @@ -551,5 +548,29 @@ def getMetricNumericValue(metricName, metricVal) end return metricValue end # getMetricNumericValue + + def getResourcesAndContinuationToken(uri) + continuationToken = nil + resourceInventory = nil + begin + @Log.info "KubernetesApiClient::getResourcesAndContinuationToken : Getting resources from Kube API using url: #{uri} @ #{Time.now.utc.iso8601}" + resourceInfo = getKubeResourceInfo(uri) + @Log.info "KubernetesApiClient::getResourcesAndContinuationToken : Done getting resources from Kube API using url: #{uri} @ #{Time.now.utc.iso8601}" + if !resourceInfo.nil? + @Log.info "KubernetesApiClient::getResourcesAndContinuationToken:Start:Parsing data for #{uri} using yajl @ #{Time.now.utc.iso8601}" + resourceInventory = Yajl::Parser.parse(StringIO.new(resourceInfo.body)) + @Log.info "KubernetesApiClient::getResourcesAndContinuationToken:End:Parsing data for #{uri} using yajl @ #{Time.now.utc.iso8601}" + resourceInfo = nil + end + if (!resourceInventory.nil? && !resourceInventory["metadata"].nil?) 
+ continuationToken = resourceInventory["metadata"]["continue"] + end + rescue => errorStr + @Log.warn "KubernetesApiClient::getResourcesAndContinuationToken:Failed in get resources for #{uri} and continuation token: #{errorStr}" + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + resourceInventory = nil + end + return continuationToken, resourceInventory + end #getResourcesAndContinuationToken end end diff --git a/source/code/plugin/filter_cadvisor2mdm.rb b/source/code/plugin/filter_cadvisor2mdm.rb index a6e643e45..f14a1369b 100644 --- a/source/code/plugin/filter_cadvisor2mdm.rb +++ b/source/code/plugin/filter_cadvisor2mdm.rb @@ -4,7 +4,7 @@ module Fluent require 'logger' - require 'json' + require 'yajl/json_gem' require_relative 'oms_common' require_relative 'CustomMetricsUtils' diff --git a/source/code/plugin/filter_cadvisor_health_container.rb b/source/code/plugin/filter_cadvisor_health_container.rb index 2eccd125f..93d50e20f 100644 --- a/source/code/plugin/filter_cadvisor_health_container.rb +++ b/source/code/plugin/filter_cadvisor_health_container.rb @@ -3,7 +3,7 @@ module Fluent require 'logger' - require 'json' + require 'yajl/json_gem' require_relative 'oms_common' require_relative "ApplicationInsightsUtility" Dir[File.join(__dir__, './health', '*.rb')].each { |file| require file } diff --git a/source/code/plugin/filter_cadvisor_health_node.rb b/source/code/plugin/filter_cadvisor_health_node.rb index d2f735cd1..c6280db60 100644 --- a/source/code/plugin/filter_cadvisor_health_node.rb +++ b/source/code/plugin/filter_cadvisor_health_node.rb @@ -3,7 +3,7 @@ module Fluent require 'logger' - require 'json' + require 'yajl/json_gem' require_relative 'oms_common' require_relative "ApplicationInsightsUtility" require_relative "KubernetesApiClient" diff --git a/source/code/plugin/filter_docker_log.rb b/source/code/plugin/filter_docker_log.rb index 7ffd333e3..b80f4c204 100644 --- a/source/code/plugin/filter_docker_log.rb +++ 
b/source/code/plugin/filter_docker_log.rb @@ -5,6 +5,7 @@ module Fluent require 'logger' require 'socket' + require 'yajl/json_gem' class DockerLogFilter < Filter Plugin.register_filter('filter_docker_log', self) diff --git a/source/code/plugin/filter_health_model_builder.rb b/source/code/plugin/filter_health_model_builder.rb index 1724065fe..1c451ea38 100644 --- a/source/code/plugin/filter_health_model_builder.rb +++ b/source/code/plugin/filter_health_model_builder.rb @@ -4,7 +4,7 @@ module Fluent require 'logger' - require 'json' + require 'yajl/json_gem' Dir[File.join(__dir__, './health', '*.rb')].each { |file| require file } diff --git a/source/code/plugin/filter_inventory2mdm.rb b/source/code/plugin/filter_inventory2mdm.rb index 30f6f911a..422b4b54a 100644 --- a/source/code/plugin/filter_inventory2mdm.rb +++ b/source/code/plugin/filter_inventory2mdm.rb @@ -4,7 +4,7 @@ module Fluent require 'logger' - require 'json' + require 'yajl/json_gem' require_relative 'oms_common' require_relative 'CustomMetricsUtils' diff --git a/source/code/plugin/health/aggregate_monitor.rb b/source/code/plugin/health/aggregate_monitor.rb index 10dbdc705..a774478e7 100644 --- a/source/code/plugin/health/aggregate_monitor.rb +++ b/source/code/plugin/health/aggregate_monitor.rb @@ -1,7 +1,7 @@ # frozen_string_literal: true require_relative 'health_model_constants' -require 'json' +require 'yajl/json_gem' # Require only when running inside container. # otherwise unit tests will fail due to ApplicationInsightsUtility dependency on base omsagent ruby files. If you have your dev machine starting with omsagent-rs, then GOOD LUCK! 
@@ -218,7 +218,7 @@ def sort_filter_member_monitors(monitor_set) member_monitors.push(member_monitor) } - filtered = member_monitors.select{|monitor| monitor.state != MonitorState::NONE} + filtered = member_monitors.keep_if{|monitor| monitor.state != MonitorState::NONE} sorted = filtered.sort_by{ |monitor| [@@sort_key_order[monitor.state]] } return sorted diff --git a/source/code/plugin/health/cluster_health_state.rb b/source/code/plugin/health/cluster_health_state.rb index fa9cb42b2..e46d0bf5f 100644 --- a/source/code/plugin/health/cluster_health_state.rb +++ b/source/code/plugin/health/cluster_health_state.rb @@ -3,6 +3,7 @@ require "net/http" require "net/https" require "uri" +require 'yajl/json_gem' module HealthModel class ClusterHealthState diff --git a/source/code/plugin/health/health_container_cpu_memory_aggregator.rb b/source/code/plugin/health/health_container_cpu_memory_aggregator.rb index 29ac91bde..e93c66c14 100644 --- a/source/code/plugin/health/health_container_cpu_memory_aggregator.rb +++ b/source/code/plugin/health/health_container_cpu_memory_aggregator.rb @@ -64,8 +64,8 @@ def initialize(resources, provider) def dedupe_records(container_records) cpu_deduped_instances = {} memory_deduped_instances = {} - container_records = container_records.select{|record| record['CounterName'] == @@memory_counter_name || record['CounterName'] == @@cpu_counter_name} - + container_records = container_records.keep_if{|record| record['CounterName'] == @@memory_counter_name || record['CounterName'] == @@cpu_counter_name} + container_records.each do |record| begin instance_name = record["InstanceName"] @@ -98,7 +98,7 @@ def dedupe_records(container_records) def aggregate(container_records) #filter and select only cpuUsageNanoCores and memoryRssBytes - container_records = container_records.select{|record| record['CounterName'] == @@memory_counter_name || record['CounterName'] == @@cpu_counter_name} + container_records = container_records.keep_if{|record| 
record['CounterName'] == @@memory_counter_name || record['CounterName'] == @@cpu_counter_name} # poduid lookup has poduid/cname --> workload_name, namespace, cpu_limit, memory limit mapping # from the container records, extract the poduid/cname, get the values from poduid_lookup, and aggregate based on namespace_workload_cname container_records.each do |record| diff --git a/source/code/plugin/health/health_container_cpu_memory_record_formatter.rb b/source/code/plugin/health/health_container_cpu_memory_record_formatter.rb index 0c3f061f1..12c72a120 100644 --- a/source/code/plugin/health/health_container_cpu_memory_record_formatter.rb +++ b/source/code/plugin/health/health_container_cpu_memory_record_formatter.rb @@ -1,5 +1,7 @@ # frozen_string_literal: true +require 'yajl/json_gem' + module HealthModel class HealthContainerCpuMemoryRecordFormatter diff --git a/source/code/plugin/health/health_hierarchy_builder.rb b/source/code/plugin/health/health_hierarchy_builder.rb index bb48e083b..a59020996 100644 --- a/source/code/plugin/health/health_hierarchy_builder.rb +++ b/source/code/plugin/health/health_hierarchy_builder.rb @@ -1,6 +1,6 @@ # frozen_string_literal: true +require 'yajl/json_gem' -require 'json' module HealthModel class HealthHierarchyBuilder diff --git a/source/code/plugin/health/health_model_definition_parser.rb b/source/code/plugin/health/health_model_definition_parser.rb index 91f8cd24f..c185e5389 100644 --- a/source/code/plugin/health/health_model_definition_parser.rb +++ b/source/code/plugin/health/health_model_definition_parser.rb @@ -3,7 +3,7 @@ Class to parse the health model definition. 
The definition expresses the relationship between monitors, how to roll up to an aggregate monitor, and what labels to "pass on" to the parent monitor =end -require 'json' +require 'yajl/json_gem' module HealthModel class HealthModelDefinitionParser diff --git a/source/code/plugin/health/health_monitor_optimizer.rb b/source/code/plugin/health/health_monitor_optimizer.rb index a63d59abf..d87540941 100644 --- a/source/code/plugin/health/health_monitor_optimizer.rb +++ b/source/code/plugin/health/health_monitor_optimizer.rb @@ -1,4 +1,5 @@ # frozen_string_literal: true +require 'yajl/json_gem' module HealthModel class HealthMonitorOptimizer #ctor diff --git a/source/code/plugin/health/health_monitor_provider.rb b/source/code/plugin/health/health_monitor_provider.rb index b36c46370..8e1d11143 100644 --- a/source/code/plugin/health/health_monitor_provider.rb +++ b/source/code/plugin/health/health_monitor_provider.rb @@ -1,5 +1,6 @@ # frozen_string_literal: true require_relative 'health_model_constants' +require 'yajl/json_gem' module HealthModel class HealthMonitorProvider diff --git a/source/code/plugin/health/health_monitor_state.rb b/source/code/plugin/health/health_monitor_state.rb index 16f8bedc4..110793eeb 100644 --- a/source/code/plugin/health/health_monitor_state.rb +++ b/source/code/plugin/health/health_monitor_state.rb @@ -1,5 +1,6 @@ # frozen_string_literal: true require_relative 'health_model_constants' +require 'yajl/json_gem' module HealthModel diff --git a/source/code/plugin/health/health_monitor_utils.rb b/source/code/plugin/health/health_monitor_utils.rb index 2fa2d3a52..13d1416b1 100644 --- a/source/code/plugin/health/health_monitor_utils.rb +++ b/source/code/plugin/health/health_monitor_utils.rb @@ -2,6 +2,7 @@ require 'logger' require 'digest' require_relative 'health_model_constants' +require 'yajl/json_gem' module HealthModel # static class that provides a bunch of utility methods diff --git a/source/code/plugin/health/unit_monitor.rb 
b/source/code/plugin/health/unit_monitor.rb index 6454007b6..8e2de210b 100644 --- a/source/code/plugin/health/unit_monitor.rb +++ b/source/code/plugin/health/unit_monitor.rb @@ -1,6 +1,6 @@ # frozen_string_literal: true require_relative 'health_model_constants' -require 'json' +require 'yajl/json_gem' module HealthModel class UnitMonitor diff --git a/source/code/plugin/in_cadvisor_perf.rb b/source/code/plugin/in_cadvisor_perf.rb index 810fb512f..96aa66aa1 100644 --- a/source/code/plugin/in_cadvisor_perf.rb +++ b/source/code/plugin/in_cadvisor_perf.rb @@ -9,14 +9,15 @@ class CAdvisor_Perf_Input < Input def initialize super require "yaml" - require "json" + require 'yajl/json_gem' + require "time" require_relative "CAdvisorMetricsAPIClient" require_relative "oms_common" require_relative "omslog" end - config_param :run_interval, :time, :default => "1m" + config_param :run_interval, :time, :default => 60 config_param :tag, :string, :default => "oms.api.cadvisorperf" config_param :mdmtag, :string, :default => "mdm.cadvisorperf" config_param :nodehealthtag, :string, :default => "kubehealth.DaemonSet.Node" @@ -46,10 +47,12 @@ def shutdown end def enumerate() - time = Time.now.to_f + currentTime = Time.now + time = currentTime.to_f + batchTime = currentTime.utc.iso8601 begin eventStream = MultiEventStream.new - metricData = CAdvisorMetricsAPIClient.getMetrics() + metricData = CAdvisorMetricsAPIClient.getMetrics(winNode: nil, metricTime: batchTime ) metricData.each do |record| record["DataType"] = "LINUX_PERF_BLOB" record["IPName"] = "LogManagement" @@ -74,14 +77,25 @@ def enumerate() def run_periodic @mutex.lock done = @finished + @nextTimeToRun = Time.now + @waitTimeout = @run_interval until done - @condition.wait(@mutex, @run_interval) + @nextTimeToRun = @nextTimeToRun + @run_interval + @now = Time.now + if @nextTimeToRun <= @now + @waitTimeout = 1 + @nextTimeToRun = @now + else + @waitTimeout = @nextTimeToRun - @now + end + @condition.wait(@mutex, @waitTimeout) done = 
@finished @mutex.unlock if !done begin - $log.info("in_cadvisor_perf::run_periodic @ #{Time.now.utc.iso8601}") + $log.info("in_cadvisor_perf::run_periodic.enumerate.start @ #{Time.now.utc.iso8601}") enumerate + $log.info("in_cadvisor_perf::run_periodic.enumerate.end @ #{Time.now.utc.iso8601}") rescue => errorStr $log.warn "in_cadvisor_perf::run_periodic: enumerate Failed to retrieve cadvisor perf metrics: #{errorStr}" end diff --git a/source/code/plugin/in_containerinventory.rb b/source/code/plugin/in_containerinventory.rb index ccf61ab2e..d107047b4 100644 --- a/source/code/plugin/in_containerinventory.rb +++ b/source/code/plugin/in_containerinventory.rb @@ -13,14 +13,15 @@ class Container_Inventory_Input < Input def initialize super - require "json" + require 'yajl/json_gem' + require "time" require_relative "DockerApiClient" require_relative "ContainerInventoryState" require_relative "ApplicationInsightsUtility" require_relative "omslog" end - config_param :run_interval, :time, :default => "1m" + config_param :run_interval, :time, :default => 60 config_param :tag, :string, :default => "oms.containerinsights.containerinventory" def configure(conf) @@ -259,14 +260,25 @@ def enumerate def run_periodic @mutex.lock done = @finished + @nextTimeToRun = Time.now + @waitTimeout = @run_interval until done - @condition.wait(@mutex, @run_interval) + @nextTimeToRun = @nextTimeToRun + @run_interval + @now = Time.now + if @nextTimeToRun <= @now + @waitTimeout = 1 + @nextTimeToRun = @now + else + @waitTimeout = @nextTimeToRun - @now + end + @condition.wait(@mutex, @waitTimeout) done = @finished @mutex.unlock if !done begin - $log.info("in_container_inventory::run_periodic @ #{Time.now.utc.iso8601}") + $log.info("in_container_inventory::run_periodic.enumerate.start @ #{Time.now.utc.iso8601}") enumerate + $log.info("in_container_inventory::run_periodic.enumerate.end @ #{Time.now.utc.iso8601}") rescue => errorStr $log.warn "in_container_inventory::run_periodic: Failed in enumerate 
container inventory: #{errorStr}" end diff --git a/source/code/plugin/in_containerlog_sudo_tail.rb b/source/code/plugin/in_containerlog_sudo_tail.rb deleted file mode 100644 index 8faa260d0..000000000 --- a/source/code/plugin/in_containerlog_sudo_tail.rb +++ /dev/null @@ -1,189 +0,0 @@ - -require 'yajl' -require 'fluent/input' -require 'fluent/event' -require 'fluent/config/error' -require 'fluent/parser' -require 'open3' -require 'json' -require_relative 'omslog' -require_relative 'KubernetesApiClient' - -module Fluent - class ContainerLogSudoTail < Input - Plugin.register_input('containerlog_sudo_tail', self) - - def initialize - super - @command = nil - @paths = [] - #Using this to construct the file path for all every container json log file. - #Example container log file path -> /var/lib/docker/containers/{ContainerID}/{ContainerID}-json.log - #We have read permission on this file but don't have execute permission on the below mentioned path. Hence wildcard character searches to find the container ID's doesn't work. - @containerLogFilePath = "/var/lib/docker/containers/" - #This folder contains a list of all the containers running/stopped and we're using it to get all the container ID's which will be needed for the log file path below - #TODO : Use generic path from docker REST endpoint and find a way to mount the correct folder in the omsagent.yaml - @containerIDFilePath = "/var/opt/microsoft/docker-cimprov/state/ContainerInventory/*" - @@systemPodsNamespace = 'kube-system' - @@getSystemPodsTimeIntervalSecs = 300 #refresh system container list every 5 minutes - @@lastSystemPodsGetTime = nil; - @@systemContainerIDList = Hash.new - @@disableKubeSystemLogCollection = ENV['DISABLE_KUBE_SYSTEM_LOG_COLLECTION'] - if !@@disableKubeSystemLogCollection.nil? && !@@disableKubeSystemLogCollection.empty? 
&& @@disableKubeSystemLogCollection.casecmp('true') == 0 - @@disableKubeSystemLogCollection = 'true' - $log.info("in_container_sudo_tail : System container log collection is disabled") - else - @@disableKubeSystemLogCollection = 'false' - $log.info("in_container_sudo_tail : System container log collection is enabled") - end - end - - attr_accessor :command - - #The format used to map the program output to the incoming event. - config_param :format, :string, default: 'none' - - #Tag of the event. - config_param :tag, :string, default: nil - - #Fluentd will record the position it last read into this file. - config_param :pos_file, :string, default: nil - - #The interval time between periodic program runs. - config_param :run_interval, :time, default: nil - - BASE_DIR = File.dirname(File.expand_path('..', __FILE__)) - RUBY_DIR = BASE_DIR + '/ruby/bin/ruby ' - TAILSCRIPT = BASE_DIR + '/plugin/containerlogtailfilereader.rb ' - - def configure(conf) - super - unless @pos_file - raise ConfigError, "'pos_file' is required to keep track of file" - end - - unless @tag - raise ConfigError, "'tag' is required on sudo tail" - end - - unless @run_interval - raise ConfigError, "'run_interval' is required for periodic tailing" - end - - @parser = Plugin.new_parser(conf['format']) - @parser.configure(conf) - end - - def start - @finished = false - @thread = Thread.new(&method(:run_periodic)) - end - - def shutdown - @finished = true - @thread.join - end - - def receive_data(line) - es = MultiEventStream.new - begin - line.chomp! # remove \n - @parser.parse(line) { |time, record| - if time && record - es.add(time, record) - else - $log.warn "pattern doesn't match: #{line.inspect}" - end - unless es.empty? 
- tag=@tag - router.emit_stream(tag, es) - end - } - rescue => e - $log.warn line.dump, error: e.to_s - $log.debug_backtrace(e.backtrace) - end - end - - def receive_log(line) - $log.warn "#{line}" if line.start_with?('WARN') - $log.error "#{line}" if line.start_with?('ERROR') - $log.info "#{line}" if line.start_with?('INFO') - end - - def readable_path(path) - if system("sudo test -r #{path}") - OMS::Log.info_once("Following tail of #{path}") - return path - else - OMS::Log.warn_once("#{path} is not readable. Cannot tail the file.") - return "" - end - end - - def set_system_command - timeNow = DateTime.now - cName = "Unkown" - tempContainerInfo = {} - paths = "" - - #if we are on agent & system containers log collection is disabled, get system containerIDs to exclude logs from containers in system containers namespace from being tailed - if !KubernetesApiClient.isNodeMaster && @@disableKubeSystemLogCollection.casecmp('true') == 0 - if @@lastSystemPodsGetTime.nil? || ((timeNow - @@lastSystemPodsGetTime)*24*60*60).to_i >= @@getSystemPodsTimeIntervalSecs - $log.info("in_container_sudo_tail : System Container list last refreshed at #{@@lastSystemPodsGetTime} - refreshing now at #{timeNow}") - sysContainers = KubernetesApiClient.getContainerIDs(@@systemPodsNamespace) - #BugBug - https://msecg.visualstudio.com/OMS/_workitems/edit/215107 - we get 200 with empty payloaf from time to time - if (!sysContainers.nil? && !sysContainers.empty?) - @@systemContainerIDList = sysContainers - else - $log.info("in_container_sudo_tail : System Container ID List is empty!!!! 
Continuing to use currently cached list.") - end - @@lastSystemPodsGetTime = timeNow - $log.info("in_container_sudo_tail : System Container ID List: #{@@systemContainerIDList}") - end - end - - Dir.glob(@containerIDFilePath).select { |p| - cName = p.split('/').last; - if !@@systemContainerIDList.key?("docker://" + cName) - p = @containerLogFilePath + cName + "/" + cName + "-json.log" - paths += readable_path(p) + " " - else - $log.info("in_container_sudo_tail : Excluding system container with ID #{cName} from tailng for log collection") - end - } - if !system("sudo test -r #{@pos_file}") - system("sudo touch #{@pos_file}") - end - @command = "sudo " << RUBY_DIR << TAILSCRIPT << paths << " -p #{@pos_file}" - end - - def run_periodic - until @finished - begin - sleep @run_interval - #if we are on master & system containers log collection is disabled, collect nothing (i.e NO COntainer log collection for ANY container) - #we will be not collection omsagent log as well in this case, but its insignificant & okay! 
- if !KubernetesApiClient.isNodeMaster || @@disableKubeSystemLogCollection.casecmp('true') != 0 - set_system_command - Open3.popen3(@command) {|writeio, readio, errio, wait_thread| - writeio.close - while line = readio.gets - receive_data(line) - end - while line = errio.gets - receive_log(line) - end - - wait_thread.value #wait until child process terminates - } - end - rescue - $log.error "containerlog_sudo_tail failed to run or shutdown child proces", error => $!.to_s, :error_class => $!.class.to_s - $log.warn_backtrace $!.backtrace - end - end - end - end - -end diff --git a/source/code/plugin/in_kube_events.rb b/source/code/plugin/in_kube_events.rb index e1fdc5df6..6116cb62d 100644 --- a/source/code/plugin/in_kube_events.rb +++ b/source/code/plugin/in_kube_events.rb @@ -9,15 +9,20 @@ class Kube_Event_Input < Input def initialize super - require "json" + require "yajl/json_gem" + require "yajl" + require "time" require_relative "KubernetesApiClient" require_relative "oms_common" require_relative "omslog" require_relative "ApplicationInsightsUtility" + + # 30000 events account to approximately 5MB + @EVENTS_CHUNK_SIZE = 30000 end - config_param :run_interval, :time, :default => "1m" + config_param :run_interval, :time, :default => 60 config_param :tag, :string, :default => "oms.containerinsights.KubeEvents" def configure(conf) @@ -43,79 +48,114 @@ def shutdown end end - def enumerate(eventList = nil) - currentTime = Time.now - emitTime = currentTime.to_f - batchTime = currentTime.utc.iso8601 + def enumerate + begin + eventList = nil + currentTime = Time.now + batchTime = currentTime.utc.iso8601 + eventQueryState = getEventQueryState + newEventQueryState = [] + + # Initializing continuation token to nil + continuationToken = nil + $log.info("in_kube_events::enumerate : Getting events from Kube API @ #{Time.now.utc.iso8601}") + continuationToken, eventList = 
KubernetesApiClient.getResourcesAndContinuationToken("events?fieldSelector=type!=Normal&limit=#{@EVENTS_CHUNK_SIZE}") + $log.info("in_kube_events::enumerate : Done getting events from Kube API @ #{Time.now.utc.iso8601}") + if (!eventList.nil? && !eventList.empty? && eventList.key?("items") && !eventList["items"].nil? && !eventList["items"].empty?) + newEventQueryState = parse_and_emit_records(eventList, eventQueryState, newEventQueryState, batchTime) + else + $log.warn "in_kube_events::enumerate:Received empty eventList" + end - events = eventList - $log.info("in_kube_events::enumerate : Getting events from Kube API @ #{Time.now.utc.iso8601}") - eventInfo = KubernetesApiClient.getKubeResourceInfo("events") - $log.info("in_kube_events::enumerate : Done getting events from Kube API @ #{Time.now.utc.iso8601}") + #If we receive a continuation token, make calls, process and flush data until we have processed all data + while (!continuationToken.nil? && !continuationToken.empty?) + continuationToken, eventList = KubernetesApiClient.getResourcesAndContinuationToken("events?fieldSelector=type!=Normal&limit=#{@EVENTS_CHUNK_SIZE}&continue=#{continuationToken}") + if (!eventList.nil? && !eventList.empty? && eventList.key?("items") && !eventList["items"].nil? && !eventList["items"].empty?) + newEventQueryState = parse_and_emit_records(eventList, eventQueryState, newEventQueryState, batchTime) + else + $log.warn "in_kube_events::enumerate:Received empty eventList" + end + end - if !eventInfo.nil? 
- events = JSON.parse(eventInfo.body) + # Setting this to nil so that we dont hold memory until GC kicks in + eventList = nil + writeEventQueryState(newEventQueryState) + rescue => errorStr + $log.warn "in_kube_events::enumerate:Failed in enumerate: #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) end + end # end enumerate - eventQueryState = getEventQueryState - newEventQueryState = [] + def parse_and_emit_records(events, eventQueryState, newEventQueryState, batchTime = Time.utc.iso8601) + currentTime = Time.now + emitTime = currentTime.to_f begin - if (!events.nil? && !events.empty? && !events["items"].nil?) - eventStream = MultiEventStream.new - events["items"].each do |items| - record = {} - # - Not sure if ingestion has the below mapping for this custom type. Fix it as part of fixed type conversion - record["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated - eventId = items["metadata"]["uid"] + "/" + items["count"].to_s - newEventQueryState.push(eventId) - if !eventQueryState.empty? 
&& eventQueryState.include?(eventId) - next - end - record["ObjectKind"] = items["involvedObject"]["kind"] - record["Namespace"] = items["involvedObject"]["namespace"] - record["Name"] = items["involvedObject"]["name"] - record["Reason"] = items["reason"] - record["Message"] = items["message"] - record["Type"] = items["type"] - record["TimeGenerated"] = items["metadata"]["creationTimestamp"] - record["SourceComponent"] = items["source"]["component"] - record["FirstSeen"] = items["firstTimestamp"] - record["LastSeen"] = items["lastTimestamp"] - record["Count"] = items["count"] - if items["source"].key?("host") - record["Computer"] = items["source"]["host"] - else - record["Computer"] = (OMS::Common.get_hostname) - end - record['ClusterName'] = KubernetesApiClient.getClusterName - record["ClusterId"] = KubernetesApiClient.getClusterId - wrapper = { - "DataType" => "KUBE_EVENTS_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [record.each { |k, v| record[k] = v }], - } - eventStream.add(emitTime, wrapper) if wrapper + eventStream = MultiEventStream.new + events["items"].each do |items| + record = {} + # - Not sure if ingestion has the below mapping for this custom type. Fix it as part of fixed type conversion + record["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated + eventId = items["metadata"]["uid"] + "/" + items["count"].to_s + newEventQueryState.push(eventId) + if !eventQueryState.empty? 
&& eventQueryState.include?(eventId) + next end - router.emit_stream(@tag, eventStream) if eventStream - end - writeEventQueryState(newEventQueryState) + record["ObjectKind"] = items["involvedObject"]["kind"] + record["Namespace"] = items["involvedObject"]["namespace"] + record["Name"] = items["involvedObject"]["name"] + record["Reason"] = items["reason"] + record["Message"] = items["message"] + record["Type"] = items["type"] + record["TimeGenerated"] = items["metadata"]["creationTimestamp"] + record["SourceComponent"] = items["source"]["component"] + record["FirstSeen"] = items["firstTimestamp"] + record["LastSeen"] = items["lastTimestamp"] + record["Count"] = items["count"] + if items["source"].key?("host") + record["Computer"] = items["source"]["host"] + else + record["Computer"] = (OMS::Common.get_hostname) + end + record["ClusterName"] = KubernetesApiClient.getClusterName + record["ClusterId"] = KubernetesApiClient.getClusterId + wrapper = { + "DataType" => "KUBE_EVENTS_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [record.each { |k, v| record[k] = v }], + } + eventStream.add(emitTime, wrapper) if wrapper + end + router.emit_stream(@tag, eventStream) if eventStream rescue => errorStr $log.debug_backtrace(errorStr.backtrace) ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) - end + end + return newEventQueryState end def run_periodic @mutex.lock done = @finished + @nextTimeToRun = Time.now + @waitTimeout = @run_interval until done - @condition.wait(@mutex, @run_interval) + @nextTimeToRun = @nextTimeToRun + @run_interval + @now = Time.now + if @nextTimeToRun <= @now + @waitTimeout = 1 + @nextTimeToRun = @now + else + @waitTimeout = @nextTimeToRun - @now + end + @condition.wait(@mutex, @waitTimeout) done = @finished @mutex.unlock if !done begin - $log.info("in_kube_events::run_periodic @ #{Time.now.utc.iso8601}") + $log.info("in_kube_events::run_periodic.enumerate.start @ #{Time.now.utc.iso8601}") enumerate + 
$log.info("in_kube_events::run_periodic.enumerate.end @ #{Time.now.utc.iso8601}") rescue => errorStr $log.warn "in_kube_events::run_periodic: enumerate Failed to retrieve kube events: #{errorStr}" ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) diff --git a/source/code/plugin/in_kube_health.rb b/source/code/plugin/in_kube_health.rb index 57ca07f64..0eebf395b 100644 --- a/source/code/plugin/in_kube_health.rb +++ b/source/code/plugin/in_kube_health.rb @@ -21,19 +21,22 @@ def initialize begin super require "yaml" - require "json" + require 'yajl/json_gem' + require "yajl" + require "time" @@cluster_id = KubernetesApiClient.getClusterId @resources = HealthKubernetesResources.instance @provider = HealthMonitorProvider.new(@@cluster_id, HealthMonitorUtils.get_cluster_labels, @resources, @health_monitor_config_path) @@ApiGroupApps = "apps" + @@KubeInfraNamespace = "kube-system" rescue => e ApplicationInsightsUtility.sendExceptionTelemetry(e, {"FeatureArea" => "Health"}) end end include HealthModel - config_param :run_interval, :time, :default => "1m" + config_param :run_interval, :time, :default => 60 config_param :tag, :string, :default => "kubehealth.ReplicaSet" def configure(conf) @@ -83,10 +86,11 @@ def enumerate #HealthMonitorUtils.refresh_kubernetes_api_data(@@hmlog, nil) # we do this so that if the call fails, we get a response code/header etc. 
node_inventory_response = KubernetesApiClient.getKubeResourceInfo("nodes") - node_inventory = JSON.parse(node_inventory_response.body) - pod_inventory_response = KubernetesApiClient.getKubeResourceInfo("pods") - pod_inventory = JSON.parse(pod_inventory_response.body) - replicaset_inventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo("replicasets", api_group: @@ApiGroupApps).body) + node_inventory = Yajl::Parser.parse(StringIO.new(node_inventory_response.body)) + pod_inventory_response = KubernetesApiClient.getKubeResourceInfo("pods?fieldSelector=metadata.namespace%3D#{@@KubeInfraNamespace}") + pod_inventory = Yajl::Parser.parse(StringIO.new(pod_inventory_response.body)) + replicaset_inventory_response = KubernetesApiClient.getKubeResourceInfo("replicasets?fieldSelector=metadata.namespace%3D#{@@KubeInfraNamespace}", api_group: @@ApiGroupApps) + replicaset_inventory = Yajl::Parser.parse(StringIO.new(replicaset_inventory_response.body)) @resources.node_inventory = node_inventory @resources.pod_inventory = pod_inventory @@ -108,8 +112,8 @@ def enumerate health_monitor_records.push(record) if record pods_ready_hash = HealthMonitorUtils.get_pods_ready_hash(@resources) - system_pods = pods_ready_hash.select { |k, v| v["namespace"] == "kube-system" } - workload_pods = pods_ready_hash.select { |k, v| v["namespace"] != "kube-system" } + system_pods = pods_ready_hash.keep_if { |k, v| v["namespace"] == @@KubeInfraNamespace } + workload_pods = Hash.new # pods_ready_hash.select{ |k, v| v["namespace"] != @@KubeInfraNamespace } system_pods_ready_percentage_records = process_pods_ready_percentage(system_pods, MonitorId::SYSTEM_WORKLOAD_PODS_READY_MONITOR_ID) system_pods_ready_percentage_records.each do |record| @@ -225,28 +229,28 @@ def process_pods_ready_percentage(pods_hash, config_monitor_id) hmlog = HealthMonitorUtils.get_log_handle records = [] - pods_hash.keys.each do |key| - workload_name = key - total_pods = pods_hash[workload_name]["totalPods"] - pods_ready = 
pods_hash[workload_name]["podsReady"] - namespace = pods_hash[workload_name]["namespace"] - workload_kind = pods_hash[workload_name]["kind"] - percent = pods_ready / total_pods * 100 - timestamp = Time.now.utc.iso8601 - - state = HealthMonitorUtils.compute_percentage_state(percent, monitor_config) - health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"totalPods" => total_pods, "podsReady" => pods_ready, "workload_name" => workload_name, "namespace" => namespace, "workload_kind" => workload_kind}} - monitor_instance_id = HealthMonitorUtils.get_monitor_instance_id(config_monitor_id, [@@cluster_id, namespace, workload_name]) - health_record = {} - time_now = Time.now.utc.iso8601 - health_record[HealthMonitorRecordFields::MONITOR_ID] = config_monitor_id - health_record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] = monitor_instance_id - health_record[HealthMonitorRecordFields::DETAILS] = health_monitor_record - health_record[HealthMonitorRecordFields::TIME_GENERATED] = time_now - health_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_now - health_record[HealthMonitorRecordFields::CLUSTER_ID] = @@cluster_id - records.push(health_record) - end + pods_hash.keys.each do |key| + workload_name = key + total_pods = pods_hash[workload_name]["totalPods"] + pods_ready = pods_hash[workload_name]["podsReady"] + namespace = pods_hash[workload_name]["namespace"] + workload_kind = pods_hash[workload_name]["kind"] + percent = pods_ready / total_pods * 100 + timestamp = Time.now.utc.iso8601 + + state = HealthMonitorUtils.compute_percentage_state(percent, monitor_config) + health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"totalPods" => total_pods, "podsReady" => pods_ready, "workload_name" => workload_name, "namespace" => namespace, "workload_kind" => workload_kind}} + monitor_instance_id = HealthMonitorUtils.get_monitor_instance_id(config_monitor_id, [@@cluster_id, namespace, workload_name]) + 
health_record = {} + time_now = Time.now.utc.iso8601 + health_record[HealthMonitorRecordFields::MONITOR_ID] = config_monitor_id + health_record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] = monitor_instance_id + health_record[HealthMonitorRecordFields::DETAILS] = health_monitor_record + health_record[HealthMonitorRecordFields::TIME_GENERATED] = time_now + health_record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED] = time_now + health_record[HealthMonitorRecordFields::CLUSTER_ID] = @@cluster_id + records.push(health_record) + end #@@hmlog.info "Successfully processed pods_ready_percentage for #{config_monitor_id} #{records.size}" return records end @@ -296,10 +300,11 @@ def process_node_condition_monitor(node_inventory) def initialize_inventory #this is required because there are other components, like the container cpu memory aggregator, that depends on the mapping being initialized node_inventory_response = KubernetesApiClient.getKubeResourceInfo("nodes") - node_inventory = JSON.parse(node_inventory_response.body) - pod_inventory_response = KubernetesApiClient.getKubeResourceInfo("pods") - pod_inventory = JSON.parse(pod_inventory_response.body) - replicaset_inventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo("replicasets", api_group: @@ApiGroupApps).body) + node_inventory = Yajl::Parser.parse(StringIO.new(node_inventory_response.body)) + pod_inventory_response = KubernetesApiClient.getKubeResourceInfo("pods?fieldSelector=metadata.namespace%3D#{@@KubeInfraNamespace}") + pod_inventory = Yajl::Parser.parse(StringIO.new(pod_inventory_response.body)) + replicaset_inventory_response = KubernetesApiClient.getKubeResourceInfo("replicasets?fieldSelector=metadata.namespace%3D#{@@KubeInfraNamespace}", api_group: @@ApiGroupApps) + replicaset_inventory = Yajl::Parser.parse(StringIO.new(replicaset_inventory_response.body)) @resources.node_inventory = node_inventory @resources.pod_inventory = pod_inventory @@ -310,14 +315,25 @@ def initialize_inventory def 
run_periodic @mutex.lock done = @finished + @nextTimeToRun = Time.now + @waitTimeout = @run_interval until done - @condition.wait(@mutex, @run_interval) + @nextTimeToRun = @nextTimeToRun + @run_interval + @now = Time.now + if @nextTimeToRun <= @now + @waitTimeout = 1 + @nextTimeToRun = @now + else + @waitTimeout = @nextTimeToRun - @now + end + @condition.wait(@mutex, @waitTimeout) done = @finished @mutex.unlock if !done begin - @@hmlog.info("in_kube_health::run_periodic @ #{Time.now.utc.iso8601}") + @@hmlog.info("in_kube_health::run_periodic.enumerate.start @ #{Time.now.utc.iso8601}") enumerate + @@hmlog.info("in_kube_health::run_periodic.enumerate.end @ #{Time.now.utc.iso8601}") rescue => errorStr @@hmlog.warn "in_kube_health::run_periodic: enumerate Failed for kubeapi sourced data health: #{errorStr}" ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) diff --git a/source/code/plugin/in_kube_logs.rb b/source/code/plugin/in_kube_logs.rb deleted file mode 100644 index 119473819..000000000 --- a/source/code/plugin/in_kube_logs.rb +++ /dev/null @@ -1,181 +0,0 @@ -#!/usr/local/bin/ruby -# frozen_string_literal: true - -module Fluent - - class Kube_Logs_Input < Input - Plugin.register_input('kubelogs', self) - - @@KubeLogsStateFile = "/var/opt/microsoft/docker-cimprov/state/KubeLogQueryState.yaml" - - def initialize - super - require 'yaml' - require 'date' - require 'time' - require 'json' - - require_relative 'KubernetesApiClient' - require_relative 'oms_common' - require_relative 'omslog' - end - - config_param :run_interval, :time, :default => '1m' - config_param :tag, :string, :default => "oms.api.KubeLogs" - - def configure (conf) - super - end - - def start - if @run_interval - @finished = false - @condition = ConditionVariable.new - @mutex = Mutex.new - @thread = Thread.new(&method(:run_periodic)) - end - end - - def shutdown - if @run_interval - @mutex.synchronize { - @finished = true - @condition.signal - } - @thread.join - end - end - - def 
enumerate(podList = nil) - - namespace = ENV['OMS_KUBERNETES_LOGS_NAMESPACE'] - if namespace.nil? || namespace.empty? - return - end - - time = Time.now.to_f - if podList.nil? - pods = KubernetesApiClient.getPods(namespace) - else - pods = podList - end - logQueryState = getLogQueryState - newLogQueryState = {} - - pods.each do |pod| - record = {} - begin - pod['status']['containerStatuses'].each do |container| - - # if container['state']['running'] - # puts container['name'] + ' is running' - # end - - timeStamp = DateTime.now - - containerId = pod['metadata']['namespace'] + "_" + pod['metadata']['name'] + "_" + container['name'] - if !logQueryState.empty? && logQueryState[containerId] - timeStamp = DateTime.parse(logQueryState[containerId]) - end - - # Try to get logs for the container - begin - $log.debug "Getting logs for #{container['name']}" - logs = KubernetesApiClient.getContainerLogsSinceTime(pod['metadata']['namespace'], pod['metadata']['name'], container['name'], timeStamp.rfc3339(9), true) - $log.debug "got something back" - - # By default we don't change the timestamp (if no logs were returned or if there was a (hopefully transient) error in retrieval - newLogQueryState[containerId] = timeStamp.rfc3339(9) - - if !logs || logs.empty? 
- $log.info "no logs returned" - else - $log.debug "response size is #{logs.length}" - lines = logs.split("\n") - index = -1 - - # skip duplicates - for i in 0...lines.count - dateTime = DateTime.parse(lines[i].split(" ").first) - if (dateTime.to_time - timeStamp.to_time) > 0.0 - index = i - break - end - end - - if index >= 0 - $log.debug "starting from line #{index}" - for i in index...lines.count - record['Namespace'] = pod['metadata']['namespace'] - record['Pod'] = pod['metadata']['name'] - record['Container'] = container['name'] - record['Message'] = lines[i][(lines[i].index(' ') + 1)..(lines[i].length - 1)] - record['TimeGenerated'] = lines[i].split(" ").first - record['Node'] = pod['spec']['nodeName'] - record['Computer'] = OMS::Common.get_hostname - record['ClusterName'] = KubernetesApiClient.getClusterName - router.emit(@tag, time, record) if record - end - newLogQueryState[containerId] = lines.last.split(" ").first - else - newLogQueryState[containerId] = DateTime.now.rfc3339(9) - end - end - rescue => logException - $log.warn "Failed to retrieve logs for container: #{logException}" - $log.debug_backtrace(logException.backtrace) - end - end - # Update log query state only if logging was succesfful. 
- # TODO: May have a few duplicate lines in case of - writeLogQueryState(newLogQueryState) - rescue => errorStr - $log.warn "Exception raised in enumerate: #{errorStr}" - $log.debug_backtrace(errorStr.backtrace) - end - end - end - - def run_periodic - @mutex.lock - done = @finished - until done - @condition.wait(@mutex, @run_interval) - done = @finished - @mutex.unlock - if !done - $log.debug "calling enumerate for KubeLogs" - enumerate - $log.debug "done with enumerate for KubeLogs" - end - @mutex.lock - end - @mutex.unlock - end - - def getLogQueryState - logQueryState = {} - begin - if File.file?(@@KubeLogsStateFile) - logQueryState = YAML.load_file(@@KubeLogsStateFile, {}) - end - rescue => errorStr - $log.warn "Failed to load query state #{errorStr}" - $log.debug_backtrace(errorStr.backtrace) - end - return logQueryState - end - - def writeLogQueryState(logQueryState) - begin - File.write(@@KubeLogsStateFile, logQueryState.to_yaml) - rescue => errorStr - $log.warn "Failed to write query state #{errorStr.to_s}" - $log.debug_backtrace(errorStr.backtrace) - end - end - - end # Kube_Log_Input - -end # module - diff --git a/source/code/plugin/in_kube_nodes.rb b/source/code/plugin/in_kube_nodes.rb index 0a0fd9d2e..fa0994f43 100644 --- a/source/code/plugin/in_kube_nodes.rb +++ b/source/code/plugin/in_kube_nodes.rb @@ -9,6 +9,7 @@ class Kube_nodeInventory_Input < Input @@MDMKubeNodeInventoryTag = "mdm.kubenodeinventory" @@promConfigMountPath = "/etc/config/settings/prometheus-data-collection-settings" @@AzStackCloudFileName = "/etc/kubernetes/host/azurestackcloud.json" + @@kubeperfTag = "oms.api.KubePerf" @@rsPromInterval = ENV["TELEMETRY_RS_PROM_INTERVAL"] @@rsPromFieldPassCount = ENV["TELEMETRY_RS_PROM_FIELDPASS_LENGTH"] @@ -21,15 +22,18 @@ class Kube_nodeInventory_Input < Input def initialize super require "yaml" - require "json" + require "yajl/json_gem" + require "yajl" + require "time" require_relative "KubernetesApiClient" require_relative 
"ApplicationInsightsUtility" require_relative "oms_common" require_relative "omslog" + @NODES_CHUNK_SIZE = "400" end - config_param :run_interval, :time, :default => "1m" + config_param :run_interval, :time, :default => 60 config_param :tag, :string, :default => "oms.containerinsights.KubeNodeInventory" def configure(conf) @@ -57,158 +61,217 @@ def shutdown end def enumerate - currentTime = Time.now - emitTime = currentTime.to_f - batchTime = currentTime.utc.iso8601 - telemetrySent = false + begin + nodeInventory = nil + currentTime = Time.now + batchTime = currentTime.utc.iso8601 - nodeInventory = nil + # Initializing continuation token to nil + continuationToken = nil + $log.info("in_kube_nodes::enumerate : Getting nodes from Kube API @ #{Time.now.utc.iso8601}") + continuationToken, nodeInventory = KubernetesApiClient.getResourcesAndContinuationToken("nodes?limit=#{@NODES_CHUNK_SIZE}") + $log.info("in_kube_nodes::enumerate : Done getting nodes from Kube API @ #{Time.now.utc.iso8601}") + if (!nodeInventory.nil? && !nodeInventory.empty? && nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) + parse_and_emit_records(nodeInventory, batchTime) + else + $log.warn "in_kube_nodes::enumerate:Received empty nodeInventory" + end - $log.info("in_kube_nodes::enumerate : Getting nodes from Kube API @ #{Time.now.utc.iso8601}") - nodeInfo = KubernetesApiClient.getKubeResourceInfo("nodes") - $log.info("in_kube_nodes::enumerate : Done getting nodes from Kube API @ #{Time.now.utc.iso8601}") + #If we receive a continuation token, make calls, process and flush data until we have processed all data + while (!continuationToken.nil? && !continuationToken.empty?) + continuationToken, nodeInventory = KubernetesApiClient.getResourcesAndContinuationToken("nodes?limit=#{@NODES_CHUNK_SIZE}&continue=#{continuationToken}") + if (!nodeInventory.nil? && !nodeInventory.empty? && nodeInventory.key?("items") && !nodeInventory["items"].nil? 
&& !nodeInventory["items"].empty?) + parse_and_emit_records(nodeInventory, batchTime) + else + $log.warn "in_kube_nodes::enumerate:Received empty nodeInventory" + end + end - if !nodeInfo.nil? - nodeInventory = JSON.parse(nodeInfo.body) + # Setting this to nil so that we dont hold memory until GC kicks in + nodeInventory = nil + rescue => errorStr + $log.warn "in_kube_nodes::enumerate:Failed in enumerate: #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) end + end # end enumerate + def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) begin - if (!nodeInventory.nil? && !nodeInventory.empty?) - eventStream = MultiEventStream.new - containerNodeInventoryEventStream = MultiEventStream.new - if !nodeInventory["items"].nil? - #get node inventory - nodeInventory["items"].each do |items| - record = {} - # Sending records for ContainerNodeInventory - containerNodeInventoryRecord = {} - containerNodeInventoryRecord["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated - containerNodeInventoryRecord["Computer"] = items["metadata"]["name"] - - record["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated - record["Computer"] = items["metadata"]["name"] - record["ClusterName"] = KubernetesApiClient.getClusterName - record["ClusterId"] = KubernetesApiClient.getClusterId - record["CreationTimeStamp"] = items["metadata"]["creationTimestamp"] - record["Labels"] = [items["metadata"]["labels"]] - record["Status"] = "" - - if !items["spec"]["providerID"].nil? && !items["spec"]["providerID"].empty? 
- if File.file?(@@AzStackCloudFileName) # existence of this file indicates agent running on azstack - record["KubernetesProviderID"] = "azurestack" - else - record["KubernetesProviderID"] = items["spec"]["providerID"] - end - else - record["KubernetesProviderID"] = "onprem" - end + currentTime = Time.now + emitTime = currentTime.to_f + telemetrySent = false + eventStream = MultiEventStream.new + containerNodeInventoryEventStream = MultiEventStream.new + #get node inventory + nodeInventory["items"].each do |items| + record = {} + # Sending records for ContainerNodeInventory + containerNodeInventoryRecord = {} + containerNodeInventoryRecord["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated + containerNodeInventoryRecord["Computer"] = items["metadata"]["name"] - # Refer to https://kubernetes.io/docs/concepts/architecture/nodes/#condition for possible node conditions. - # We check the status of each condition e.g. {"type": "OutOfDisk","status": "False"} . Based on this we - # populate the KubeNodeInventory Status field. A possible value for this field could be "Ready OutofDisk" - # implying that the node is ready for hosting pods, however its out of disk. - - if items["status"].key?("conditions") && !items["status"]["conditions"].empty? - allNodeConditions = "" - items["status"]["conditions"].each do |condition| - if condition["status"] == "True" - if !allNodeConditions.empty? - allNodeConditions = allNodeConditions + "," + condition["type"] - else - allNodeConditions = condition["type"] - end - end - #collect last transition to/from ready (no matter ready is true/false) - if condition["type"] == "Ready" && !condition["lastTransitionTime"].nil? 
- record["LastTransitionTimeReady"] = condition["lastTransitionTime"] - end - end + record["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated + record["Computer"] = items["metadata"]["name"] + record["ClusterName"] = KubernetesApiClient.getClusterName + record["ClusterId"] = KubernetesApiClient.getClusterId + record["CreationTimeStamp"] = items["metadata"]["creationTimestamp"] + record["Labels"] = [items["metadata"]["labels"]] + record["Status"] = "" + + if !items["spec"]["providerID"].nil? && !items["spec"]["providerID"].empty? + if File.file?(@@AzStackCloudFileName) # existence of this file indicates agent running on azstack + record["KubernetesProviderID"] = "azurestack" + else + record["KubernetesProviderID"] = items["spec"]["providerID"] + end + else + record["KubernetesProviderID"] = "onprem" + end + + # Refer to https://kubernetes.io/docs/concepts/architecture/nodes/#condition for possible node conditions. + # We check the status of each condition e.g. {"type": "OutOfDisk","status": "False"} . Based on this we + # populate the KubeNodeInventory Status field. A possible value for this field could be "Ready OutofDisk" + # implying that the node is ready for hosting pods, however its out of disk. + + if items["status"].key?("conditions") && !items["status"]["conditions"].empty? + allNodeConditions = "" + items["status"]["conditions"].each do |condition| + if condition["status"] == "True" if !allNodeConditions.empty? - record["Status"] = allNodeConditions + allNodeConditions = allNodeConditions + "," + condition["type"] + else + allNodeConditions = condition["type"] end end - - nodeInfo = items["status"]["nodeInfo"] - record["KubeletVersion"] = nodeInfo["kubeletVersion"] - record["KubeProxyVersion"] = nodeInfo["kubeProxyVersion"] - containerNodeInventoryRecord["OperatingSystem"] = nodeInfo["osImage"] - dockerVersion = nodeInfo["containerRuntimeVersion"] - dockerVersion.slice! 
"docker://" - containerNodeInventoryRecord["DockerVersion"] = dockerVersion - # ContainerNodeInventory data for docker version and operating system. - containerNodeInventoryWrapper = { - "DataType" => "CONTAINER_NODE_INVENTORY_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [containerNodeInventoryRecord.each { |k, v| containerNodeInventoryRecord[k] = v }], - } - containerNodeInventoryEventStream.add(emitTime, containerNodeInventoryWrapper) if containerNodeInventoryWrapper - - wrapper = { - "DataType" => "KUBE_NODE_INVENTORY_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [record.each { |k, v| record[k] = v }], - } - eventStream.add(emitTime, wrapper) if wrapper - # Adding telemetry to send node telemetry every 5 minutes - timeDifference = (DateTime.now.to_time.to_i - @@nodeTelemetryTimeTracker).abs - timeDifferenceInMinutes = timeDifference / 60 - if (timeDifferenceInMinutes >= 10) - properties = {} - properties["Computer"] = record["Computer"] - properties["KubeletVersion"] = record["KubeletVersion"] - properties["OperatingSystem"] = nodeInfo["operatingSystem"] - properties["DockerVersion"] = dockerVersion - properties["KubernetesProviderID"] = record["KubernetesProviderID"] - properties["KernelVersion"] = nodeInfo["kernelVersion"] - properties["OSImage"] = nodeInfo["osImage"] - - capacityInfo = items["status"]["capacity"] - ApplicationInsightsUtility.sendMetricTelemetry("NodeMemory", capacityInfo["memory"], properties) - - #telemetry about prometheus metric collections settings for replicaset - if (File.file?(@@promConfigMountPath)) - properties["rsPromInt"] = @@rsPromInterval - properties["rsPromFPC"] = @@rsPromFieldPassCount - properties["rsPromFDC"] = @@rsPromFieldDropCount - properties["rsPromServ"] = @@rsPromK8sServiceCount - properties["rsPromUrl"] = @@rsPromUrlCount - properties["rsPromMonPods"] = @@rsPromMonitorPods - properties["rsPromMonPodsNs"] = @@rsPromMonitorPodsNamespaceLength - end - 
ApplicationInsightsUtility.sendMetricTelemetry("NodeCoreCapacity", capacityInfo["cpu"], properties) - telemetrySent = true + #collect last transition to/from ready (no matter ready is true/false) + if condition["type"] == "Ready" && !condition["lastTransitionTime"].nil? + record["LastTransitionTimeReady"] = condition["lastTransitionTime"] end end + if !allNodeConditions.empty? + record["Status"] = allNodeConditions + end end - router.emit_stream(@tag, eventStream) if eventStream - router.emit_stream(@@MDMKubeNodeInventoryTag, eventStream) if eventStream - router.emit_stream(@@ContainerNodeInventoryTag, containerNodeInventoryEventStream) if containerNodeInventoryEventStream - if telemetrySent == true - @@nodeTelemetryTimeTracker = DateTime.now.to_time.to_i + + nodeInfo = items["status"]["nodeInfo"] + record["KubeletVersion"] = nodeInfo["kubeletVersion"] + record["KubeProxyVersion"] = nodeInfo["kubeProxyVersion"] + containerNodeInventoryRecord["OperatingSystem"] = nodeInfo["osImage"] + dockerVersion = nodeInfo["containerRuntimeVersion"] + dockerVersion.slice! "docker://" + containerNodeInventoryRecord["DockerVersion"] = dockerVersion + # ContainerNodeInventory data for docker version and operating system. 
+ containerNodeInventoryWrapper = { + "DataType" => "CONTAINER_NODE_INVENTORY_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [containerNodeInventoryRecord.each { |k, v| containerNodeInventoryRecord[k] = v }], + } + containerNodeInventoryEventStream.add(emitTime, containerNodeInventoryWrapper) if containerNodeInventoryWrapper + + wrapper = { + "DataType" => "KUBE_NODE_INVENTORY_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [record.each { |k, v| record[k] = v }], + } + eventStream.add(emitTime, wrapper) if wrapper + # Adding telemetry to send node telemetry every 10 minutes + timeDifference = (DateTime.now.to_time.to_i - @@nodeTelemetryTimeTracker).abs + timeDifferenceInMinutes = timeDifference / 60 + if (timeDifferenceInMinutes >= 10) + properties = {} + properties["Computer"] = record["Computer"] + properties["KubeletVersion"] = record["KubeletVersion"] + properties["OperatingSystem"] = nodeInfo["operatingSystem"] + properties["DockerVersion"] = dockerVersion + properties["KubernetesProviderID"] = record["KubernetesProviderID"] + properties["KernelVersion"] = nodeInfo["kernelVersion"] + properties["OSImage"] = nodeInfo["osImage"] + + capacityInfo = items["status"]["capacity"] + ApplicationInsightsUtility.sendMetricTelemetry("NodeMemory", capacityInfo["memory"], properties) + + #telemetry about prometheus metric collections settings for replicaset + if (File.file?(@@promConfigMountPath)) + properties["rsPromInt"] = @@rsPromInterval + properties["rsPromFPC"] = @@rsPromFieldPassCount + properties["rsPromFDC"] = @@rsPromFieldDropCount + properties["rsPromServ"] = @@rsPromK8sServiceCount + properties["rsPromUrl"] = @@rsPromUrlCount + properties["rsPromMonPods"] = @@rsPromMonitorPods + properties["rsPromMonPodsNs"] = @@rsPromMonitorPodsNamespaceLength + end + ApplicationInsightsUtility.sendMetricTelemetry("NodeCoreCapacity", capacityInfo["cpu"], properties) + telemetrySent = true end - @@istestvar = ENV["ISTEST"] - if (!@@istestvar.nil? 
&& !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && eventStream.count > 0) - $log.info("kubeNodeInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end + router.emit_stream(@tag, eventStream) if eventStream + router.emit_stream(@@MDMKubeNodeInventoryTag, eventStream) if eventStream + router.emit_stream(@@ContainerNodeInventoryTag, containerNodeInventoryEventStream) if containerNodeInventoryEventStream + if telemetrySent == true + @@nodeTelemetryTimeTracker = DateTime.now.to_time.to_i + end + @@istestvar = ENV["ISTEST"] + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && eventStream.count > 0) + $log.info("kubeNodeInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end + #:optimize:kubeperf merge + begin + #if(!nodeInventory.empty?) + nodeMetricDataItems = [] + #allocatable metrics @ node level + nodeMetricDataItems.concat(KubernetesApiClient.parseNodeLimits(nodeInventory, "allocatable", "cpu", "cpuAllocatableNanoCores", batchTime)) + nodeMetricDataItems.concat(KubernetesApiClient.parseNodeLimits(nodeInventory, "allocatable", "memory", "memoryAllocatableBytes", batchTime)) + #capacity metrics @ node level + nodeMetricDataItems.concat(KubernetesApiClient.parseNodeLimits(nodeInventory, "capacity", "cpu", "cpuCapacityNanoCores", batchTime)) + nodeMetricDataItems.concat(KubernetesApiClient.parseNodeLimits(nodeInventory, "capacity", "memory", "memoryCapacityBytes", batchTime)) + + kubePerfEventStream = MultiEventStream.new + + nodeMetricDataItems.each do |record| + record["DataType"] = "LINUX_PERF_BLOB" + record["IPName"] = "LogManagement" + kubePerfEventStream.add(emitTime, record) if record end + #end + router.emit_stream(@@kubeperfTag, kubePerfEventStream) if kubePerfEventStream + rescue => errorStr + $log.warn "Failed in enumerate for KubePerf from in_kube_nodes : #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) end + #:optimize:end kubeperf 
merge + rescue => errorStr $log.warn "Failed to retrieve node inventory: #{errorStr}" $log.debug_backtrace(errorStr.backtrace) ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) end + $log.warn "in_kube_nodes::parse_and_emit_records:End #{Time.now.utc.iso8601}" end def run_periodic @mutex.lock done = @finished + @nextTimeToRun = Time.now + @waitTimeout = @run_interval until done - @condition.wait(@mutex, @run_interval) + @nextTimeToRun = @nextTimeToRun + @run_interval + @now = Time.now + if @nextTimeToRun <= @now + @waitTimeout = 1 + @nextTimeToRun = @now + else + @waitTimeout = @nextTimeToRun - @now + end + @condition.wait(@mutex, @waitTimeout) done = @finished @mutex.unlock if !done begin - $log.info("in_kube_nodes::run_periodic @ #{Time.now.utc.iso8601}") + $log.info("in_kube_nodes::run_periodic.enumerate.start #{Time.now.utc.iso8601}") enumerate + $log.info("in_kube_nodes::run_periodic.enumerate.end #{Time.now.utc.iso8601}") rescue => errorStr $log.warn "in_kube_nodes::run_periodic: enumerate Failed to retrieve node inventory: #{errorStr}" ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) diff --git a/source/code/plugin/in_kube_perf.rb b/source/code/plugin/in_kube_perf.rb deleted file mode 100644 index 8b571139d..000000000 --- a/source/code/plugin/in_kube_perf.rb +++ /dev/null @@ -1,120 +0,0 @@ -#!/usr/local/bin/ruby -# frozen_string_literal: true - -module Fluent - - class Kube_Perf_Input < Input - Plugin.register_input('kubeperf', self) - - def initialize - super - require 'yaml' - require 'json' - - require_relative 'KubernetesApiClient' - require_relative 'oms_common' - require_relative 'omslog' - end - - config_param :run_interval, :time, :default => '1m' - config_param :tag, :string, :default => "oms.api.KubePerf" - - def configure (conf) - super - end - - def start - if @run_interval - @finished = false - @condition = ConditionVariable.new - @mutex = Mutex.new - @thread = Thread.new(&method(:run_periodic)) - end - end - - def shutdown 
- if @run_interval - @mutex.synchronize { - @finished = true - @condition.signal - } - @thread.join - end - end - - def enumerate() - time = Time.now.to_f - begin - eventStream = MultiEventStream.new - - $log.info("in_kube_perf::enumerate : Getting pods from Kube API @ #{Time.now.utc.iso8601}") - #get resource requests & resource limits per container as perf data - podInventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo('pods').body) - $log.info("in_kube_perf::enumerate : Done getting pods from Kube API @ #{Time.now.utc.iso8601}") - if(!podInventory.empty?) - containerMetricDataItems = [] - hostName = (OMS::Common.get_hostname) - containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(podInventory, "requests", "cpu","cpuRequestNanoCores")) - containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(podInventory, "requests", "memory","memoryRequestBytes")) - containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(podInventory, "limits", "cpu","cpuLimitNanoCores")) - containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(podInventory, "limits", "memory","memoryLimitBytes")) - - containerMetricDataItems.each do |record| - record['DataType'] = "LINUX_PERF_BLOB" - record['IPName'] = "LogManagement" - eventStream.add(time, record) if record - #router.emit(@tag, time, record) if record - end - end - - #get allocatable limits per node as perf data - # Node capacity is different from node allocatable. Allocatable is what is avaialble for allocating pods. 
- # In theory Capacity = Allocatable + kube-reserved + system-reserved + eviction-threshold - # For more details refer to https://kubernetes.io/docs/tasks/administer-cluster/reserve-compute-resources/#node-allocatable - $log.info("in_kube_perf::enumerate : Getting nodes from Kube API @ #{Time.now.utc.iso8601}") - nodeInventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo('nodes').body) - $log.info("in_kube_perf::enumerate : Done getting nodes from Kube API @ #{Time.now.utc.iso8601}") - if(!nodeInventory.empty?) - nodeMetricDataItems = [] - #allocatable metrics @ node level - nodeMetricDataItems.concat(KubernetesApiClient.parseNodeLimits(nodeInventory, "allocatable", "cpu", "cpuAllocatableNanoCores")) - nodeMetricDataItems.concat(KubernetesApiClient.parseNodeLimits(nodeInventory, "allocatable", "memory", "memoryAllocatableBytes")) - #capacity metrics @ node level - nodeMetricDataItems.concat(KubernetesApiClient.parseNodeLimits(nodeInventory, "capacity", "cpu", "cpuCapacityNanoCores")) - nodeMetricDataItems.concat(KubernetesApiClient.parseNodeLimits(nodeInventory, "capacity", "memory", "memoryCapacityBytes")) - - nodeMetricDataItems.each do |record| - record['DataType'] = "LINUX_PERF_BLOB" - record['IPName'] = "LogManagement" - eventStream.add(time, record) if record - #router.emit(@tag, time, record) if record - end - end - router.emit_stream(@tag, eventStream) if eventStream - rescue => errorStr - $log.warn "Failed to retrieve metric data: #{errorStr}" - $log.debug_backtrace(errorStr.backtrace) - end - end - - def run_periodic - @mutex.lock - done = @finished - until done - @condition.wait(@mutex, @run_interval) - done = @finished - @mutex.unlock - if !done - begin - $log.info("in_kube_perf::run_periodic @ #{Time.now.utc.iso8601}") - enumerate - rescue => errorStr - $log.warn "in_kube_perf::run_periodic: enumerate Failed to retrieve kube perf metrics: #{errorStr}" - end - end - @mutex.lock - end - @mutex.unlock - end - end # Kube_Perf_Input -end # module 
diff --git a/source/code/plugin/in_kube_podinventory.rb b/source/code/plugin/in_kube_podinventory.rb index 1dd029b22..28b20bfc0 100644 --- a/source/code/plugin/in_kube_podinventory.rb +++ b/source/code/plugin/in_kube_podinventory.rb @@ -7,20 +7,30 @@ class Kube_PodInventory_Input < Input @@MDMKubePodInventoryTag = "mdm.kubepodinventory" @@hostName = (OMS::Common.get_hostname) + @@kubeperfTag = "oms.api.KubePerf" + @@kubeservicesTag = "oms.containerinsights.KubeServices" def initialize super require "yaml" - require "json" + require "yajl/json_gem" + require "yajl" require "set" + require "time" require_relative "KubernetesApiClient" require_relative "ApplicationInsightsUtility" require_relative "oms_common" require_relative "omslog" + + @PODS_CHUNK_SIZE = "1500" + @podCount = 0 + @controllerSet = Set.new [] + @winContainerCount = 0 + @controllerData = {} end - config_param :run_interval, :time, :default => "1m" + config_param :run_interval, :time, :default => 60 config_param :tag, :string, :default => "oms.containerinsights.KubePodInventory" def configure(conf) @@ -48,33 +58,77 @@ def shutdown end def enumerate(podList = nil) - podInventory = podList - $log.info("in_kube_podinventory::enumerate : Getting pods from Kube API @ #{Time.now.utc.iso8601}") - podInfo = KubernetesApiClient.getKubeResourceInfo("pods") - $log.info("in_kube_podinventory::enumerate : Done getting pods from Kube API @ #{Time.now.utc.iso8601}") + begin + podInventory = podList + telemetryFlush = false + @podCount = 0 + @controllerSet = Set.new [] + @winContainerCount = 0 + @controllerData = {} + currentTime = Time.now + batchTime = currentTime.utc.iso8601 - if !podInfo.nil? 
- podInventory = JSON.parse(podInfo.body) - end + # Get services first so that we dont need to make a call for very chunk + $log.info("in_kube_podinventory::enumerate : Getting services from Kube API @ #{Time.now.utc.iso8601}") + serviceInfo = KubernetesApiClient.getKubeResourceInfo("services") + # serviceList = JSON.parse(KubernetesApiClient.getKubeResourceInfo("services").body) + $log.info("in_kube_podinventory::enumerate : Done getting services from Kube API @ #{Time.now.utc.iso8601}") - begin - if (!podInventory.nil? && !podInventory.empty? && podInventory.key?("items") && !podInventory["items"].empty?) - #get pod inventory & services - $log.info("in_kube_podinventory::enumerate : Getting services from Kube API @ #{Time.now.utc.iso8601}") - serviceList = nil - serviceInfo = KubernetesApiClient.getKubeResourceInfo("services") - - if !serviceInfo.nil? - serviceList = JSON.parse(serviceInfo.body) - end - - $log.info("in_kube_podinventory::enumerate : Done getting services from Kube API @ #{Time.now.utc.iso8601}") - parse_and_emit_records(podInventory, serviceList) + if !serviceInfo.nil? + $log.info("in_kube_podinventory::enumerate:Start:Parsing services data using yajl @ #{Time.now.utc.iso8601}") + serviceList = Yajl::Parser.parse(StringIO.new(serviceInfo.body)) + $log.info("in_kube_podinventory::enumerate:End:Parsing services data using yajl @ #{Time.now.utc.iso8601}") + serviceInfo = nil + end + + # Initializing continuation token to nil + continuationToken = nil + $log.info("in_kube_podinventory::enumerate : Getting pods from Kube API @ #{Time.now.utc.iso8601}") + continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}") + $log.info("in_kube_podinventory::enumerate : Done getting pods from Kube API @ #{Time.now.utc.iso8601}") + if (!podInventory.nil? && !podInventory.empty? && podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) 
+ parse_and_emit_records(podInventory, serviceList, batchTime) else - $log.warn "Received empty podInventory" + $log.warn "in_kube_podinventory::enumerate:Received empty podInventory" + end + + #If we receive a continuation token, make calls, process and flush data until we have processed all data + while (!continuationToken.nil? && !continuationToken.empty?) + continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}&continue=#{continuationToken}") + if (!podInventory.nil? && !podInventory.empty? && podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) + parse_and_emit_records(podInventory, serviceList, batchTime) + else + $log.warn "in_kube_podinventory::enumerate:Received empty podInventory" + end + end + + # Setting these to nil so that we dont hold memory until GC kicks in + podInventory = nil + serviceList = nil + + # Adding telemetry to send pod telemetry every 5 minutes + timeDifference = (DateTime.now.to_time.to_i - @@podTelemetryTimeTracker).abs + timeDifferenceInMinutes = timeDifference / 60 + if (timeDifferenceInMinutes >= 5) + telemetryFlush = true + end + + # Flush AppInsights telemetry once all the processing is done + if telemetryFlush == true + telemetryProperties = {} + telemetryProperties["Computer"] = @@hostName + ApplicationInsightsUtility.sendCustomEvent("KubePodInventoryHeartBeatEvent", telemetryProperties) + ApplicationInsightsUtility.sendMetricTelemetry("PodCount", @podCount, {}) + telemetryProperties["ControllerData"] = @controllerData.to_json + ApplicationInsightsUtility.sendMetricTelemetry("ControllerCount", @controllerSet.length, telemetryProperties) + if @winContainerCount > 0 + telemetryProperties["ClusterWideWindowsContainersCount"] = @winContainerCount + ApplicationInsightsUtility.sendCustomEvent("WindowsContainerInventoryEvent", telemetryProperties) + end + @@podTelemetryTimeTracker = DateTime.now.to_time.to_i end rescue => errorStr - 
$log.warn "Failed in enumerate pod inventory: #{errorStr}" + $log.warn "in_kube_podinventory::enumerate:Failed in enumerate: #{errorStr}" $log.debug_backtrace(errorStr.backtrace) ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) end @@ -192,15 +246,12 @@ def getContainerEnvironmentVariables(pod, clusterCollectEnvironmentVar) end end - def parse_and_emit_records(podInventory, serviceList) + def parse_and_emit_records(podInventory, serviceList, batchTime = Time.utc.iso8601) currentTime = Time.now emitTime = currentTime.to_f - batchTime = currentTime.utc.iso8601 + #batchTime = currentTime.utc.iso8601 eventStream = MultiEventStream.new - controllerSet = Set.new [] - controllerData = {} - telemetryFlush = false - winContainerCount = 0 + begin #begin block start # Getting windows nodes from kubeapi winNodes = KubernetesApiClient.getWindowsNodesArray @@ -283,24 +334,17 @@ def parse_and_emit_records(podInventory, serviceList) record["ClusterId"] = KubernetesApiClient.getClusterId record["ClusterName"] = KubernetesApiClient.getClusterName record["ServiceName"] = getServiceNameFromLabels(items["metadata"]["namespace"], items["metadata"]["labels"], serviceList) - # Adding telemetry to send pod telemetry every 5 minutes - timeDifference = (DateTime.now.to_time.to_i - @@podTelemetryTimeTracker).abs - timeDifferenceInMinutes = timeDifference / 60 - if (timeDifferenceInMinutes >= 5) - telemetryFlush = true - end + if !items["metadata"]["ownerReferences"].nil? record["ControllerKind"] = items["metadata"]["ownerReferences"][0]["kind"] record["ControllerName"] = items["metadata"]["ownerReferences"][0]["name"] - if telemetryFlush == true - controllerSet.add(record["ControllerKind"] + record["ControllerName"]) - #Adding controller kind to telemetry ro information about customer workload - if (controllerData[record["ControllerKind"]].nil?) 
- controllerData[record["ControllerKind"]] = 1 - else - controllerValue = controllerData[record["ControllerKind"]] - controllerData[record["ControllerKind"]] += 1 - end + @controllerSet.add(record["ControllerKind"] + record["ControllerName"]) + #Adding controller kind to telemetry ro information about customer workload + if (@controllerData[record["ControllerKind"]].nil?) + @controllerData[record["ControllerKind"]] = 1 + else + controllerValue = @controllerData[record["ControllerKind"]] + @controllerData[record["ControllerKind"]] += 1 end end podRestartCount = 0 @@ -418,7 +462,7 @@ def parse_and_emit_records(podInventory, serviceList) end end # Send container inventory records for containers on windows nodes - winContainerCount += containerInventoryRecords.length + @winContainerCount += containerInventoryRecords.length containerInventoryRecords.each do |cirecord| if !cirecord.nil? ciwrapper = { @@ -433,19 +477,66 @@ def parse_and_emit_records(podInventory, serviceList) router.emit_stream(@tag, eventStream) if eventStream router.emit_stream(@@MDMKubePodInventoryTag, eventStream) if eventStream - if telemetryFlush == true - telemetryProperties = {} - telemetryProperties["Computer"] = @@hostName - ApplicationInsightsUtility.sendCustomEvent("KubePodInventoryHeartBeatEvent", telemetryProperties) - ApplicationInsightsUtility.sendMetricTelemetry("PodCount", podInventory["items"].length, {}) - telemetryProperties["ControllerData"] = controllerData.to_json - ApplicationInsightsUtility.sendMetricTelemetry("ControllerCount", controllerSet.length, telemetryProperties) - if winContainerCount > 0 - telemetryProperties["ClusterWideWindowsContainersCount"] = winContainerCount - ApplicationInsightsUtility.sendCustomEvent("WindowsContainerInventoryEvent", telemetryProperties) + #:optimize:kubeperf merge + begin + #if(!podInventory.empty?) 
+ containerMetricDataItems = [] + #hostName = (OMS::Common.get_hostname) + containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(podInventory, "requests", "cpu", "cpuRequestNanoCores", batchTime)) + containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(podInventory, "requests", "memory", "memoryRequestBytes", batchTime)) + containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(podInventory, "limits", "cpu", "cpuLimitNanoCores", batchTime)) + containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(podInventory, "limits", "memory", "memoryLimitBytes", batchTime)) + + kubePerfEventStream = MultiEventStream.new + + containerMetricDataItems.each do |record| + record["DataType"] = "LINUX_PERF_BLOB" + record["IPName"] = "LogManagement" + kubePerfEventStream.add(emitTime, record) if record end - @@podTelemetryTimeTracker = DateTime.now.to_time.to_i + #end + router.emit_stream(@@kubeperfTag, kubePerfEventStream) if kubePerfEventStream + rescue => errorStr + $log.warn "Failed in parse_and_emit_record for KubePerf from in_kube_podinventory : #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) end + #:optimize:end kubeperf merge + + #:optimize:start kubeservices merge + begin + if (!serviceList.nil? && !serviceList.empty?) 
+ kubeServicesEventStream = MultiEventStream.new + serviceList["items"].each do |items| + kubeServiceRecord = {} + kubeServiceRecord["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated + kubeServiceRecord["ServiceName"] = items["metadata"]["name"] + kubeServiceRecord["Namespace"] = items["metadata"]["namespace"] + kubeServiceRecord["SelectorLabels"] = [items["spec"]["selector"]] + kubeServiceRecord["ClusterId"] = KubernetesApiClient.getClusterId + kubeServiceRecord["ClusterName"] = KubernetesApiClient.getClusterName + kubeServiceRecord["ClusterIP"] = items["spec"]["clusterIP"] + kubeServiceRecord["ServiceType"] = items["spec"]["type"] + # : Add ports and status fields + kubeServicewrapper = { + "DataType" => "KUBE_SERVICES_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [kubeServiceRecord.each { |k, v| kubeServiceRecord[k] = v }], + } + kubeServicesEventStream.add(emitTime, kubeServicewrapper) if kubeServicewrapper + end + router.emit_stream(@@kubeservicesTag, kubeServicesEventStream) if kubeServicesEventStream + end + rescue => errorStr + $log.warn "Failed in parse_and_emit_record for KubeServices from in_kube_podinventory : #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + #:optimize:end kubeservices merge + + #Updating value for AppInsights telemetry + @podCount += podInventory["items"].length + @@istestvar = ENV["ISTEST"] if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0 && eventStream.count > 0) $log.info("kubePodInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") @@ -460,14 +551,25 @@ def parse_and_emit_records(podInventory, serviceList) def run_periodic @mutex.lock done = @finished + @nextTimeToRun = Time.now + @waitTimeout = @run_interval until done - @condition.wait(@mutex, @run_interval) + @nextTimeToRun = @nextTimeToRun + @run_interval + @now = Time.now + if @nextTimeToRun <= @now + @waitTimeout = 1 + @nextTimeToRun = @now + else + @waitTimeout = @nextTimeToRun - @now + end + @condition.wait(@mutex, @waitTimeout) done = @finished @mutex.unlock if !done begin - $log.info("in_kube_podinventory::run_periodic @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::run_periodic.enumerate.start #{Time.now.utc.iso8601}") enumerate + $log.info("in_kube_podinventory::run_periodic.enumerate.end #{Time.now.utc.iso8601}") rescue => errorStr $log.warn "in_kube_podinventory::run_periodic: enumerate Failed to retrieve pod inventory: #{errorStr}" ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) diff --git a/source/code/plugin/in_kube_services.rb b/source/code/plugin/in_kube_services.rb deleted file mode 100644 index 7cd703620..000000000 --- a/source/code/plugin/in_kube_services.rb +++ /dev/null @@ -1,110 +0,0 @@ -#!/usr/local/bin/ruby -# frozen_string_literal: true - -module Fluent - class Kube_Services_Input < Input - Plugin.register_input("kubeservices", self) - - def initialize - super - require "yaml" - require "json" - - require_relative "KubernetesApiClient" - require_relative "oms_common" - require_relative "omslog" - require_relative "ApplicationInsightsUtility" - end - - config_param :run_interval, :time, :default => "1m" - config_param :tag, :string, :default => "oms.containerinsights.KubeServices" - - def configure(conf) - super - end - - def start - if @run_interval - @finished = false - @condition = ConditionVariable.new - @mutex = Mutex.new - @thread = 
Thread.new(&method(:run_periodic)) - end - end - - def shutdown - if @run_interval - @mutex.synchronize { - @finished = true - @condition.signal - } - @thread.join - end - end - - def enumerate - currentTime = Time.now - emitTime = currentTime.to_f - batchTime = currentTime.utc.iso8601 - - serviceList = nil - - $log.info("in_kube_services::enumerate : Getting services from Kube API @ #{Time.now.utc.iso8601}") - serviceInfo = KubernetesApiClient.getKubeResourceInfo("services") - $log.info("in_kube_services::enumerate : Done getting services from Kube API @ #{Time.now.utc.iso8601}") - - if !serviceInfo.nil? - serviceList = JSON.parse(serviceInfo.body) - end - - begin - if (!serviceList.nil? && !serviceList.empty?) - eventStream = MultiEventStream.new - serviceList["items"].each do |items| - record = {} - record["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated - record["ServiceName"] = items["metadata"]["name"] - record["Namespace"] = items["metadata"]["namespace"] - record["SelectorLabels"] = [items["spec"]["selector"]] - record["ClusterId"] = KubernetesApiClient.getClusterId - record["ClusterName"] = KubernetesApiClient.getClusterName - record["ClusterIP"] = items["spec"]["clusterIP"] - record["ServiceType"] = items["spec"]["type"] - # : Add ports and status fields - wrapper = { - "DataType" => "KUBE_SERVICES_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [record.each { |k, v| record[k] = v }], - } - eventStream.add(emitTime, wrapper) if wrapper - end - router.emit_stream(@tag, eventStream) if eventStream - end - rescue => errorStr - $log.debug_backtrace(errorStr.backtrace) - ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) - end - end - - def run_periodic - @mutex.lock - done = @finished - until done - @condition.wait(@mutex, @run_interval) - done = @finished - @mutex.unlock - if !done - begin - $log.info("in_kube_services::run_periodic @ #{Time.now.utc.iso8601}") - enumerate - rescue => errorStr - 
$log.warn "in_kube_services::run_periodic: enumerate Failed to kube services: #{errorStr}" - ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) - end - end - @mutex.lock - end - @mutex.unlock - end - end # Kube_Services_Input -end # module diff --git a/source/code/plugin/in_win_cadvisor_perf.rb b/source/code/plugin/in_win_cadvisor_perf.rb index 2e5f839e6..695a686cf 100644 --- a/source/code/plugin/in_win_cadvisor_perf.rb +++ b/source/code/plugin/in_win_cadvisor_perf.rb @@ -10,7 +10,8 @@ class Win_CAdvisor_Perf_Input < Input def initialize super require "yaml" - require "json" + require 'yajl/json_gem' + require "time" require_relative "CAdvisorMetricsAPIClient" require_relative "KubernetesApiClient" @@ -18,7 +19,7 @@ def initialize require_relative "omslog" end - config_param :run_interval, :time, :default => "1m" + config_param :run_interval, :time, :default => 60 config_param :tag, :string, :default => "oms.api.wincadvisorperf" config_param :mdmtag, :string, :default => "mdm.cadvisorperf" @@ -60,13 +61,13 @@ def enumerate() $log.info "in_win_cadvisor_perf: Getting windows nodes" nodes = KubernetesApiClient.getWindowsNodes() if !nodes.nil? - @@winNodes = KubernetesApiClient.getWindowsNodes() + @@winNodes = nodes end $log.info "in_win_cadvisor_perf : Successuly got windows nodes after 5 minute interval" @@winNodeQueryTimeTracker = DateTime.now.to_time.to_i end @@winNodes.each do |winNode| - metricData = CAdvisorMetricsAPIClient.getMetrics(winNode) + metricData = CAdvisorMetricsAPIClient.getMetrics(winNode: winNode, metricTime: Time.now.utc.iso8601) metricData.each do |record| if !record.empty? 
record["DataType"] = "LINUX_PERF_BLOB" @@ -100,14 +101,25 @@ def enumerate() def run_periodic @mutex.lock done = @finished + @nextTimeToRun = Time.now + @waitTimeout = @run_interval until done - @condition.wait(@mutex, @run_interval) + @nextTimeToRun = @nextTimeToRun + @run_interval + @now = Time.now + if @nextTimeToRun <= @now + @waitTimeout = 1 + @nextTimeToRun = @now + else + @waitTimeout = @nextTimeToRun - @now + end + @condition.wait(@mutex, @waitTimeout) done = @finished @mutex.unlock if !done begin - $log.info("in_win_cadvisor_perf::run_periodic @ #{Time.now.utc.iso8601}") + $log.info("in_win_cadvisor_perf::run_periodic.enumerate.start @ #{Time.now.utc.iso8601}") enumerate + $log.info("in_win_cadvisor_perf::run_periodic.enumerate.end @ #{Time.now.utc.iso8601}") rescue => errorStr $log.warn "in_win_cadvisor_perf::run_periodic: enumerate Failed to retrieve cadvisor perf metrics for windows nodes: #{errorStr}" end diff --git a/source/code/plugin/lib/application_insights/channel/contracts/json_serializable.rb b/source/code/plugin/lib/application_insights/channel/contracts/json_serializable.rb index 8f4677044..60838e215 100644 --- a/source/code/plugin/lib/application_insights/channel/contracts/json_serializable.rb +++ b/source/code/plugin/lib/application_insights/channel/contracts/json_serializable.rb @@ -1,4 +1,4 @@ -require 'json' +require 'yajl/json_gem' module ApplicationInsights module Channel diff --git a/source/code/plugin/lib/application_insights/channel/sender_base.rb b/source/code/plugin/lib/application_insights/channel/sender_base.rb index 2431bf748..004b4722f 100644 --- a/source/code/plugin/lib/application_insights/channel/sender_base.rb +++ b/source/code/plugin/lib/application_insights/channel/sender_base.rb @@ -1,4 +1,4 @@ -require 'json' +require 'yajl/json_gem' require 'net/http' require 'openssl' require 'stringio' diff --git a/source/code/plugin/out_mdm.rb b/source/code/plugin/out_mdm.rb index b8d10090d..0a4e601b2 100644 --- 
a/source/code/plugin/out_mdm.rb +++ b/source/code/plugin/out_mdm.rb @@ -12,7 +12,7 @@ def initialize require "net/http" require "net/https" require "uri" - require "json" + require 'yajl/json_gem' require_relative "KubernetesApiClient" require_relative "ApplicationInsightsUtility" From 852680238a0675e67de45ccd5ba55b3f6610706c Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Tue, 3 Dec 2019 16:41:01 -0800 Subject: [PATCH 146/160] Update Readme --- README.md | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/README.md b/README.md index 4674700c4..ff3e2890c 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,31 @@ additional questions or comments. Note : The agent version(s) below has dates (ciprod), which indicate the agent build dates (not release dates) +### 12/03/2019 - +##### Version microsoft/oms:ciprod12032019 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod12032019 +- Fix scheduler for all input plugins +- Fix liveness probe +- Reduce chunk sizes for all fluentD buffers to support larger clusters (nodes & pods) +- Chunk Kubernetes API calls (pods,nodes,events) +- Use HTTP.start instead of HTTP.new +- Merge KubePerf into KubePods & KubeNodes +- Merge KubeServices into KubePod +- Use stream based yajl for JSON parsing +- Health - Query only kube-system pods +- Health - Use keep_if instead of select +- Container log enrichment (turned OFF by default for TimeOfCommand, ContainerName & ContainerImage) +- Application Insights Telemetry - Async +- Fix metricTime to be batch time for all metric input plugins +- Close socket connections properly for DockerAPIClient +- Fix top un handled exceptions in Kubernetes API Client and pod inventory +- Fix retries, wait between retries, chunk size, thread counts to be consistent for all FluentD workflows +- Back-off for containerlog enrichment K8S API calls +- Add new regions (3) for Azure Monitor Custom metrics +- Increase the cpu & memory limits for replica-set to support larger 
clusters (nodes & pods) +- Move to Ubuntu 18.04 LTS +- Support for Kubernetes 1.16 +- Use ifconfig for detecting network connectivity issues + ### 10/11/2019 - ##### Version microsoft/oms:ciprod10112019 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10112019 - Update prometheus config scraping capability to restrict collecting metrics from pods in specific namespaces. From c766d73ccbc55e3098a72f3c6b55a7c68ed06bab Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Wed, 4 Dec 2019 11:46:18 -0800 Subject: [PATCH 147/160] add back timeofcommand (#310) --- source/code/go/src/plugins/oms.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/source/code/go/src/plugins/oms.go b/source/code/go/src/plugins/oms.go index 834726c93..8dfaf0e7e 100644 --- a/source/code/go/src/plugins/oms.go +++ b/source/code/go/src/plugins/oms.go @@ -761,12 +761,13 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { Image: stringMap["Image"], Name: stringMap["Name"], } - } else { // dont collect timeofcommand field as its part of container log enrivhment + } else { // dont collect timeofcommand field as its part of container log enrichment [But currently we dont know the ux behavior , so waiting for ux fix (LA ux)] dataItem = DataItem{ ID: stringMap["Id"], LogEntry: stringMap["LogEntry"], LogEntrySource: stringMap["LogEntrySource"], LogEntryTimeStamp: stringMap["LogEntryTimeStamp"], + LogEntryTimeOfCommand: start.Format(time.RFC3339), SourceSystem: stringMap["SourceSystem"], Computer: Computer, Image: stringMap["Image"], From 8dfa313161f17151b25040870797d0f4938b20df Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Wed, 4 Dec 2019 12:06:09 -0800 Subject: [PATCH 148/160] update readme for timeofcommand fix (#314) --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index ff3e2890c..49c6d1fe4 100644 --- a/README.md +++ b/README.md @@ -11,8 +11,8 @@ additional questions or comments. 
Note : The agent version(s) below has dates (ciprod), which indicate the agent build dates (not release dates) -### 12/03/2019 - -##### Version microsoft/oms:ciprod12032019 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod12032019 +### 12/04/2019 - +##### Version microsoft/oms:ciprod12042019 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod12042019 - Fix scheduler for all input plugins - Fix liveness probe - Reduce chunk sizes for all fluentD buffers to support larger clusters (nodes & pods) @@ -23,7 +23,7 @@ Note : The agent version(s) below has dates (ciprod), which indicate t - Use stream based yajl for JSON parsing - Health - Query only kube-system pods - Health - Use keep_if instead of select -- Container log enrichment (turned OFF by default for TimeOfCommand, ContainerName & ContainerImage) +- Container log enrichment (turned OFF by default for ContainerName & ContainerImage) - Application Insights Telemetry - Async - Fix metricTime to be batch time for all metric input plugins - Close socket connections properly for DockerAPIClient From a0984af9984d04e6f8a364a778e9a8a412365ab4 Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Wed, 4 Dec 2019 12:18:02 -0800 Subject: [PATCH 149/160] Merge from ci_feature_prod into ci_feature (fix put back timeofcommand) (#311) (#316) * Updatng release history * fixing the plugin logs for emit stream * updating log message * Remove Log Processing from fluentd configuration * Remove plugin references from base_container.data * Dilipr/fluent bit log processing (#126) * Build out_oms.so and include in docker-cimprov package * Adding fluent-bit-config file to base container * PR Feedback * Adding out_oms.conf to base_container.data * PR Feedback * Making the critical section as small as possible * PR Feedback * Fixing the newline bug for Computer, and changing containerId to Id * Dilipr/glide updates (#127) * Updating glide.* files to include lumberjack * containerID="" for pull issues * 
Using KubeAPI for getting image,name. Adding more logs (#129) * Using KubeAPI for getting image,name. Adding more logs * Moving log file and state file to within the omsagent container * Changing log and state paths * Dilipr/mark comments (#130) * Marks Comments + Error Handling * Drop records from files that are not in k8s format * Remove unnecessary log line' * Adding Log to the file that doesn't conform to the expected format * Rashmi/segfault latest (#132) * adding null checks in all providers * fixing type * fixing type * adding more null checks * update cjson * Adding a missed null check (#135) * reusing some variables (#136) * Rashmi/cjson delete null check (#138) * adding null check for cjson-delete * null chk * removing null check * updating log level to debug for some provider workflows (#139) * Fixing CPU Utilization and removing Fluent-bit filters (#140) Removing fluent-bit filters, CPU optimizations * Minor tweaks 1. Remove some logging 2. Added more Error Handling 3. Continue when there is an error with k8s api (#141) * Removing some logs, added more error checking, continue on kube-api error * Return FLB OK for json Marshall error, instead of RETRY * * Change FluentBit flush interval to 30 secs (from 5 secs) * Remove ContainerPerf, ContainerServiceLog,ContainerProcess (OMI workflows) for Daemonset * Container Log Telemetry * Fixing an issue with Send Init Event if Telemetry is not initialized properly, tab to whitespace in conf file * PR feedback * PR feedback * Sending an event every 5 mins(Heartbeat) (#146) * PR feedback to cleanup removed workflows * updating agent version for telemetry * updating agent version * Telemetry Updates (#149) * Telemetry Fixes 1. Added Log Generation Rate 2. Fixed parsing bugs 3. 
Added code to send Exceptions/errors * PR Feedback * Changes to send omsagent/omsagent-rs kubectl logs to App Insights (#159) * Changes to send omsagent/omsagent-rs kubectl logs to App Insights * PR Feedback * Rashmi/fluentd docker inventory (#160) * first stab * changes * changes * docker util changes * working tested util * input plugin and conf * changes * changes * changes * changes * changes * working containerinventory * fixing omi removal from container.conf * removing comments * file write and read * deleted containers working * changes * changes * socket timeout * deleting test files * adding log * fixing comment * appinsights changes * changes * tel changes * changes * changes * changes * changes * lib changes * changes * changes * fixes * PR comments * changes * updating the ownership * changes * changes * changes to container data * removing comment * changes * adding collection time * bug fix * env string truncation * changes for acs-engine test * Fix Telemetry Bug -- Initialize Telemetry Client after Initializing all required properties (#162) * Fix kube events memory leak due to yaml serialization for > 5k events (#163) * Setting Timeout for HTTP Client in PostDataHelper in outoms go plugin(#164) * Vishwa/perftelemetry 2 (#165) * add cpu usage telemetry for ds & rs * add cpu & memory usage telemetry for ds & rs * environment variable fix (#166) * environment variable fix * updating agent version * Fixing a bug where we were crashing due to container statuses not present when not was lost (#167) * Updating title * updating right versions for last release * Updating the break condition to look for end of response (#168) * Updating the break condition to look for end of response * changes for docker response * updating AgentVersion for telemetry * Updating readme for latest release changes * Changes - (#173) * use /var/log for state * new metric ContainerLogsAgentSideLatencyMs * new field 'timeOfComand' * Rashmi/kubenodeinventory (#174) * 
containernodeinventory changes * changes for containernodeinventory * changes to add node telemetry * pod telemetry cahnges * updated telemetry changes * changes to get uid of owner references as controller id * Get cpuusage from usageseconds (#175) * Rashmi/kubenodeinventory (#176) * containernodeinventory changes * changes for containernodeinventory * changes to add node telemetry * pod telemetry cahnges * updated telemetry changes * changes to get uid of owner references as controller id * updating socket to the new mount location * Adding exception telemetry and heartbeat * changes to fix controller type * Fixing typo * fixing method signature * updating plugins to get controller type from env * fixing bugs * Rashmi/kubenodeinventory (#178) * containernodeinventory changes * changes for containernodeinventory * changes to add node telemetry * pod telemetry cahnges * updated telemetry changes * changes to get uid of owner references as controller id * updating socket to the new mount location * Adding exception telemetry and heartbeat * changes to fix controller type * Fixing typo * fixing method signature * updating plugins to get controller type from env * fixing bugs * changes to fixed type * removing comments * changes for fixed type * Fixing an issue on the cpurate metric, which happens for the first time (when cache is empty) (#179) * Rashmi/kubenodeinventory (#180) * containernodeinventory changes * changes for containernodeinventory * changes to add node telemetry * pod telemetry cahnges * updated telemetry changes * changes to get uid of owner references as controller id * updating socket to the new mount location * Adding exception telemetry and heartbeat * changes to fix controller type * Fixing typo * fixing method signature * updating plugins to get controller type from env * fixing bugs * changes to fixed type * removing comments * changes for fixed type * adding kubelet version as a dimension * Exclude docker containers from container inventory 
(#181) * containernodeinventory changes * changes for containernodeinventory * changes to add node telemetry * pod telemetry cahnges * updated telemetry changes * changes to get uid of owner references as controller id * updating socket to the new mount location * Adding exception telemetry and heartbeat * changes to fix controller type * Fixing typo * fixing method signature * updating plugins to get controller type from env * fixing bugs * changes to fixed type * removing comments * changes for fixed type * adding kubelet version as a dimension * Excluding raw docker containers from container inventory * making labels key case insensitive * make poduid label case insensitive * Exclude pauseamd64 containers from container inventory (#182) * containernodeinventory changes * changes for containernodeinventory * changes to add node telemetry * pod telemetry cahnges * updated telemetry changes * changes to get uid of owner references as controller id * updating socket to the new mount location * Adding exception telemetry and heartbeat * changes to fix controller type * Fixing typo * fixing method signature * updating plugins to get controller type from env * fixing bugs * changes to fixed type * removing comments * changes for fixed type * adding kubelet version as a dimension * Excluding raw docker containers from container inventory * making labels key case insensitive * make poduid label case insensitive * changes to exclude pause amd 64 containers * Update agent version * Updating readme for the latest release * Fix indentation in kube.conf and update readme (#184) * containernodeinventory changes * changes for containernodeinventory * changes to add node telemetry * pod telemetry cahnges * updated telemetry changes * changes to get uid of owner references as controller id * updating socket to the new mount location * Adding exception telemetry and heartbeat * changes to fix controller type * Fixing typo * fixing method signature * updating plugins to get 
controller type from env * fixing bugs * changes to fixed type * removing comments * changes for fixed type * adding kubelet version as a dimension * Excluding raw docker containers from container inventory * making labels key case insensitive * make poduid label case insensitive * changes to exclude pause amd 64 containers * fixing indentation so that kube.conf contents can be used in config map in the yaml * updating readme to fix date and agent version * updating agent tag * Get Pods for current Node Only (#185) * Fix KubeAPI Calls to filter to get pods for current node * Reinstate log line * changes for container node inventory fixed type (#186) * Fix for mooncake (disable telemetry optionally) (#191) * disable telemetry option * fix a typo * CustomMetrics to ci_feature (#193) Custom Metrics changes to ci_feature * add ContainerNotRunning column to KubePodInventory * merge pr feedback: update name to ContainerStatusReason * Zero Fill for Missing Pod Phases, Change Namespace Dimension to Kubernetes namespace, as it might be confused with metrics namespace in Metrics Explorer (#194) * Zero Fill for Pod Counts by Phase * Change namespace dimension to Kubernetes namespace * No Retries for non 404 4xx errors (#196) * Update agent version for telemetry * Update readme for upcoming (ciprod01202019) release * fix readme formatting * fix formatting for readme * fix formatting for readme * fix readme * fix readme * fix agent version for telemetry * fix date in readme * update readme * Restart logs every 10MB instead of weekly (#198) * Rotate logs every 10MB instead of weekly * Removing some logging, fixed log rotation * update agent version for telemetry * update readme * Update kube.conf to use %STATE_DIR_WS% instead of hardcoded path * Fix AKSEngine Crash (#200) * hotfix * close resp.Body * remove chatty logs * membuf=5m and ignore files not updated since 5 mins * fix readme for new version * Fix the pod count in mdm agent plugin (#203) * Update readme * string freeze 
for out_mdm plugin * Vishwa/resourcecentric (#208) * resourceid fix (for AKS only) * fix name * Rashmi/win nodepool - PR (#206) * changes for win nodes enumeration * changes * changes * changes * node cpu metric rate changes * container cpu rate * changes * changes * changes * changes * changes * changes to include in_win_cadvisor_perf.rb file * send containerinventoryheartbeatevent * changes * cahnges for mdm metrics * changes * cahnges * changes * container states * changes * changes * changes for env variables * changes * changes * changes * changes * delete comments * changes * mutex changes * changes * changes * changes * telemetry fix for docker version * removing hardcoded values for mdm * update docker version * telemetry for windows cadvisor timeouts * exeception key update to computer * PR comments * adding os to container inventory for windows nodes (#210) * Fix omsagent crash Error when kube-api returns non-200, send events for HTTP Errors (#211) * Fix omsagent crash Error when kube-api returns non-200, send events for HTTP Errors * Fixing the bug, deferring telemetry changes for later * updating to lowercase compare for units (#212) * Merge from vishwa/telegraftcp to ci_feature for telegraf changes (#214) * merge from Vishwa/telegraf to Vishwa/telegraftcp for telegraf changes (#207) * add configuration for telegraf * fix for perms * fix telegraf config. 
* fix file location & config * update to config * fix namespace * trying different namespace and also debug=true * add placeholder for nodename * change namespace * updated config * fix uri * fix azMon settings * remove aad settings * add custom metrics regions * fix config * add support for replica-set config * fix oomkilled * Add telegraf 403 metric telemetry & non 403 trace telemetry * fix type * fix package * fix package import * fix filename * delete unused file * conf file for rs; fix 403counttotal metric for telegraf, remove host and use nodeName consistently, rename metrics * fix statefulsets * fix typo. * fix another typo. * fix telemetry * fix casing issue * fix comma issue. * disable telemetry for rs ; fix stateful set name * worksround for namespace fix * telegraf integration - v1 * telemetry changes for telegraf * telemetry & other changes * remove custom metric regions as we dont need anymore * remove un-needed files * fixes * exclude certain volumes and fix telemetry to not have computer & nodename as dimensions (redundant) * Vishwa/resourcecentric (#208) (#209) * resourceid fix (for AKS only) * fix name * near final metric shape * change from customlog to fixed type (InsightsMetrics) * fix PR feedback * fix pr feedback * Fix telemetry error for telegraf err count metric (#215) * Fix Unscheduled Pod bug, remove excess telemetry (#218) * Fix Unscheduled Pod bug, remove excess telemetry * Send Success Telemetry only once after startup for a node in a cluster for MDM Post * Sending telemetry for successful push to MDM every hour * Merge from Vishwa/promstandardmetrics into ci_feature (#220) * enable prometheus metrics collection in replica-set * fixing typos * fix config file path for replicaset * fix configuration * config changes * merge config/settings to ci_feature (#221) * updating fluentbit to use LOG_TAIL_PATH * changes * log exclusion pattern * changes * removing comments * adding enviornment varibale collection/disable * disable env var for 
cluster variable change * changes * toml parser changes * adding directory tomlrb * changes for container inventory * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * Telemetry for config overrides * add schema version telemetry * reduce the number of api calls for namespace filtering add more telemetry for config processing move liveness probe & parser to this repo * optimize for default kube-system namespace log collection exclusion * Fix Scenario when Controller name is empty (#222) * fix ; * ContainerLog collection optimizations (#223) * * derive k8s namespace from file (rather than making a api call) * optimize perf by not tailing excluded namespaces in stdout & stderr * Tuning fluentbit settings based on Cortana teams findings * making db sync off * buffer chunk and max as 1m so that we dont flush > 1m payloads * increasing rotatte wait from 5 secs to 30 secs * decreasing refresh interval from 60 secs to 30 secs * adding retry limit as 10 so that items get dropped in 50 secs rather than infinetely trying * changing flush to 5 secs from 30 secs * merge final changes for release from Vishwa/june2019agentrel to ci_feature (#224) * * derive k8s namespace from file (rather than making a api call) * optimize perf by not tailing excluded namespaces in stdout & stderr * Tuning fluentbit settings based on Cortana teams findings * making db sync off * buffer chunk and max as 1m so that we dont flush > 1m payloads * increasing rotatte wait from 5 secs to 30 secs * decreasing refresh interval from 60 secs to 30 secs * adding retry limit as 10 so that items get dropped in 50 secs rather than infinetely trying * changing flush to 5 secs from 30 secs * fix a minor comment * * change flush from 5 to 10 secs based on perf findings * fix fluent bit tuning for perf run (#226) * fix fluent bit tuning for perf run * stop 
collecting our own partition * fix merge issue * add release notes for june release in ci_feature branch * fix title * update * fix title * Trim spaces in AKS_REGION (#233) This is not an issue for normal AKS Monitoring Addon Onboarding. ONLY an issue for backdoor onboarding * Add Logs Size To Telemetry (#234) * Add Logs to telemetry * Using len instead of unsafe.Sizeof * Merge Vishwa/promcustommetrics to ci_feature (#237) * hard code config for UST CCP team * fix config * fix config after discussion * fix error log to get errros * fix config * update config * Add telemetry * Rashmi/promcustomconfig (#231) * changes * formatting changes * changes * changes * changes * changes * changes * changes * changes * changes * adding telemetry * changes * changes * changes * changes * changes * changes * changes * cahnges * changes * Rashmi/promcustomconfig (#236) * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * fix exceptions * changes to remove some exceptions * exception fixes * changes * changes for poduid nil check * Fix Region space error (#239) * Trim spaces in AKS_REGION This is not an issue for normal AKS Monitoring Addon Onboarding. 
ONLY an issue for backdoor onboarding * Fix out_mdm parsing error * Removing buffer chunk size and buffer max size from fluentbit conf (#240) * hard code config for UST CCP team * fix config * fix config after discussion * fix error log to get errros * fix config * update config * Add telemetry * Rashmi/promcustomconfig (#231) * changes * formatting changes * changes * changes * changes * changes * changes * changes * changes * changes * adding telemetry * changes * changes * changes * changes * changes * changes * changes * cahnges * changes * Rashmi/promcustomconfig (#236) * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * fix exceptions * changes to remove some exceptions * exception fixes * changes * changes for poduid nil check * removing buffer chunk size and buffer max size from fluentbit conf * changes (#243) * Collect container last state (#235) * updating the OMS agent to also collect container last state * changed a comment * git surrounded ContainerLastStatus code in a begin/rescue block * added a lot of error checking and logging * Rashmi/fix prom telemetry (#247) * fix prom telemetry * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * Merge Health Model work into ci_feature behind a feature flag Pending perf testing (#246) Merge Health to ci_feature * Fix Deserialization Bug (#249) * Fix the bug where capacity is not updated and cached value was being used (#251) * Fix the Capacity computation * fix node cpu and memory limits calculation * changes (#250) * Added new Custom Metrics Regions, fixed MDM plugin crash bug (#253) Added new regions, added handler for MDM plugin start * Add Missing Handlers (#254) * Added Missing Handlers * Return MultiEventStream.new instead of empty array (#256) * Added explicit require_relative to avoid 
loading errors (#258) * Adding explicit require_relative * Gangams/enable ai telemetry in mc (#252) * enable ai telemetry to configure different ikey and endpoint per cloud * Fixing null check out_mdm bug, tomlparser bug, exposing Replica Set service name as an ENV variable (#261) * Expose replica set service as an env variable * Fixing null check out_mdm bug, and tomlparser bug * Updating the env variable name to be more specific to health model * Changes for creating custom plugins with namespace settings for prometheus scraping (#262) * changes * changes * changes * changes * changes * changes * chnages * changes * telemetry changes * changes * Cherry-pick hotfix 09092019 to ci_feature (#265) * Gangams/add telemetry hybrid (#264) * add telemetry to detect the cloud, distro and kernel version * add null check since providerId optional * detect azurestack cloud * rename to KubernetesProviderID since ProviderID name already used in LA * capture workspaceCloud to the telemetry * trim the domain read from file * KubeMonAgentEvents changes to collect configuration events (#267) * changes * changes * changes * changes * changes * changes * env changes * changes * changes * changes * reverting * changes * cahnges * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * chnages * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * 
changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * changes * Fix the Dupe Perf Data Issue from the DaemonSet (#266) * Dupe Perf Record Fix * PR for 1. Container Memory CPU monitor 2. Configuration for Node Conditions 3. Fixed Type Changes 4. Use Env variable, and health_forward (that handles network errors at init) 5. Unit Tests (#268) * init containers fix and other bug fixes (#269) * init container - KPI and kubeperf changes * changes * changes * changes * changes for empty array fix * changes * changes * pod inventory exception fix * nil check changes * changes * fixing typo * changes * changes * PR - feedback * remove comment * tag pass changes * changes * tagdrop changes * changes * changes * Send agg monitor signal on details change (#270) send when an agg monitor details change, but state did not change * bug fixes for error (#274) * Fix to use declaration and assignment instead of assignment (#275) * bug fixes for error * adding declaration to assignment * 1. Added telemetry (#277) 2. Configuration property changes 3. Bug fixes for a. unscheduled pods returning green 3b. Sometimes, the details hash of agg monitors are different because the order of elements inside the array is different, causing the records to be sent * Bug fix to remove unused variable (#281) * bug fixes for error * adding declaration to assignment * removing unused variable * Fix the WARN<->WARNING typo (#282) * Bug Fixes 1. telemetry send throwing exception if records not initialized 2. permissions error in on-prem clusters (#284) * Bug fixes 1. 
not writeable, telemetry error * Change to state_WS_dir * Fix Require relative revert (#287) * Bug Fixes for exceptions in telemetry, remove limit set check (#289) * Bug Fixes 10222019 * Initialize container_cpu_memory_records in fhmb * Added telemetry to investigate health exceptions * Set frozen_string_literal to true * Send event once per container when lookup is empty, or limit is an array * Unit Tests, Use RS and POD to determine workload * Fixed Node Condition Bug, added exception handling to return get_rs_owner_ref * Fix the bug where if a warning condition appears before fail condition, the node condition is reported as warning instead of fail. Also fix the node conditions state to consider unknown as a failure state (#292) * Fix for Nodes Aspect not showing up in draft cluster (#294) * Fix the issue where the health tree is inconsistent if a deployment is deleted (#295) * Rashmi/1 16 test (#297) * health deployment update * apps v1 changes for deployment * changes * changes to use relicasets and api groups * Fix duplicate records in container memory/cpu samples (#298) * Update MDM region list to include francecentral, japaneast and australiaeast * Update MDM region list to include francecentral, japaneast and australiaeast * Send telemetry when there is error in calculation of state in percentage aggregation, and send state as unknown (#300) * fix exceptions (#306) * Merge Branch morgan into ci_feature (#308) * Fixes : 1) Disable health (for time being) - in DS & RS 2) Disable MDM (for time being) - in DS & RS 3) Merge kubeperf into kubenode & kubepod 4) Made scheduling predictable for kubenode & kubepod 5) Enable containerlog enrichment fields (timeofcommand, containername & containerimage) as a configurable setting (default = true/ON) - Also add telemetry for it 6) Filter OUT type!=Normal events for k8s events 7) AppInsights telemetry async 8) Fix double calling bug in in_win_cadvisor_perf 9) Add connect timeout (20secs) & read timeout (40 secs) for all 
cadvisor api calls & also for all kubernetes api server calls 10) Fix batchTime for kubepods to be one before making api server call (rather than after making the call, which will make it fluctuate based on api server latency for the call) * fix setting issue for the new enrichcontainerlog setting * fix compilation issue * fix another compilation issue * fix emit issues * fix a nil issue * fix mising tag * * Fix all input plugins for scheduling issue * Merge kubeservices with kubepodinventory (reduce RS to API server by one more) * Remove Kubelogs (not used) * Fix liveness probe * Disable enrichment by default for container logs * Move to yajl json parser across the board for docker provier code * Remove unused files * fix removed files * fix timeofcommand and remove a duplicate entry for a health file. * Rashmi/http leak fixes (#301) * changes for http connection close * close socket in ensure * adding nil check * Rashmi/http leak fixes (#303) * changes for http connection close * close socket in ensure * adding nil check * adding missing end * use yajl for events & nodes parsing. 
* Rashmi/http leak fixes (#304) * changes for http connection close * close socket in ensure * adding nil check * Update MDM region list to include francecentral, japaneast and australiaeast * Update MDM region list to include francecentral, japaneast and australiaeast * adding missing end * Send telemetry when there is error in calculation of state in percentage aggregation, and send state as unknown (#300) * changes for chunking * telemetry changes * some fixes * bug fix * changing to have morgan changes only * add new line * use polltime for metrics and disable out_forward for health * enable mdm & health * few optimizations * do not remove time of command make kube.conf same as scale tested config * remove comments from container.conf * remove flush comment for ai telemetry * remove commented code lines * fix config * remove timeofcommand when enrichment==false * fix config * enable mdm filter * Rashmi/api chunk (#307) * changes * changes * refactor changes * changes * changes * changes * changes * node changes * changes * changes * changes * changes * adding open and read timeouts for api client * removing comments * updating chunk size * Update Readme * add back timeofcommand (#310) From deff7ace376c2265520e58fef0da6dfd26b9aa6d Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Fri, 6 Dec 2019 16:25:11 -0800 Subject: [PATCH 150/160] Adding new cpu and memory limits to readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 49c6d1fe4..007f92d92 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,7 @@ Note : The agent version(s) below has dates (ciprod), which indicate t - Fix retries, wait between retries, chunk size, thread counts to be consistent for all FluentD workflows - Back-off for containerlog enrichment K8S API calls - Add new regions (3) for Azure Monitor Custom metrics -- Increase the cpu & memory limits for replica-set to support larger clusters (nodes & pods) +- Increase the cpu(1 
core) & memory(750Mi) limits for replica-set to support larger clusters (nodes & pods) - Move to Ubuntu 18.04 LTS - Support for Kubernetes 1.16 - Use ifconfig for detecting network connectivity issues From 4b1ef9c7123b3b52a7460cb30fc45f9b0c0244e1 Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Mon, 6 Jan 2020 17:34:48 -0800 Subject: [PATCH 151/160] CAdvisor to use 10255/10250 based on env variable (#321) * CAdvisor secure port changes (#320) * cadvsior secure port changes * update to use secure/insecure port for cadvisor * telemetry changes * fix bug * bug fix * changes * Adding cadvisor uri log * switching defaults * update readme * changes --- README.md | 7 +++ installer/conf/telegraf.conf | 2 +- .../code/plugin/CAdvisorMetricsAPIClient.rb | 53 +++++++++++++++---- 3 files changed, 51 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 007f92d92..75b2d8665 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,13 @@ additional questions or comments. Note : The agent version(s) below has dates (ciprod), which indicate the agent build dates (not release dates) +### 01/07/2020 - +##### Version microsoft/oms:ciprod01072020 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod01072020 +## Code change log +- Switch between 10255(old) and 10250(new) ports for cadvisor for older and newer versions of kubernetes +## Customer Impact +- Node cpu, node memory, container cpu and container memory metrics were obtained earlier by querying kubelet readonly port(http://$NODE_IP:10255). Agent now supports getting these metrics from kubelet port(https://$NODE_IP:10250) as well. During the agent startup, it checks for connectivity to kubelet port(https://$NODE_IP:10250), and if it fails the metrics source is defaulted to readonly port(http://$NODE_IP:10255). 
+ ### 12/04/2019 - ##### Version microsoft/oms:ciprod12042019 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod12042019 - Fix scheduler for all input plugins diff --git a/installer/conf/telegraf.conf b/installer/conf/telegraf.conf index cd22a56b4..f9dc3fb6a 100644 --- a/installer/conf/telegraf.conf +++ b/installer/conf/telegraf.conf @@ -531,7 +531,7 @@ [[inputs.prometheus]] name_prefix="container.azm.ms/" ## An array of urls to scrape metrics from. - urls = ["http://$NODE_IP:10255/metrics"] + urls = ["$CADVISOR_METRICS_URL"] fieldpass = ["kubelet_docker_operations", "kubelet_docker_operations_errors"] metric_version = 2 diff --git a/source/code/plugin/CAdvisorMetricsAPIClient.rb b/source/code/plugin/CAdvisorMetricsAPIClient.rb index be61b8b8f..8b0105a6f 100644 --- a/source/code/plugin/CAdvisorMetricsAPIClient.rb +++ b/source/code/plugin/CAdvisorMetricsAPIClient.rb @@ -2,7 +2,7 @@ # frozen_string_literal: true class CAdvisorMetricsAPIClient - require 'yajl/json_gem' + require "yajl/json_gem" require "logger" require "net/http" require "net/https" @@ -29,6 +29,8 @@ class CAdvisorMetricsAPIClient @dsPromFieldDropCount = ENV["TELEMETRY_DS_PROM_FIELDDROP_LENGTH"] @dsPromUrlCount = ENV["TELEMETRY_DS_PROM_URLS_LENGTH"] + @cAdvisorMetricsSecurePort = ENV["IS_SECURE_CADVISOR_PORT"] + @LogPath = "/var/opt/microsoft/docker-cimprov/log/kubernetes_perf_log.txt" @Log = Logger.new(@LogPath, 2, 10 * 1048576) #keep last 2 files, max log file size = 10M # @@rxBytesLast = nil @@ -63,13 +65,34 @@ def getSummaryStatsFromCAdvisor(winNode) response = nil @Log.info "Getting CAdvisor Uri" begin - cAdvisorUri = getCAdvisorUri(winNode) + cAdvisorSecurePort = false + # Check to see if omsagent needs to use 10255(insecure) port or 10250(secure) port + if !@cAdvisorMetricsSecurePort.nil? 
&& @cAdvisorMetricsSecurePort == "true" + cAdvisorSecurePort = true + end + + cAdvisorUri = getCAdvisorUri(winNode, cAdvisorSecurePort) + bearerToken = File.read("/var/run/secrets/kubernetes.io/serviceaccount/token") + @Log.info "cAdvisorUri: #{cAdvisorUri}" + if !cAdvisorUri.nil? uri = URI.parse(cAdvisorUri) - Net::HTTP.start(uri.host, uri.port, :use_ssl => false, :open_timeout => 20, :read_timeout => 40 ) do |http| - cAdvisorApiRequest = Net::HTTP::Get.new(uri.request_uri) - response = http.request(cAdvisorApiRequest) - @Log.info "Got response code #{response.code} from #{uri.request_uri}" + if !!cAdvisorSecurePort == true + Net::HTTP.start(uri.host, uri.port, + :use_ssl => true, :open_timeout => 20, :read_timeout => 40, + :ca_file => "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt", + :verify_mode => OpenSSL::SSL::VERIFY_NONE) do |http| + cAdvisorApiRequest = Net::HTTP::Get.new(uri.request_uri) + cAdvisorApiRequest["Authorization"] = "Bearer #{bearerToken}" + response = http.request(cAdvisorApiRequest) + @Log.info "Got response code #{response.code} from #{uri.request_uri}" + end + else + Net::HTTP.start(uri.host, uri.port, :use_ssl => false, :open_timeout => 20, :read_timeout => 40) do |http| + cAdvisorApiRequest = Net::HTTP::Get.new(uri.request_uri) + response = http.request(cAdvisorApiRequest) + @Log.info "Got response code #{response.code} from #{uri.request_uri}" + end end end rescue => error @@ -81,9 +104,14 @@ def getSummaryStatsFromCAdvisor(winNode) return response end - def getCAdvisorUri(winNode) + def getCAdvisorUri(winNode, cAdvisorSecurePort) begin - defaultHost = "http://localhost:10255" + if !!cAdvisorSecurePort == true + defaultHost = "https://localhost:10250" + else + defaultHost = "http://localhost:10255" + end + relativeUri = "/stats/summary" if !winNode.nil? nodeIP = winNode["InternalIP"] @@ -92,7 +120,11 @@ def getCAdvisorUri(winNode) end if !nodeIP.nil? 
@Log.info("Using #{nodeIP + relativeUri} for CAdvisor Uri") - return "http://#{nodeIP}:10255" + relativeUri + if !!cAdvisorSecurePort == true + return "https://#{nodeIP}:10250" + relativeUri + else + return "http://#{nodeIP}:10255" + relativeUri + end else @Log.warn ("NODE_IP environment variable not set. Using default as : #{defaultHost + relativeUri} ") if !winNode.nil? @@ -104,7 +136,7 @@ def getCAdvisorUri(winNode) end end - def getMetrics(winNode: nil, metricTime: Time.now.utc.iso8601 ) + def getMetrics(winNode: nil, metricTime: Time.now.utc.iso8601) metricDataItems = [] begin cAdvisorStats = getSummaryStatsFromCAdvisor(winNode) @@ -211,6 +243,7 @@ def getContainerCpuMetricItems(metricJSON, hostName, cpuMetricNameToCollect, met telemetryProps["PodName"] = podName telemetryProps["ContainerName"] = containerName telemetryProps["Computer"] = hostName + telemetryProps["CAdvisorIsSecure"] = @cAdvisorMetricsSecurePort #telemetry about log collections settings if (File.file?(@configMapMountPath)) telemetryProps["clustercustomsettings"] = true From 6dc93e8828800f68063423bd322ee3918d1412ef Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Mon, 6 Jan 2020 17:42:51 -0800 Subject: [PATCH 152/160] changing font for code change and customer impact --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 75b2d8665..1898bd17c 100644 --- a/README.md +++ b/README.md @@ -13,9 +13,10 @@ Note : The agent version(s) below has dates (ciprod), which indicate t ### 01/07/2020 - ##### Version microsoft/oms:ciprod01072020 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod01072020 -## Code change log +##### Code change log - Switch between 10255(old) and 10250(new) ports for cadvisor for older and newer versions of kubernetes -## Customer Impact + +##### Customer Impact - Node cpu, node memory, container cpu and container memory metrics were obtained earlier by querying kubelet readonly 
port(http://$NODE_IP:10255). Agent now supports getting these metrics from kubelet port(https://$NODE_IP:10250) as well. During the agent startup, it checks for connectivity to kubelet port(https://$NODE_IP:10250), and if it fails the metrics source is defaulted to readonly port(http://$NODE_IP:10255). ### 12/04/2019 - From 044f13db72dfa7c3a63cc28a466a0b924745a7e0 Mon Sep 17 00:00:00 2001 From: ganga1980 Date: Thu, 23 Jan 2020 21:51:03 -0800 Subject: [PATCH 153/160] For ARO, stop collecting inventory of master and infra (#323) * filter out infra and master nodes inventory for aro * filterout pods info scheduled master and infra nodes * fix redundant KubernetesApiClient name * filter out events sourced from master and infra nodes * fix in kubeapi * add the comments * fix pr feedback * minor updates * fix pr feedback * encode special characters in query * some refactoring --- source/code/plugin/KubernetesApiClient.rb | 42 ++++++++++++++++++- source/code/plugin/filter_cadvisor2mdm.rb | 3 +- .../plugin/health/health_monitor_utils.rb | 9 ++-- source/code/plugin/in_kube_events.rb | 16 ++++--- source/code/plugin/in_kube_health.rb | 6 ++- source/code/plugin/in_kube_nodes.rb | 6 ++- source/code/plugin/in_kube_podinventory.rb | 7 ++++ 7 files changed, 74 insertions(+), 15 deletions(-) diff --git a/source/code/plugin/KubernetesApiClient.rb b/source/code/plugin/KubernetesApiClient.rb index e52c77884..6f108ec92 100644 --- a/source/code/plugin/KubernetesApiClient.rb +++ b/source/code/plugin/KubernetesApiClient.rb @@ -18,6 +18,7 @@ class KubernetesApiClient @@ClusterName = nil @@ClusterId = nil @@IsNodeMaster = nil + @@IsAROV3Cluster = nil #@@IsValidRunningNode = nil #@@IsLinuxCluster = nil @@KubeSystemNamespace = "kube-system" @@ -152,6 +153,20 @@ def getClusterId return @@ClusterId end + def isAROV3Cluster + return @@IsAROV3Cluster if !@@IsAROV3Cluster.nil? + @@IsAROV3Cluster = false + begin + cluster = getClusterId + if !cluster.nil? && !cluster.empty? 
&& cluster.downcase.include?("/microsoft.containerservice/openshiftmanagedclusters") + @@IsAROV3Cluster = true + end + rescue => error + @Log.warn("KubernetesApiClient::IsAROV3Cluster : IsAROV3Cluster failed #{error}") + end + return @@IsAROV3Cluster + end + def isNodeMaster return @@IsNodeMaster if !@@IsNodeMaster.nil? @@IsNodeMaster = false @@ -177,6 +192,22 @@ def isNodeMaster return @@IsNodeMaster end + def getNodesResourceUri(nodesResourceUri) + begin + # For ARO v3 cluster, filter out all other node roles other than compute + if IsAROV3Cluster + if !nodesResourceUri.nil? && !nodesResourceUri.index("?").nil? + nodesResourceUri = nodesResourceUri + "&labelSelector=node-role.kubernetes.io%2Fcompute%3Dtrue" + else + nodesResourceUri = nodesResourceUri + "labelSelector=node-role.kubernetes.io%2Fcompute%3Dtrue" + end + end + rescue => error + @Log.warn("getNodesResourceUri failed: #{error}") + end + return nodesResourceUri + end + #def isValidRunningNode # return @@IsValidRunningNode if !@@IsValidRunningNode.nil? # @@IsValidRunningNode = false @@ -240,7 +271,8 @@ def getPods(namespace) def getWindowsNodes winNodes = [] begin - nodeInventory = JSON.parse(getKubeResourceInfo("nodes").body) + resourceUri = getNodesResourceUri("nodes") + nodeInventory = JSON.parse(getKubeResourceInfo(resourceUri).body) @Log.info "KubernetesAPIClient::getWindowsNodes : Got nodes from kube api" # Resetting the windows node cache @@WinNodeArray.clear @@ -357,6 +389,14 @@ def getContainerResourceRequestsAndLimits(metricJSON, metricCategory, metricName podUid = pod["metadata"]["uid"] end + # For ARO, skip the pods scheduled on to master or infra nodes to ingest + if IsAROV3Cluster && !pod["spec"].nil? && !pod["spec"]["nodeName"].nil? && + ( pod["spec"]["nodeName"].downcase.start_with?("infra-") || + pod["spec"]["nodeName"].downcase.start_with?("master-") ) + next + end + + podContainers = [] if !pod["spec"]["containers"].nil? && !pod["spec"]["containers"].empty? 
podContainers = podContainers + pod["spec"]["containers"] diff --git a/source/code/plugin/filter_cadvisor2mdm.rb b/source/code/plugin/filter_cadvisor2mdm.rb index f14a1369b..bc26532a5 100644 --- a/source/code/plugin/filter_cadvisor2mdm.rb +++ b/source/code/plugin/filter_cadvisor2mdm.rb @@ -145,7 +145,8 @@ def ensure_cpu_memory_capacity_set end begin - nodeInventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo("nodes?fieldSelector=metadata.name%3D#{@@hostName}").body) + resourceUri = KubernetesApiClient.getNodesResourceUri("nodes?fieldSelector=metadata.name%3D#{@@hostName}") + nodeInventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo(resourceUri).body) rescue Exception => e @log.info "Error when getting nodeInventory from kube API. Exception: #{e.class} Message: #{e.message} " ApplicationInsightsUtility.sendExceptionTelemetry(e.backtrace) diff --git a/source/code/plugin/health/health_monitor_utils.rb b/source/code/plugin/health/health_monitor_utils.rb index 13d1416b1..2b5bd85b5 100644 --- a/source/code/plugin/health/health_monitor_utils.rb +++ b/source/code/plugin/health/health_monitor_utils.rb @@ -161,7 +161,8 @@ def get_resource_subscription(pod_inventory, metric_name, metric_capacity) def get_cluster_cpu_memory_capacity(log, node_inventory: nil) begin if node_inventory.nil? - node_inventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo("nodes").body) + resourceUri = KubernetesApiClient.getNodesResourceUri("nodes") + node_inventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo(resourceUri).body) end cluster_cpu_capacity = 0.0 cluster_memory_capacity = 0.0 @@ -207,7 +208,8 @@ def refresh_kubernetes_api_data(log, hostName, force: false) end begin - @@nodeInventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo("nodes").body) + resourceUri = KubernetesApiClient.getNodesResourceUri("nodes") + @@nodeInventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo(resourceUri).body) if !hostName.nil? 
podInventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo("pods?fieldSelector=spec.nodeName%3D#{hostName}").body) else @@ -272,7 +274,8 @@ def ensure_cpu_memory_capacity_set(log, cpu_capacity, memory_capacity, hostname) log.info "CPU and Memory Capacity Not set" begin - @@nodeInventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo("nodes").body) + resourceUri = KubernetesApiClient.getNodesResourceUri("nodes") + @@nodeInventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo(resourceUri).body) rescue Exception => e log.info "Error when getting nodeInventory from kube API. Exception: #{e.class} Message: #{e.message} " ApplicationInsightsUtility.sendExceptionTelemetry(e.backtrace) diff --git a/source/code/plugin/in_kube_events.rb b/source/code/plugin/in_kube_events.rb index 6116cb62d..b405afde9 100644 --- a/source/code/plugin/in_kube_events.rb +++ b/source/code/plugin/in_kube_events.rb @@ -17,7 +17,7 @@ def initialize require_relative "oms_common" require_relative "omslog" require_relative "ApplicationInsightsUtility" - + # 30000 events account to approximately 5MB @EVENTS_CHUNK_SIZE = 30000 end @@ -101,6 +101,14 @@ def parse_and_emit_records(events, eventQueryState, newEventQueryState, batchTim if !eventQueryState.empty? && eventQueryState.include?(eventId) next end + + nodeName = items["source"].key?("host") ? items["source"]["host"] : (OMS::Common.get_hostname) + # For ARO v3 cluster, drop the master and infra node sourced events to ingest + if KubernetesApiClient.isAROV3Cluster && !nodeName.nil? && !nodeName.empty? 
&& + ( nodeName.downcase.start_with?("infra-") || nodeName.downcase.start_with?("master-") ) + next + end + record["ObjectKind"] = items["involvedObject"]["kind"] record["Namespace"] = items["involvedObject"]["namespace"] record["Name"] = items["involvedObject"]["name"] @@ -112,11 +120,7 @@ def parse_and_emit_records(events, eventQueryState, newEventQueryState, batchTim record["FirstSeen"] = items["firstTimestamp"] record["LastSeen"] = items["lastTimestamp"] record["Count"] = items["count"] - if items["source"].key?("host") - record["Computer"] = items["source"]["host"] - else - record["Computer"] = (OMS::Common.get_hostname) - end + record["Computer"] = nodeName record["ClusterName"] = KubernetesApiClient.getClusterName record["ClusterId"] = KubernetesApiClient.getClusterId wrapper = { diff --git a/source/code/plugin/in_kube_health.rb b/source/code/plugin/in_kube_health.rb index 0eebf395b..c54545e04 100644 --- a/source/code/plugin/in_kube_health.rb +++ b/source/code/plugin/in_kube_health.rb @@ -85,7 +85,8 @@ def enumerate #HealthMonitorUtils.refresh_kubernetes_api_data(@@hmlog, nil) # we do this so that if the call fails, we get a response code/header etc. 
- node_inventory_response = KubernetesApiClient.getKubeResourceInfo("nodes") + resourceUri = KubernetesApiClient.getNodesResourceUri("nodes") + node_inventory_response = KubernetesApiClient.getKubeResourceInfo(resourceUri) node_inventory = Yajl::Parser.parse(StringIO.new(node_inventory_response.body)) pod_inventory_response = KubernetesApiClient.getKubeResourceInfo("pods?fieldSelector=metadata.namespace%3D#{@@KubeInfraNamespace}") pod_inventory = Yajl::Parser.parse(StringIO.new(pod_inventory_response.body)) @@ -299,7 +300,8 @@ def process_node_condition_monitor(node_inventory) def initialize_inventory #this is required because there are other components, like the container cpu memory aggregator, that depends on the mapping being initialized - node_inventory_response = KubernetesApiClient.getKubeResourceInfo("nodes") + resourceUri = KubernetesApiClient.getNodesResourceUri("nodes") + node_inventory_response = KubernetesApiClient.getKubeResourceInfo(resourceUri) node_inventory = Yajl::Parser.parse(StringIO.new(node_inventory_response.body)) pod_inventory_response = KubernetesApiClient.getKubeResourceInfo("pods?fieldSelector=metadata.namespace%3D#{@@KubeInfraNamespace}") pod_inventory = Yajl::Parser.parse(StringIO.new(pod_inventory_response.body)) diff --git a/source/code/plugin/in_kube_nodes.rb b/source/code/plugin/in_kube_nodes.rb index fa0994f43..706c3ad13 100644 --- a/source/code/plugin/in_kube_nodes.rb +++ b/source/code/plugin/in_kube_nodes.rb @@ -69,7 +69,9 @@ def enumerate # Initializing continuation token to nil continuationToken = nil $log.info("in_kube_nodes::enumerate : Getting nodes from Kube API @ #{Time.now.utc.iso8601}") - continuationToken, nodeInventory = KubernetesApiClient.getResourcesAndContinuationToken("nodes?limit=#{@NODES_CHUNK_SIZE}") + resourceUri = KubernetesApiClient.getNodesResourceUri("nodes?limit=#{@NODES_CHUNK_SIZE}") + continuationToken, nodeInventory = KubernetesApiClient.getResourcesAndContinuationToken(resourceUri) + 
$log.info("in_kube_nodes::enumerate : Done getting nodes from Kube API @ #{Time.now.utc.iso8601}") if (!nodeInventory.nil? && !nodeInventory.empty? && nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) parse_and_emit_records(nodeInventory, batchTime) @@ -79,7 +81,7 @@ def enumerate #If we receive a continuation token, make calls, process and flush data until we have processed all data while (!continuationToken.nil? && !continuationToken.empty?) - continuationToken, nodeInventory = KubernetesApiClient.getResourcesAndContinuationToken("nodes?limit=#{@NODES_CHUNK_SIZE}&continue=#{continuationToken}") + continuationToken, nodeInventory = KubernetesApiClient.getResourcesAndContinuationToken(resourceUri + "&continue=#{continuationToken}") if (!nodeInventory.nil? && !nodeInventory.empty? && nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) parse_and_emit_records(nodeInventory, batchTime) else diff --git a/source/code/plugin/in_kube_podinventory.rb b/source/code/plugin/in_kube_podinventory.rb index 28b20bfc0..c709edbc2 100644 --- a/source/code/plugin/in_kube_podinventory.rb +++ b/source/code/plugin/in_kube_podinventory.rb @@ -265,6 +265,13 @@ def parse_and_emit_records(podInventory, serviceList, batchTime = Time.utc.iso86 record["Name"] = items["metadata"]["name"] podNameSpace = items["metadata"]["namespace"] + # For ARO v3 cluster, skip the pods scheduled on to master or infra nodes + if KubernetesApiClient.isAROV3Cluster && !items["spec"].nil? && !items["spec"]["nodeName"].nil? 
&& + ( items["spec"]["nodeName"].downcase.start_with?("infra-") || + items["spec"]["nodeName"].downcase.start_with?("master-") ) + next + end + if podNameSpace.eql?("kube-system") && !items["metadata"].key?("ownerReferences") # The above case seems to be the only case where you have horizontal scaling of pods # but no controller, in which case cAdvisor picks up kubernetes.io/config.hash From acc1d278279ff393dd528bda87306d21acfdb064 Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Tue, 28 Jan 2020 15:55:51 -0800 Subject: [PATCH 154/160] MDM plugin support for large scale clusters (#324) * Batch Commit * WIP: Committing move logic from filter to input * WIP : MDM plugins for scale clusters * Bug fixes 1. cpu percentage 2. bytesize on array. Remove log line * Fixing metric value in cadvisor2mdm plugin * WIP to laptop * Working version with cadvisor changes * Fix Health cpu usage * Added uri for cadvisor failure --- installer/datafiles/base_container.data | 2 + .../code/plugin/CAdvisorMetricsAPIClient.rb | 141 +++++++------ source/code/plugin/filter_cadvisor2mdm.rb | 52 +++-- .../plugin/filter_cadvisor_health_node.rb | 4 +- source/code/plugin/filter_inventory2mdm.rb | 4 +- .../plugin/health/health_monitor_utils.rb | 40 +--- source/code/plugin/in_kube_podinventory.rb | 26 ++- source/code/plugin/kubelet_utils.rb | 23 +++ source/code/plugin/out_mdm.rb | 12 +- source/code/plugin/podinventory_to_mdm.rb | 190 ++++++++++++++++++ 10 files changed, 365 insertions(+), 129 deletions(-) create mode 100644 source/code/plugin/kubelet_utils.rb create mode 100644 source/code/plugin/podinventory_to_mdm.rb diff --git a/installer/datafiles/base_container.data b/installer/datafiles/base_container.data index 60de5af18..f976454f9 100644 --- a/installer/datafiles/base_container.data +++ b/installer/datafiles/base_container.data @@ -35,6 +35,8 @@ MAINTAINER: 'Microsoft Corporation' /opt/microsoft/omsagent/plugin/in_win_cadvisor_perf.rb; source/code/plugin/in_win_cadvisor_perf.rb; 644; 
root; root /opt/microsoft/omsagent/plugin/in_kube_nodes.rb; source/code/plugin/in_kube_nodes.rb; 644; root; root /opt/microsoft/omsagent/plugin/filter_inventory2mdm.rb; source/code/plugin/filter_inventory2mdm.rb; 644; root; root +/opt/microsoft/omsagent/plugin/podinventory_to_mdm.rb; source/code/plugin/podinventory_to_mdm.rb; 644; root; root +/opt/microsoft/omsagent/plugin/kubelet_utils.rb; source/code/plugin/kubelet_utils.rb; 644; root; root /opt/microsoft/omsagent/plugin/CustomMetricsUtils.rb; source/code/plugin/CustomMetricsUtils.rb; 644; root; root diff --git a/source/code/plugin/CAdvisorMetricsAPIClient.rb b/source/code/plugin/CAdvisorMetricsAPIClient.rb index 8b0105a6f..54e7e5fd9 100644 --- a/source/code/plugin/CAdvisorMetricsAPIClient.rb +++ b/source/code/plugin/CAdvisorMetricsAPIClient.rb @@ -55,85 +55,58 @@ class CAdvisorMetricsAPIClient # Keeping track of containers so that can delete the container from the container cpu cache when the container is deleted # as a part of the cleanup routine @@winContainerIdCache = [] - + #cadvisor ports + @@CADVISOR_SECURE_PORT = "10250" + @@CADVISOR_NON_SECURE_PORT = "10255" def initialize end class << self def getSummaryStatsFromCAdvisor(winNode) - headers = {} - response = nil - @Log.info "Getting CAdvisor Uri" - begin - cAdvisorSecurePort = false - # Check to see if omsagent needs to use 10255(insecure) port or 10250(secure) port - if !@cAdvisorMetricsSecurePort.nil? && @cAdvisorMetricsSecurePort == "true" - cAdvisorSecurePort = true - end - - cAdvisorUri = getCAdvisorUri(winNode, cAdvisorSecurePort) - bearerToken = File.read("/var/run/secrets/kubernetes.io/serviceaccount/token") - @Log.info "cAdvisorUri: #{cAdvisorUri}" + relativeUri = "/stats/summary" + return getResponse(winNode, relativeUri) + end - if !cAdvisorUri.nil? 
- uri = URI.parse(cAdvisorUri) - if !!cAdvisorSecurePort == true - Net::HTTP.start(uri.host, uri.port, - :use_ssl => true, :open_timeout => 20, :read_timeout => 40, - :ca_file => "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt", - :verify_mode => OpenSSL::SSL::VERIFY_NONE) do |http| - cAdvisorApiRequest = Net::HTTP::Get.new(uri.request_uri) - cAdvisorApiRequest["Authorization"] = "Bearer #{bearerToken}" - response = http.request(cAdvisorApiRequest) - @Log.info "Got response code #{response.code} from #{uri.request_uri}" - end - else - Net::HTTP.start(uri.host, uri.port, :use_ssl => false, :open_timeout => 20, :read_timeout => 40) do |http| - cAdvisorApiRequest = Net::HTTP::Get.new(uri.request_uri) - response = http.request(cAdvisorApiRequest) - @Log.info "Got response code #{response.code} from #{uri.request_uri}" - end - end - end - rescue => error - @Log.warn("CAdvisor api request failed: #{error}") - telemetryProps = {} - telemetryProps["Computer"] = winNode["Hostname"] - ApplicationInsightsUtility.sendExceptionTelemetry(error, telemetryProps) - end - return response + def getNodeCapacityFromCAdvisor(winNode: nil) + relativeUri = "/spec/" + return getResponse(winNode, relativeUri) end - def getCAdvisorUri(winNode, cAdvisorSecurePort) - begin + def getBaseCAdvisorUri(winNode) + cAdvisorSecurePort = isCAdvisorOnSecurePort() + if !!cAdvisorSecurePort == true - defaultHost = "https://localhost:10250" + defaultHost = "https://localhost:#{@@CADVISOR_SECURE_PORT}" else - defaultHost = "http://localhost:10255" + defaultHost = "http://localhost:#{@@CADVISOR_NON_SECURE_PORT}" end - relativeUri = "/stats/summary" if !winNode.nil? - nodeIP = winNode["InternalIP"] + nodeIP = winNode["InternalIP"] else - nodeIP = ENV["NODE_IP"] + nodeIP = ENV["NODE_IP"] end + if !nodeIP.nil? 
- @Log.info("Using #{nodeIP + relativeUri} for CAdvisor Uri") - if !!cAdvisorSecurePort == true - return "https://#{nodeIP}:10250" + relativeUri - else - return "http://#{nodeIP}:10255" + relativeUri - end + @Log.info("Using #{nodeIP} for CAdvisor Host") + if !!cAdvisorSecurePort == true + return "https://#{nodeIP}:#{@@CADVISOR_SECURE_PORT}" + else + return "http://#{nodeIP}:#{@@CADVISOR_NON_SECURE_PORT}" + end else - @Log.warn ("NODE_IP environment variable not set. Using default as : #{defaultHost + relativeUri} ") - if !winNode.nil? - return nil - else - return defaultHost + relativeUri - end + @Log.warn ("NODE_IP environment variable not set. Using default as : #{defaultHost}") + if !winNode.nil? + return nil + else + return defaultHost + end end - end + end + + def getCAdvisorUri(winNode, relativeUri) + baseUri = getBaseCAdvisorUri(winNode) + return baseUri + relativeUri end def getMetrics(winNode: nil, metricTime: Time.now.utc.iso8601) @@ -696,5 +669,51 @@ def getContainerStartTimeMetricItems(metricJSON, hostName, metricNametoReturn, m end return metricItems end + + def getResponse(winNode, relativeUri) + response = nil + @Log.info "Getting CAdvisor Uri Response" + bearerToken = File.read("/var/run/secrets/kubernetes.io/serviceaccount/token") + begin + cAdvisorUri = getCAdvisorUri(winNode, relativeUri) + @Log.info "cAdvisorUri: #{cAdvisorUri}" + + if !cAdvisorUri.nil? 
+ uri = URI.parse(cAdvisorUri) + if isCAdvisorOnSecurePort() + Net::HTTP.start(uri.host, uri.port, + :use_ssl => true, :open_timeout => 20, :read_timeout => 40, + :ca_file => "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt", + :verify_mode => OpenSSL::SSL::VERIFY_NONE) do |http| + cAdvisorApiRequest = Net::HTTP::Get.new(uri.request_uri) + cAdvisorApiRequest["Authorization"] = "Bearer #{bearerToken}" + response = http.request(cAdvisorApiRequest) + @Log.info "Got response code #{response.code} from #{uri.request_uri}" + end + else + Net::HTTP.start(uri.host, uri.port, :use_ssl => false, :open_timeout => 20, :read_timeout => 40) do |http| + cAdvisorApiRequest = Net::HTTP::Get.new(uri.request_uri) + response = http.request(cAdvisorApiRequest) + @Log.info "Got response code #{response.code} from #{uri.request_uri}" + end + end + end + rescue => error + @Log.warn("CAdvisor api request for #{cAdvisorUri} failed: #{error}") + telemetryProps = {} + telemetryProps["Computer"] = winNode["Hostname"] + ApplicationInsightsUtility.sendExceptionTelemetry(error, telemetryProps) + end + return response + end + + def isCAdvisorOnSecurePort + cAdvisorSecurePort = false + # Check to see whether omsagent needs to use 10255(insecure) port or 10250(secure) port + if !@cAdvisorMetricsSecurePort.nil? 
&& @cAdvisorMetricsSecurePort == "true" + cAdvisorSecurePort = true + end + return cAdvisorSecurePort + end end end diff --git a/source/code/plugin/filter_cadvisor2mdm.rb b/source/code/plugin/filter_cadvisor2mdm.rb index bc26532a5..45f0d9d6f 100644 --- a/source/code/plugin/filter_cadvisor2mdm.rb +++ b/source/code/plugin/filter_cadvisor2mdm.rb @@ -7,6 +7,7 @@ module Fluent require 'yajl/json_gem' require_relative 'oms_common' require_relative 'CustomMetricsUtils' + require_relative 'kubelet_utils' class CAdvisor2MdmFilter < Filter Fluent::Plugin.register_filter('filter_cadvisor2mdm', self) @@ -110,9 +111,10 @@ def filter(tag, time, record) metric_value = record['DataItems'][0]['Collections'][0]['Value'] if counter_name.downcase == @@cpu_usage_nano_cores metric_name = @@cpu_usage_milli_cores - metric_value = metric_value/1000000 + metric_value /= 1000000 #cadvisor record is in nanocores. Convert to mc + @log.info "Metric_value: #{metric_value} CPU Capacity #{@cpu_capacity}" if @cpu_capacity != 0.0 - percentage_metric_value = (metric_value*1000000)*100/@cpu_capacity + percentage_metric_value = (metric_value)*100/@cpu_capacity end end @@ -138,34 +140,42 @@ def filter(tag, time, record) def ensure_cpu_memory_capacity_set - @log.info "ensure_cpu_memory_capacity_set @cpu_capacity #{@cpu_capacity} @memory_capacity #{@memory_capacity}" if @cpu_capacity != 0.0 && @memory_capacity != 0.0 @log.info "CPU And Memory Capacity are already set" return end - begin + controller_type = ENV["CONTROLLER_TYPE"] + if controller_type.downcase == 'replicaset' + @log.info "ensure_cpu_memory_capacity_set @cpu_capacity #{@cpu_capacity} @memory_capacity #{@memory_capacity}" + + begin resourceUri = KubernetesApiClient.getNodesResourceUri("nodes?fieldSelector=metadata.name%3D#{@@hostName}") nodeInventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo(resourceUri).body) - rescue Exception => e - @log.info "Error when getting nodeInventory from kube API. 
Exception: #{e.class} Message: #{e.message} " - ApplicationInsightsUtility.sendExceptionTelemetry(e.backtrace) - end - if !nodeInventory.nil? - cpu_capacity_json = KubernetesApiClient.parseNodeLimits(nodeInventory, "capacity", "cpu", "cpuCapacityNanoCores") - if !cpu_capacity_json.nil? && !cpu_capacity_json[0]['DataItems'][0]['Collections'][0]['Value'].to_s.nil? - @cpu_capacity = cpu_capacity_json[0]['DataItems'][0]['Collections'][0]['Value'] - @log.info "CPU Limit #{@cpu_capacity}" - else - @log.info "Error getting cpu_capacity" + rescue Exception => e + @log.info "Error when getting nodeInventory from kube API. Exception: #{e.class} Message: #{e.message} " + ApplicationInsightsUtility.sendExceptionTelemetry(e.backtrace) end - memory_capacity_json = KubernetesApiClient.parseNodeLimits(nodeInventory, "capacity", "memory", "memoryCapacityBytes") - if !memory_capacity_json.nil? && !memory_capacity_json[0]['DataItems'][0]['Collections'][0]['Value'].to_s.nil? - @memory_capacity = memory_capacity_json[0]['DataItems'][0]['Collections'][0]['Value'] - @log.info "Memory Limit #{@memory_capacity}" - else - @log.info "Error getting memory_capacity" + if !nodeInventory.nil? + cpu_capacity_json = KubernetesApiClient.parseNodeLimits(nodeInventory, "capacity", "cpu", "cpuCapacityNanoCores") + if !cpu_capacity_json.nil? && !cpu_capacity_json[0]['DataItems'][0]['Collections'][0]['Value'].to_s.nil? + @cpu_capacity = cpu_capacity_json[0]['DataItems'][0]['Collections'][0]['Value'] + @log.info "CPU Limit #{@cpu_capacity}" + else + @log.info "Error getting cpu_capacity" + end + memory_capacity_json = KubernetesApiClient.parseNodeLimits(nodeInventory, "capacity", "memory", "memoryCapacityBytes") + if !memory_capacity_json.nil? && !memory_capacity_json[0]['DataItems'][0]['Collections'][0]['Value'].to_s.nil? 
+ @memory_capacity = memory_capacity_json[0]['DataItems'][0]['Collections'][0]['Value'] + @log.info "Memory Limit #{@memory_capacity}" + else + @log.info "Error getting memory_capacity" + end end + elsif controller_type.downcase == 'daemonset' + capacity_from_kubelet = KubeletUtils.get_node_capacity + @cpu_capacity = capacity_from_kubelet[0] + @memory_capacity = capacity_from_kubelet[1] end end diff --git a/source/code/plugin/filter_cadvisor_health_node.rb b/source/code/plugin/filter_cadvisor_health_node.rb index c6280db60..4106b4d82 100644 --- a/source/code/plugin/filter_cadvisor_health_node.rb +++ b/source/code/plugin/filter_cadvisor_health_node.rb @@ -131,13 +131,13 @@ def process_node_cpu_record(record, metric_value) else instance_name = record['DataItems'][0]['InstanceName'] #@log.info "CPU capacity #{@cpu_capacity}" - + metric_value /= 1000000 percent = (metric_value.to_f/@cpu_capacity*100).round(2) #@log.debug "Percentage of CPU limit: #{percent}" state = HealthMonitorUtils.compute_percentage_state(percent, @provider.get_config(MonitorId::NODE_CPU_MONITOR_ID)) #@log.debug "Computed State : #{state}" timestamp = record['DataItems'][0]['Timestamp'] - health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"cpuUsageMillicores" => metric_value/1000000.to_f, "cpuUtilizationPercentage" => percent}} + health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"cpuUsageMillicores" => metric_value, "cpuUtilizationPercentage" => percent}} monitor_instance_id = HealthMonitorUtils.get_monitor_instance_id(monitor_id, [@@clusterId, @@hostName]) # temp = record.nil? ? 
"Nil" : record["MonitorInstanceId"] diff --git a/source/code/plugin/filter_inventory2mdm.rb b/source/code/plugin/filter_inventory2mdm.rb index 422b4b54a..16f2bb148 100644 --- a/source/code/plugin/filter_inventory2mdm.rb +++ b/source/code/plugin/filter_inventory2mdm.rb @@ -156,7 +156,7 @@ def process_pod_inventory_records(es) no_phase_dim_values_hash = Hash.new total_pod_count = 0 pod_count_by_phase = {} - podUids = {} + podUids = {} record_count = 0 begin records = [] @@ -165,7 +165,7 @@ def process_pod_inventory_records(es) timestamp = record['DataItems'][0]['CollectionTime'] podUid = record['DataItems'][0]['PodUid'] - if podUids.key?(podUid) + if podUids.key?(podUid) #@log.info "pod with #{podUid} already counted" next end diff --git a/source/code/plugin/health/health_monitor_utils.rb b/source/code/plugin/health/health_monitor_utils.rb index 2b5bd85b5..c23d8824a 100644 --- a/source/code/plugin/health/health_monitor_utils.rb +++ b/source/code/plugin/health/health_monitor_utils.rb @@ -3,6 +3,7 @@ require 'digest' require_relative 'health_model_constants' require 'yajl/json_gem' +require_relative '../kubelet_utils' module HealthModel # static class that provides a bunch of utility methods @@ -265,50 +266,13 @@ def get_monitor_instance_id(monitor_id, args = []) end def ensure_cpu_memory_capacity_set(log, cpu_capacity, memory_capacity, hostname) - log.info "ensure_cpu_memory_capacity_set cpu_capacity #{cpu_capacity} memory_capacity #{memory_capacity}" if cpu_capacity != 1.0 && memory_capacity != 1.0 log.info "CPU And Memory Capacity are already set" return [cpu_capacity, memory_capacity] end - log.info "CPU and Memory Capacity Not set" - begin - resourceUri = KubernetesApiClient.getNodesResourceUri("nodes") - @@nodeInventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo(resourceUri).body) - rescue Exception => e - log.info "Error when getting nodeInventory from kube API. 
Exception: #{e.class} Message: #{e.message} " - ApplicationInsightsUtility.sendExceptionTelemetry(e.backtrace) - end - if !@@nodeInventory.nil? - cpu_capacity_json = KubernetesApiClient.parseNodeLimits(@@nodeInventory, "capacity", "cpu", "cpuCapacityNanoCores") - if !cpu_capacity_json.nil? - cpu_capacity_json.each do |cpu_info_node| - if !cpu_info_node['DataItems'][0]['Host'].nil? && cpu_info_node['DataItems'][0]['Host'] == hostname - if !cpu_info_node['DataItems'][0]['Collections'][0]['Value'].nil? - cpu_capacity = cpu_info_node['DataItems'][0]['Collections'][0]['Value'] - end - end - end - log.info "CPU Limit #{cpu_capacity}" - else - log.info "Error getting cpu_capacity" - end - memory_capacity_json = KubernetesApiClient.parseNodeLimits(@@nodeInventory, "capacity", "memory", "memoryCapacityBytes") - if !memory_capacity_json.nil? - memory_capacity_json.each do |memory_info_node| - if !memory_info_node['DataItems'][0]['Host'].nil? && memory_info_node['DataItems'][0]['Host'] == hostname - if !memory_info_node['DataItems'][0]['Collections'][0]['Value'].nil? 
- memory_capacity = memory_info_node['DataItems'][0]['Collections'][0]['Value'] - end - end - end - log.info "memory Limit #{memory_capacity}" - else - log.info "Error getting memory_capacity" - end - return [cpu_capacity, memory_capacity] - end + return KubeletUtils.get_node_capacity end def build_metrics_hash(metrics_to_collect) diff --git a/source/code/plugin/in_kube_podinventory.rb b/source/code/plugin/in_kube_podinventory.rb index c709edbc2..3a8ad2761 100644 --- a/source/code/plugin/in_kube_podinventory.rb +++ b/source/code/plugin/in_kube_podinventory.rb @@ -2,6 +2,9 @@ # frozen_string_literal: true module Fluent + + require_relative "podinventory_to_mdm" + class Kube_PodInventory_Input < Input Plugin.register_input("kubepodinventory", self) @@ -32,9 +35,12 @@ def initialize config_param :run_interval, :time, :default => 60 config_param :tag, :string, :default => "oms.containerinsights.KubePodInventory" + config_param :custom_metrics_azure_regions, :string + def configure(conf) super + @inventoryToMdmConvertor = Inventory2MdmConvertor.new(@custom_metrics_azure_regions) end def start @@ -87,7 +93,7 @@ def enumerate(podList = nil) continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}") $log.info("in_kube_podinventory::enumerate : Done getting pods from Kube API @ #{Time.now.utc.iso8601}") if (!podInventory.nil? && !podInventory.empty? && podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) - parse_and_emit_records(podInventory, serviceList, batchTime) + parse_and_emit_records(podInventory, serviceList, continuationToken, batchTime) else $log.warn "in_kube_podinventory::enumerate:Received empty podInventory" end @@ -96,7 +102,7 @@ def enumerate(podList = nil) while (!continuationToken.nil? && !continuationToken.empty?) 
continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}&continue=#{continuationToken}") if (!podInventory.nil? && !podInventory.empty? && podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) - parse_and_emit_records(podInventory, serviceList, batchTime) + parse_and_emit_records(podInventory, serviceList, continuationToken, batchTime) else $log.warn "in_kube_podinventory::enumerate:Received empty podInventory" end @@ -246,7 +252,7 @@ def getContainerEnvironmentVariables(pod, clusterCollectEnvironmentVar) end end - def parse_and_emit_records(podInventory, serviceList, batchTime = Time.utc.iso8601) + def parse_and_emit_records(podInventory, serviceList, continuationToken, batchTime = Time.utc.iso8601) currentTime = Time.now emitTime = currentTime.to_f #batchTime = currentTime.utc.iso8601 @@ -466,6 +472,7 @@ def parse_and_emit_records(podInventory, serviceList, batchTime = Time.utc.iso86 "DataItems" => [record.each { |k, v| record[k] = v }], } eventStream.add(emitTime, wrapper) if wrapper + @inventoryToMdmConvertor.process_pod_inventory_record(wrapper) end end # Send container inventory records for containers on windows nodes @@ -483,7 +490,18 @@ def parse_and_emit_records(podInventory, serviceList, batchTime = Time.utc.iso86 end #podInventory block end router.emit_stream(@tag, eventStream) if eventStream - router.emit_stream(@@MDMKubePodInventoryTag, eventStream) if eventStream + + if continuationToken.nil? 
#no more chunks in this batch to be sent, get all pod inventory records to send + @log.info "Sending pod inventory mdm records to out_mdm" + pod_inventory_mdm_records = @inventoryToMdmConvertor.get_pod_inventory_mdm_records(batchTime) + @log.info "pod_inventory_mdm_records.size #{pod_inventory_mdm_records.size}" + mdm_pod_inventory_es = MultiEventStream.new + pod_inventory_mdm_records.each {|pod_inventory_mdm_record| + mdm_pod_inventory_es.add(batchTime, pod_inventory_mdm_record) if pod_inventory_mdm_record + } if pod_inventory_mdm_records + router.emit_stream(@@MDMKubePodInventoryTag, mdm_pod_inventory_es) if mdm_pod_inventory_es + end + #:optimize:kubeperf merge begin #if(!podInventory.empty?) diff --git a/source/code/plugin/kubelet_utils.rb b/source/code/plugin/kubelet_utils.rb new file mode 100644 index 000000000..6d97e30a9 --- /dev/null +++ b/source/code/plugin/kubelet_utils.rb @@ -0,0 +1,23 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +#!/usr/local/bin/ruby +# frozen_string_literal: true + +require_relative 'CAdvisorMetricsAPIClient' + +class KubeletUtils + class << self + def get_node_capacity + + cpu_capacity = 1.0 + memory_capacity = 1.0 + + response = CAdvisorMetricsAPIClient.getNodeCapacityFromCAdvisor(winNode: nil) + if !response.nil? && !response.body.nil? + cpu_capacity = JSON.parse(response.body)["num_cores"].nil? ? 1.0 : (JSON.parse(response.body)["num_cores"] * 1000.0) + memory_capacity = JSON.parse(response.body)["memory_capacity"].nil? ? 
1.0 : JSON.parse(response.body)["memory_capacity"].to_f + $log.info "CPU = #{cpu_capacity}mc Memory = #{memory_capacity/1024/1024}MB" + return [cpu_capacity, memory_capacity] + end + end + end +end \ No newline at end of file diff --git a/source/code/plugin/out_mdm.rb b/source/code/plugin/out_mdm.rb index 0a4e601b2..308eb6c68 100644 --- a/source/code/plugin/out_mdm.rb +++ b/source/code/plugin/out_mdm.rb @@ -22,6 +22,7 @@ def initialize @@post_request_url_template = "https://%{aks_region}.monitoring.azure.com%{aks_resource_id}/metrics" @@token_url_template = "https://login.microsoftonline.com/%{tenant_id}/oauth2/token" @@plugin_name = "AKSCustomMetricsMDM" + @@record_batch_size = 2600 @data_hash = {} @token_url = nil @@ -136,7 +137,14 @@ def write(chunk) chunk.msgpack_each { |(tag, record)| post_body.push(record.to_json) } - send_to_mdm post_body + # the limit of the payload is 1MB. Each record is ~300 bytes. using a batch size of 2600, so that + # the pay load size becomes approximately 800 Kb. 
+ count = post_body.size + while count > 0 + current_batch = post_body.first(@@record_batch_size) + count -= current_batch.size + send_to_mdm current_batch + end else if !@can_send_data_to_mdm @log.info "Cannot send data to MDM since all required conditions were not met" @@ -157,7 +165,9 @@ def send_to_mdm(post_body) request = Net::HTTP::Post.new(@post_request_uri.request_uri) request["Content-Type"] = "application/x-ndjson" request["Authorization"] = "Bearer #{access_token}" + request.body = post_body.join("\n") + @log.info "REQUEST BODY SIZE #{request.body.bytesize/1024}" response = @http_client.request(request) response.value # this throws for non 200 HTTP response code @log.info "HTTP Post Response Code : #{response.code}" diff --git a/source/code/plugin/podinventory_to_mdm.rb b/source/code/plugin/podinventory_to_mdm.rb new file mode 100644 index 000000000..21ef12c34 --- /dev/null +++ b/source/code/plugin/podinventory_to_mdm.rb @@ -0,0 +1,190 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. 
+ +# frozen_string_literal: true + +require 'logger' +require 'yajl/json_gem' +require_relative 'oms_common' +require_relative 'CustomMetricsUtils' + + +class Inventory2MdmConvertor + + @@node_count_metric_name = 'nodesCount' + @@pod_count_metric_name = 'podCount' + @@pod_inventory_tag = 'mdm.kubepodinventory' + @@node_inventory_tag = 'mdm.kubenodeinventory' + @@node_status_ready = 'Ready' + @@node_status_not_ready = 'NotReady' + + @@node_inventory_custom_metrics_template = ' + { + "time": "%{timestamp}", + "data": { + "baseData": { + "metric": "%{metricName}", + "namespace": "insights.container/nodes", + "dimNames": [ + "status" + ], + "series": [ + { + "dimValues": [ + "%{statusValue}" + ], + "min": %{node_status_count}, + "max": %{node_status_count}, + "sum": %{node_status_count}, + "count": 1 + } + ] + } + } + }' + + @@pod_inventory_custom_metrics_template = ' + { + "time": "%{timestamp}", + "data": { + "baseData": { + "metric": "%{metricName}", + "namespace": "insights.container/pods", + "dimNames": [ + "phase", + "Kubernetes namespace", + "node", + "controllerName" + ], + "series": [ + { + "dimValues": [ + "%{phaseDimValue}", + "%{namespaceDimValue}", + "%{nodeDimValue}", + "%{controllerNameDimValue}" + ], + "min": %{podCountMetricValue}, + "max": %{podCountMetricValue}, + "sum": %{podCountMetricValue}, + "count": 1 + } + ] + } + } + }' + + @@pod_phase_values = ['Running', 'Pending', 'Succeeded', 'Failed', 'Unknown'] + @process_incoming_stream = false + + def initialize(custom_metrics_azure_regions) + @log_path = '/var/opt/microsoft/docker-cimprov/log/filter_inventory2mdm.log' + @log = Logger.new(@log_path, 1, 5000000) + @pod_count_hash = {} + @no_phase_dim_values_hash = {} + @pod_count_by_phase = {} + @pod_uids = {} + @process_incoming_stream = CustomMetricsUtils.check_custom_metrics_availability(custom_metrics_azure_regions) + @log.debug "After check_custom_metrics_availability process_incoming_stream #{@process_incoming_stream}" + @log.debug {'Starting 
filter_inventory2mdm plugin'} + end + + def get_pod_inventory_mdm_records(batch_time) + begin + # generate all possible values of non_phase_dim_values X pod Phases and zero-fill the ones that are not already present + @no_phase_dim_values_hash.each {|key, value| + @@pod_phase_values.each{|phase| + pod_key = [key, phase].join('~~') + if !@pod_count_hash.key?(pod_key) + @pod_count_hash[pod_key] = 0 + #@log.info "Zero filled #{pod_key}" + else + next + end + } + } + records = [] + @pod_count_hash.each {|key, value| + key_elements = key.split('~~') + if key_elements.length != 4 + next + end + + # get dimension values by key + podNodeDimValue = key_elements[0] + podNamespaceDimValue = key_elements[1] + podControllerNameDimValue = key_elements[2] + podPhaseDimValue = key_elements[3] + + record = @@pod_inventory_custom_metrics_template % { + timestamp: batch_time, + metricName: @@pod_count_metric_name, + phaseDimValue: podPhaseDimValue, + namespaceDimValue: podNamespaceDimValue, + nodeDimValue: podNodeDimValue, + controllerNameDimValue: podControllerNameDimValue, + podCountMetricValue: value + } + records.push(JSON.parse(record)) + } + rescue Exception => e + @log.info "Error processing pod inventory record Exception: #{e.class} Message: #{e.message}" + ApplicationInsightsUtility.sendExceptionTelemetry(e.backtrace) + return [] + end + @log.info "Pod Count To Phase #{@pod_count_by_phase} " + @log.info "resetting convertor state " + @pod_count_hash = {} + @no_phase_dim_values_hash = {} + @pod_count_by_phase = {} + @pod_uids = {} + return records + end + + def process_pod_inventory_record(record) + if @process_incoming_stream + begin + records = [] + + podUid = record['DataItems'][0]['PodUid'] + if @pod_uids.key?(podUid) + #@log.info "pod with #{podUid} already counted" + return + end + + @pod_uids[podUid] = true + podPhaseDimValue = record['DataItems'][0]['PodStatus'] + podNamespaceDimValue = record['DataItems'][0]['Namespace'] + podControllerNameDimValue = 
record['DataItems'][0]['ControllerName'] + podNodeDimValue = record['DataItems'][0]['Computer'] + + if podControllerNameDimValue.nil? || podControllerNameDimValue.empty? + podControllerNameDimValue = 'No Controller' + end + + if podNodeDimValue.empty? && podPhaseDimValue.downcase == 'pending' + podNodeDimValue = 'unscheduled' + elsif podNodeDimValue.empty? + podNodeDimValue = 'unknown' + end + + # group by distinct dimension values + pod_key = [podNodeDimValue, podNamespaceDimValue, podControllerNameDimValue, podPhaseDimValue].join('~~') + + @pod_count_by_phase[podPhaseDimValue] = @pod_count_by_phase.key?(podPhaseDimValue) ? @pod_count_by_phase[podPhaseDimValue] + 1 : 1 + @pod_count_hash[pod_key] = @pod_count_hash.key?(pod_key) ? @pod_count_hash[pod_key] + 1 : 1 + + # Collect all possible combinations of dimension values other than pod phase + key_without_phase_dim_value = [podNodeDimValue, podNamespaceDimValue, podControllerNameDimValue].join('~~') + if @no_phase_dim_values_hash.key?(key_without_phase_dim_value) + return + else + @no_phase_dim_values_hash[key_without_phase_dim_value] = true + end + rescue Exception => e + @log.info "Error processing pod inventory record Exception: #{e.class} Message: #{e.message}" + ApplicationInsightsUtility.sendExceptionTelemetry(e.backtrace) + end + end + end +end + From 0ea6c6e05cd33efdd5c6c2b15fee5182a7827a25 Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Tue, 28 Jan 2020 16:00:55 -0800 Subject: [PATCH 155/160] Add Null check for kube api responses in in_kube_health (#325) --- source/code/plugin/in_kube_health.rb | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/source/code/plugin/in_kube_health.rb b/source/code/plugin/in_kube_health.rb index c54545e04..f9b211f11 100644 --- a/source/code/plugin/in_kube_health.rb +++ b/source/code/plugin/in_kube_health.rb @@ -87,16 +87,24 @@ def enumerate # we do this so that if the call fails, we get a response code/header etc. 
resourceUri = KubernetesApiClient.getNodesResourceUri("nodes") node_inventory_response = KubernetesApiClient.getKubeResourceInfo(resourceUri) - node_inventory = Yajl::Parser.parse(StringIO.new(node_inventory_response.body)) + if !node_inventory_response.nil? && !node_inventory_response.body.nil? + node_inventory = Yajl::Parser.parse(StringIO.new(node_inventory_response.body)) + @resources.node_inventory = node_inventory + end + pod_inventory_response = KubernetesApiClient.getKubeResourceInfo("pods?fieldSelector=metadata.namespace%3D#{@@KubeInfraNamespace}") - pod_inventory = Yajl::Parser.parse(StringIO.new(pod_inventory_response.body)) + if !pod_inventory_response.nil? && !pod_inventory_response.body.nil? + pod_inventory = Yajl::Parser.parse(StringIO.new(pod_inventory_response.body)) + @resources.pod_inventory = pod_inventory + @resources.build_pod_uid_lookup + end + replicaset_inventory_response = KubernetesApiClient.getKubeResourceInfo("replicasets?fieldSelector=metadata.namespace%3D#{@@KubeInfraNamespace}", api_group: @@ApiGroupApps) - replicaset_inventory = Yajl::Parser.parse(StringIO.new(replicaset_inventory_response.body)) + if !replicaset_inventory_response.nil? && !replicaset_inventory_response.body.nil? 
+ replicaset_inventory = Yajl::Parser.parse(StringIO.new(replicaset_inventory_response.body)) + @resources.set_replicaset_inventory(replicaset_inventory) + end - @resources.node_inventory = node_inventory - @resources.pod_inventory = pod_inventory - @resources.set_replicaset_inventory(replicaset_inventory) - @resources.build_pod_uid_lookup if node_inventory_response.code.to_i != 200 record = process_kube_api_up_monitor("fail", node_inventory_response) From 843100c23387d9ad15be5abad49205a67602a33c Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Tue, 4 Feb 2020 15:06:35 -0800 Subject: [PATCH 156/160] Fix casing bug (#326) --- source/code/plugin/KubernetesApiClient.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/code/plugin/KubernetesApiClient.rb b/source/code/plugin/KubernetesApiClient.rb index 6f108ec92..91b76bbf1 100644 --- a/source/code/plugin/KubernetesApiClient.rb +++ b/source/code/plugin/KubernetesApiClient.rb @@ -195,7 +195,7 @@ def isNodeMaster def getNodesResourceUri(nodesResourceUri) begin # For ARO v3 cluster, filter out all other node roles other than compute - if IsAROV3Cluster + if isAROV3Cluster() if !nodesResourceUri.nil? && !nodesResourceUri.index("?").nil? nodesResourceUri = nodesResourceUri + "&labelSelector=node-role.kubernetes.io%2Fcompute%3Dtrue" else @@ -390,7 +390,7 @@ def getContainerResourceRequestsAndLimits(metricJSON, metricCategory, metricName end # For ARO, skip the pods scheduled on to master or infra nodes to ingest - if IsAROV3Cluster && !pod["spec"].nil? && !pod["spec"]["nodeName"].nil? && + if isAROV3Cluster() && !pod["spec"].nil? && !pod["spec"]["nodeName"].nil? 
&& ( pod["spec"]["nodeName"].downcase.start_with?("infra-") || pod["spec"]["nodeName"].downcase.start_with?("master-") ) next From 2c32e5797a6deb38beb275b4fdc8e1490efa0fcc Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Fri, 7 Feb 2020 09:51:48 -0800 Subject: [PATCH 157/160] Missed kube.conf update (#327) --- installer/conf/kube.conf | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/installer/conf/kube.conf b/installer/conf/kube.conf index 207780442..77c8454a6 100644 --- a/installer/conf/kube.conf +++ b/installer/conf/kube.conf @@ -13,6 +13,7 @@ tag oms.containerinsights.KubePodInventory run_interval 60 log_level debug + custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast #Kubernetes events @@ -47,7 +48,7 @@ log_level debug - + type filter_inventory2mdm custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast log_level info @@ -140,7 +141,7 @@ max_retry_wait 5m - + type out_oms log_level debug num_threads 5 From b10fee9b3ea1a899ae4ed7ac7b02171dce8ae04c Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Thu, 20 Feb 2020 16:49:48 -0800 Subject: [PATCH 158/160] changes to use msi if service principal does not exist (#328) changes to use msi if service principal does not exist (#328) --- installer/conf/container.conf | 2 +- installer/scripts/tomlparser.rb | 12 ++ source/code/plugin/in_kube_events.rb | 34 +++++- source/code/plugin/in_kube_podinventory.rb | 49 ++++---- source/code/plugin/out_mdm.rb | 125 +++++++++++++++------ 5 files changed, 164 insertions(+), 58 deletions(-) diff --git a/installer/conf/container.conf 
b/installer/conf/container.conf index 93c250fbb..0e088e7f7 100644 --- a/installer/conf/container.conf +++ b/installer/conf/container.conf @@ -110,5 +110,5 @@ retry_limit 10 retry_wait 5s max_retry_wait 5m - retry_mdm_post_wait_minutes 60 + retry_mdm_post_wait_minutes 30 diff --git a/installer/scripts/tomlparser.rb b/installer/scripts/tomlparser.rb index ba67d023a..5f2596bca 100644 --- a/installer/scripts/tomlparser.rb +++ b/installer/scripts/tomlparser.rb @@ -16,6 +16,7 @@ @logExclusionRegexPattern = "(^((?!stdout|stderr).)*$)" @excludePath = "*.csv2" #some invalid path @enrichContainerLogs = false +@collectAllKubeEvents = false # Use parser to parse the configmap toml file to a ruby structure def parseConfigMap @@ -128,6 +129,16 @@ def populateSettingValuesFromConfigMap(parsedConfig) rescue => errorStr ConfigParseErrorLogger.logError("Exception while reading config map settings for cluster level container log enrichment - #{errorStr}, using defaults, please check config map for errors") end + + #Get kube events enrichment setting + begin + if !parsedConfig[:log_collection_settings][:collect_all_kube_events].nil? && !parsedConfig[:log_collection_settings][:collect_all_kube_events][:enabled].nil? 
+ @collectAllKubeEvents = parsedConfig[:log_collection_settings][:collect_all_kube_events][:enabled] + puts "config::Using config map setting for kube event collection" + end + rescue => errorStr + ConfigParseErrorLogger.logError("Exception while reading config map settings for kube event collection - #{errorStr}, using defaults, please check config map for errors") + end end end @@ -168,6 +179,7 @@ def populateSettingValuesFromConfigMap(parsedConfig) file.write("export AZMON_CLUSTER_COLLECT_ENV_VAR=#{@collectClusterEnvVariables}\n") file.write("export AZMON_CLUSTER_LOG_TAIL_EXCLUDE_PATH=#{@excludePath}\n") file.write("export AZMON_CLUSTER_CONTAINER_LOG_ENRICH=#{@enrichContainerLogs}\n") + file.write("export AZMON_CLUSTER_COLLECT_ALL_KUBE_EVENTS=#{@collectAllKubeEvents}\n") # Close file after writing all environment variables file.close puts "Both stdout & stderr log collection are turned off for namespaces: '#{@excludePath}' " diff --git a/source/code/plugin/in_kube_events.rb b/source/code/plugin/in_kube_events.rb index b405afde9..bb0ab6f05 100644 --- a/source/code/plugin/in_kube_events.rb +++ b/source/code/plugin/in_kube_events.rb @@ -4,7 +4,6 @@ module Fluent class Kube_Event_Input < Input Plugin.register_input("kubeevents", self) - @@KubeEventsStateFile = "/var/opt/microsoft/docker-cimprov/state/KubeEventQueryState.yaml" def initialize @@ -20,6 +19,12 @@ def initialize # 30000 events account to approximately 5MB @EVENTS_CHUNK_SIZE = 30000 + + # Initializing events count for telemetry + @eventsCount = 0 + + # Initilize enable/disable normal event collection + @collectAllKubeEvents = false end config_param :run_interval, :time, :default => 60 @@ -35,6 +40,16 @@ def start @condition = ConditionVariable.new @mutex = Mutex.new @thread = Thread.new(&method(:run_periodic)) + collectAllKubeEventsSetting = ENV["AZMON_CLUSTER_COLLECT_ALL_KUBE_EVENTS"] + if !collectAllKubeEventsSetting.nil? && !collectAllKubeEventsSetting.empty? 
+ if collectAllKubeEventsSetting.casecmp("false") == 0 + @collectAllKubeEvents = false + $log.warn("Normal kube events collection disabled for cluster") + else + @collectAllKubeEvents = true + $log.warn("Normal kube events collection enabled for cluster") + end + end end end @@ -55,11 +70,16 @@ def enumerate batchTime = currentTime.utc.iso8601 eventQueryState = getEventQueryState newEventQueryState = [] + @eventsCount = 0 # Initializing continuation token to nil continuationToken = nil $log.info("in_kube_events::enumerate : Getting events from Kube API @ #{Time.now.utc.iso8601}") - continuationToken, eventList = KubernetesApiClient.getResourcesAndContinuationToken("events?fieldSelector=type!=Normal&limit=#{@EVENTS_CHUNK_SIZE}") + if @collectAllKubeEvents + continuationToken, eventList = KubernetesApiClient.getResourcesAndContinuationToken("events?limit=#{@EVENTS_CHUNK_SIZE}") + else + continuationToken, eventList = KubernetesApiClient.getResourcesAndContinuationToken("events?fieldSelector=type!=Normal&limit=#{@EVENTS_CHUNK_SIZE}") + end $log.info("in_kube_events::enumerate : Done getting events from Kube API @ #{Time.now.utc.iso8601}") if (!eventList.nil? && !eventList.empty? && eventList.key?("items") && !eventList["items"].nil? && !eventList["items"].empty?) 
newEventQueryState = parse_and_emit_records(eventList, eventQueryState, newEventQueryState, batchTime) @@ -80,6 +100,13 @@ def enumerate # Setting this to nil so that we dont hold memory until GC kicks in eventList = nil writeEventQueryState(newEventQueryState) + + # Flush AppInsights telemetry once all the processing is done, only if the number of events flushed is greater than 0 + if (@eventsCount > 0) + telemetryProperties = {} + telemetryProperties["CollectAllKubeEvents"] = @collectAllKubeEvents + ApplicationInsightsUtility.sendMetricTelemetry("EventCount", @eventsCount, {}) + end rescue => errorStr $log.warn "in_kube_events::enumerate:Failed in enumerate: #{errorStr}" $log.debug_backtrace(errorStr.backtrace) @@ -105,7 +132,7 @@ def parse_and_emit_records(events, eventQueryState, newEventQueryState, batchTim nodeName = items["source"].key?("host") ? items["source"]["host"] : (OMS::Common.get_hostname) # For ARO v3 cluster, drop the master and infra node sourced events to ingest if KubernetesApiClient.isAROV3Cluster && !nodeName.nil? && !nodeName.empty? 
&& - ( nodeName.downcase.start_with?("infra-") || nodeName.downcase.start_with?("master-") ) + (nodeName.downcase.start_with?("infra-") || nodeName.downcase.start_with?("master-")) next end @@ -129,6 +156,7 @@ def parse_and_emit_records(events, eventQueryState, newEventQueryState, batchTim "DataItems" => [record.each { |k, v| record[k] = v }], } eventStream.add(emitTime, wrapper) if wrapper + @eventsCount += 1 end router.emit_stream(@tag, eventStream) if eventStream rescue => errorStr diff --git a/source/code/plugin/in_kube_podinventory.rb b/source/code/plugin/in_kube_podinventory.rb index 3a8ad2761..3a78d4c05 100644 --- a/source/code/plugin/in_kube_podinventory.rb +++ b/source/code/plugin/in_kube_podinventory.rb @@ -2,7 +2,6 @@ # frozen_string_literal: true module Fluent - require_relative "podinventory_to_mdm" class Kube_PodInventory_Input < Input @@ -37,7 +36,6 @@ def initialize config_param :tag, :string, :default => "oms.containerinsights.KubePodInventory" config_param :custom_metrics_azure_regions, :string - def configure(conf) super @inventoryToMdmConvertor = Inventory2MdmConvertor.new(@custom_metrics_azure_regions) @@ -149,18 +147,25 @@ def populateWindowsContainerInventoryRecord(container, record, containerEnvVaria containerInventoryRecord["Computer"] = record["Computer"] containerInventoryRecord["ContainerHostname"] = record["Computer"] containerInventoryRecord["ElementName"] = containerName - image = container["image"] - repoInfo = image.split("/") - if !repoInfo.nil? - containerInventoryRecord["Repository"] = repoInfo[0] - if !repoInfo[1].nil? - imageInfo = repoInfo[1].split(":") - if !imageInfo.nil? - containerInventoryRecord["Image"] = imageInfo[0] - containerInventoryRecord["ImageTag"] = imageInfo[1] + + # Find delimiters in the string of format repository/image:imagetag + imageValue = container["image"] + if !imageValue.empty? + slashLocation = imageValue.index("/") + colonLocation = imageValue.index(":") + if !colonLocation.nil? 
+ if slashLocation.nil? + # image:imagetag + containerInventoryRecord["Image"] = imageValue[0..(colonLocation - 1)] + else + # repository/image:imagetag + containerInventoryRecord["Repository"] = imageValue[0..(slashLocation - 1)] + containerInventoryRecord["Image"] = imageValue[(slashLocation + 1)..(colonLocation - 1)] end + containerInventoryRecord["ImageTag"] = imageValue[(colonLocation + 1)..-1] end end + imageIdInfo = container["imageID"] imageIdSplitInfo = imageIdInfo.split("@") if !imageIdSplitInfo.nil? @@ -273,8 +278,8 @@ def parse_and_emit_records(podInventory, serviceList, continuationToken, batchTi # For ARO v3 cluster, skip the pods scheduled on to master or infra nodes if KubernetesApiClient.isAROV3Cluster && !items["spec"].nil? && !items["spec"]["nodeName"].nil? && - ( items["spec"]["nodeName"].downcase.start_with?("infra-") || - items["spec"]["nodeName"].downcase.start_with?("master-") ) + (items["spec"]["nodeName"].downcase.start_with?("infra-") || + items["spec"]["nodeName"].downcase.start_with?("master-")) next end @@ -491,15 +496,15 @@ def parse_and_emit_records(podInventory, serviceList, continuationToken, batchTi router.emit_stream(@tag, eventStream) if eventStream - if continuationToken.nil? #no more chunks in this batch to be sent, get all pod inventory records to send - @log.info "Sending pod inventory mdm records to out_mdm" - pod_inventory_mdm_records = @inventoryToMdmConvertor.get_pod_inventory_mdm_records(batchTime) - @log.info "pod_inventory_mdm_records.size #{pod_inventory_mdm_records.size}" - mdm_pod_inventory_es = MultiEventStream.new - pod_inventory_mdm_records.each {|pod_inventory_mdm_record| - mdm_pod_inventory_es.add(batchTime, pod_inventory_mdm_record) if pod_inventory_mdm_record - } if pod_inventory_mdm_records - router.emit_stream(@@MDMKubePodInventoryTag, mdm_pod_inventory_es) if mdm_pod_inventory_es + if continuationToken.nil? 
#no more chunks in this batch to be sent, get all pod inventory records to send + @log.info "Sending pod inventory mdm records to out_mdm" + pod_inventory_mdm_records = @inventoryToMdmConvertor.get_pod_inventory_mdm_records(batchTime) + @log.info "pod_inventory_mdm_records.size #{pod_inventory_mdm_records.size}" + mdm_pod_inventory_es = MultiEventStream.new + pod_inventory_mdm_records.each { |pod_inventory_mdm_record| + mdm_pod_inventory_es.add(batchTime, pod_inventory_mdm_record) if pod_inventory_mdm_record + } if pod_inventory_mdm_records + router.emit_stream(@@MDMKubePodInventoryTag, mdm_pod_inventory_es) if mdm_pod_inventory_es end #:optimize:kubeperf merge diff --git a/source/code/plugin/out_mdm.rb b/source/code/plugin/out_mdm.rb index 308eb6c68..2f90b89ee 100644 --- a/source/code/plugin/out_mdm.rb +++ b/source/code/plugin/out_mdm.rb @@ -12,7 +12,7 @@ def initialize require "net/http" require "net/https" require "uri" - require 'yajl/json_gem' + require "yajl/json_gem" require_relative "KubernetesApiClient" require_relative "ApplicationInsightsUtility" @@ -20,12 +20,19 @@ def initialize @@grant_type = "client_credentials" @@azure_json_path = "/etc/kubernetes/host/azure.json" @@post_request_url_template = "https://%{aks_region}.monitoring.azure.com%{aks_resource_id}/metrics" - @@token_url_template = "https://login.microsoftonline.com/%{tenant_id}/oauth2/token" + @@aad_token_url_template = "https://login.microsoftonline.com/%{tenant_id}/oauth2/token" + + # msiEndpoint is the well known endpoint for getting MSI authentications tokens + @@msi_endpoint_template = "http://169.254.169.254/metadata/identity/oauth2/token?api-version=2018-02-01&client_id=%{user_assigned_client_id}&resource=%{resource}" + @@userAssignedClientId = ENV["USER_ASSIGNED_IDENTITY_CLIENT_ID"] + @@plugin_name = "AKSCustomMetricsMDM" @@record_batch_size = 2600 + @@tokenRefreshBackoffInterval = 30 + @data_hash = {} - @token_url = nil + @parsed_token_uri = nil @http_client = nil @token_expiry_time 
= Time.now @cached_access_token = String.new @@ -33,6 +40,10 @@ def initialize @first_post_attempt_made = false @can_send_data_to_mdm = true @last_telemetry_sent_time = nil + # Setting useMsi to false by default + @useMsi = false + + @get_access_token_backoff_expiry = Time.now end def configure(conf) @@ -57,51 +68,102 @@ def start @log.info "Environment Variable AKS_REGION is not set.. " @can_send_data_to_mdm = false else - aks_region = aks_region.gsub(" ","") + aks_region = aks_region.gsub(" ", "") end if @can_send_data_to_mdm @log.info "MDM Metrics supported in #{aks_region} region" - @token_url = @@token_url_template % {tenant_id: @data_hash["tenantId"]} - @cached_access_token = get_access_token + @@post_request_url = @@post_request_url_template % {aks_region: aks_region, aks_resource_id: aks_resource_id} @post_request_uri = URI.parse(@@post_request_url) @http_client = Net::HTTP.new(@post_request_uri.host, @post_request_uri.port) @http_client.use_ssl = true @log.info "POST Request url: #{@@post_request_url}" ApplicationInsightsUtility.sendCustomEvent("AKSCustomMetricsMDMPluginStart", {}) + + # Check to see if SP exists, if it does use SP. Else, use msi + sp_client_id = @data_hash["aadClientId"] + sp_client_secret = @data_hash["aadClientSecret"] + + if (!sp_client_id.nil? && !sp_client_id.empty? 
&& sp_client_id != "msi") + @useMsi = false + aad_token_url = @@aad_token_url_template % {tenant_id: @data_hash["tenantId"]} + @parsed_token_uri = URI.parse(aad_token_url) + else + @useMsi = true + msi_endpoint = @@msi_endpoint_template % {user_assigned_client_id: @@userAssignedClientId, resource: @@token_resource_url} + @parsed_token_uri = URI.parse(msi_endpoint) + end + + @cached_access_token = get_access_token end rescue => e @log.info "exception when initializing out_mdm #{e}" ApplicationInsightsUtility.sendExceptionTelemetry(e, {"FeatureArea" => "MDM"}) - @can_send_data_to_mdm = false return end - end - # get the access token only if the time to expiry is less than 5 minutes + # get the access token only if the time to expiry is less than 5 minutes and get_access_token_backoff has expired def get_access_token - if @cached_access_token.to_s.empty? || (Time.now + 5 * 60 > @token_expiry_time) # token is valid for 60 minutes. Refresh token 5 minutes from expiration - @log.info "Refreshing access token for out_mdm plugin.." - token_uri = URI.parse(@token_url) - http_access_token = Net::HTTP.new(token_uri.host, token_uri.port) - http_access_token.use_ssl = true - token_request = Net::HTTP::Post.new(token_uri.request_uri) - token_request.set_form_data( - { - "grant_type" => @@grant_type, - "client_id" => @data_hash["aadClientId"], - "client_secret" => @data_hash["aadClientSecret"], - "resource" => @@token_resource_url, - } - ) - - token_response = http_access_token.request(token_request) - # Handle the case where the response is not 200 - parsed_json = JSON.parse(token_response.body) - @token_expiry_time = Time.now + 59 * 60 # set the expiry time to be ~one hour from current time - @cached_access_token = parsed_json["access_token"] + if (Time.now > @get_access_token_backoff_expiry) + http_access_token = nil + retries = 0 + begin + if @cached_access_token.to_s.empty? 
|| (Time.now + 5 * 60 > @token_expiry_time) # Refresh token 5 minutes from expiration + @log.info "Refreshing access token for out_mdm plugin.." + + if (!!@useMsi) + @log.info "Using msi to get the token to post MDM data" + ApplicationInsightsUtility.sendCustomEvent("AKSCustomMetricsMDMToken-MSI", {}) + @log.info "Opening TCP connection" + http_access_token = Net::HTTP.start(@parsed_token_uri.host, @parsed_token_uri.port, :use_ssl => false) + # http_access_token.use_ssl = false + token_request = Net::HTTP::Get.new(@parsed_token_uri.request_uri) + token_request["Metadata"] = true + else + @log.info "Using SP to get the token to post MDM data" + ApplicationInsightsUtility.sendCustomEvent("AKSCustomMetricsMDMToken-SP", {}) + @log.info "Opening TCP connection" + http_access_token = Net::HTTP.start(@parsed_token_uri.host, @parsed_token_uri.port, :use_ssl => true) + # http_access_token.use_ssl = true + token_request = Net::HTTP::Post.new(@parsed_token_uri.request_uri) + token_request.set_form_data( + { + "grant_type" => @@grant_type, + "client_id" => @data_hash["aadClientId"], + "client_secret" => @data_hash["aadClientSecret"], + "resource" => @@token_resource_url, + } + ) + end + + @log.info "making request to get token.." 
+ token_response = http_access_token.request(token_request) + # Handle the case where the response is not 200 + parsed_json = JSON.parse(token_response.body) + @token_expiry_time = Time.now + @@tokenRefreshBackoffInterval * 60 # set the expiry time to be ~thirty minutes from current time + @cached_access_token = parsed_json["access_token"] + @log.info "Successfully got access token" + end + rescue => err + @log.info "Exception in get_access_token: #{err}" + if (retries < 2) + retries += 1 + @log.info "Retrying request to get token - retry number: #{retries}" + sleep(retries) + retry + else + @get_access_token_backoff_expiry = Time.now + @@tokenRefreshBackoffInterval * 60 + @log.info "@get_access_token_backoff_expiry set to #{@get_access_token_backoff_expiry}" + ApplicationInsightsUtility.sendExceptionTelemetry(err, {"FeatureArea" => "MDM"}) + end + ensure + if http_access_token + @log.info "Closing http connection" + http_access_token.finish + end + end end @cached_access_token end @@ -172,10 +234,9 @@ def send_to_mdm(post_body) response.value # this throws for non 200 HTTP response code @log.info "HTTP Post Response Code : #{response.code}" if @last_telemetry_sent_time.nil? 
|| @last_telemetry_sent_time + 60 * 60 < Time.now - ApplicationInsightsUtility.sendCustomEvent("AKSCustomMetricsMDMSendSuccessful", {}) - @last_telemetry_sent_time = Time.now + ApplicationInsightsUtility.sendCustomEvent("AKSCustomMetricsMDMSendSuccessful", {}) + @last_telemetry_sent_time = Time.now end - rescue Net::HTTPServerException => e @log.info "Failed to Post Metrics to MDM : #{e} Response: #{response}" @log.debug_backtrace(e.backtrace) From f820075ef14d71751ad7702e011cbc44accff7c4 Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Mon, 24 Feb 2020 15:41:37 -0800 Subject: [PATCH 159/160] Adding caseinsensitive compare (#330) Adding case insensitive compare --- source/code/plugin/out_mdm.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/code/plugin/out_mdm.rb b/source/code/plugin/out_mdm.rb index 2f90b89ee..243251bca 100644 --- a/source/code/plugin/out_mdm.rb +++ b/source/code/plugin/out_mdm.rb @@ -85,7 +85,7 @@ def start sp_client_id = @data_hash["aadClientId"] sp_client_secret = @data_hash["aadClientSecret"] - if (!sp_client_id.nil? && !sp_client_id.empty? && sp_client_id != "msi") + if (!sp_client_id.nil? && !sp_client_id.empty? 
&& sp_client_id.downcase != "msi") @useMsi = false aad_token_url = @@aad_token_url_template % {tenant_id: @data_hash["tenantId"]} @parsed_token_uri = URI.parse(aad_token_url) From 03d90dec9a293831bfbf2361f3f2ea699ba47482 Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Tue, 25 Feb 2020 10:46:21 -0800 Subject: [PATCH 160/160] gpu monitoring (#329) * gpu monitoring * Emit info log for tests for the new insightsmetrics data stream --- installer/conf/container.conf | 14 ++ installer/conf/kube.conf | 15 ++ installer/datafiles/base_container.data | 2 +- .../code/plugin/CAdvisorMetricsAPIClient.rb | 96 +++++++++++++ source/code/plugin/KubernetesApiClient.rb | 131 ++++++++++++++++++ source/code/plugin/constants.rb | 15 ++ source/code/plugin/in_cadvisor_perf.rb | 33 ++++- source/code/plugin/in_kube_nodes.rb | 48 ++++++- source/code/plugin/in_kube_podinventory.rb | 37 ++++- source/code/plugin/in_win_cadvisor_perf.rb | 31 ++++- 10 files changed, 417 insertions(+), 5 deletions(-) create mode 100644 source/code/plugin/constants.rb diff --git a/installer/conf/container.conf b/installer/conf/container.conf index 0e088e7f7..16acd6353 100644 --- a/installer/conf/container.conf +++ b/installer/conf/container.conf @@ -112,3 +112,17 @@ max_retry_wait 5m retry_mdm_post_wait_minutes 30 + + + type out_oms + log_level debug + num_threads 5 + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_insightsmetrics*.buffer + buffer_queue_full_action drop_oldest_chunk + buffer_chunk_limit 4m + flush_interval 20s + retry_limit 10 + retry_wait 5s + max_retry_wait 5m + diff --git a/installer/conf/kube.conf b/installer/conf/kube.conf index 77c8454a6..98a2fbb63 100644 --- a/installer/conf/kube.conf +++ b/installer/conf/kube.conf @@ -216,4 +216,19 @@ retry_limit 10 retry_wait 5s max_retry_wait 5m + + + + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 4m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_insightsmetrics*.buffer + buffer_queue_limit 20 + 
buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 5s + max_retry_wait 5m \ No newline at end of file diff --git a/installer/datafiles/base_container.data b/installer/datafiles/base_container.data index f976454f9..e011dddf9 100644 --- a/installer/datafiles/base_container.data +++ b/installer/datafiles/base_container.data @@ -38,7 +38,7 @@ MAINTAINER: 'Microsoft Corporation' /opt/microsoft/omsagent/plugin/podinventory_to_mdm.rb; source/code/plugin/podinventory_to_mdm.rb; 644; root; root /opt/microsoft/omsagent/plugin/kubelet_utils.rb; source/code/plugin/kubelet_utils.rb; 644; root; root /opt/microsoft/omsagent/plugin/CustomMetricsUtils.rb; source/code/plugin/CustomMetricsUtils.rb; 644; root; root - +/opt/microsoft/omsagent/plugin/constants.rb; source/code/plugin/constants.rb; 644; root; root /opt/microsoft/omsagent/plugin/ApplicationInsightsUtility.rb; source/code/plugin/ApplicationInsightsUtility.rb; 644; root; root /opt/microsoft/omsagent/plugin/ContainerInventoryState.rb; source/code/plugin/ContainerInventoryState.rb; 644; root; root diff --git a/source/code/plugin/CAdvisorMetricsAPIClient.rb b/source/code/plugin/CAdvisorMetricsAPIClient.rb index 54e7e5fd9..53139ea4e 100644 --- a/source/code/plugin/CAdvisorMetricsAPIClient.rb +++ b/source/code/plugin/CAdvisorMetricsAPIClient.rb @@ -13,6 +13,7 @@ class CAdvisorMetricsAPIClient require_relative "oms_common" require_relative "KubernetesApiClient" require_relative "ApplicationInsightsUtility" + require_relative "constants" @configMapMountPath = "/etc/config/settings/log-data-collection-settings" @promConfigMountPath = "/etc/config/settings/prometheus-data-collection-settings" @@ -255,6 +256,101 @@ def getContainerCpuMetricItems(metricJSON, hostName, cpuMetricNameToCollect, met return metricItems end + def getInsightsMetrics(winNode: nil, metricTime: Time.now.utc.iso8601) + metricDataItems = [] + begin + cAdvisorStats = getSummaryStatsFromCAdvisor(winNode) + if 
!cAdvisorStats.nil? + metricInfo = JSON.parse(cAdvisorStats.body) + end + if !winNode.nil? + hostName = winNode["Hostname"] + operatingSystem = "Windows" + else + if !metricInfo.nil? && !metricInfo["node"].nil? && !metricInfo["node"]["nodeName"].nil? + hostName = metricInfo["node"]["nodeName"] + else + hostName = (OMS::Common.get_hostname) + end + operatingSystem = "Linux" + end + if !metricInfo.nil? + metricDataItems.concat(getContainerGpuMetricsAsInsightsMetrics(metricInfo, hostName, "memoryTotal", "containerGpumemoryTotalBytes", metricTime)) + metricDataItems.concat(getContainerGpuMetricsAsInsightsMetrics(metricInfo, hostName, "memoryUsed","containerGpumemoryUsedBytes", metricTime)) + metricDataItems.concat(getContainerGpuMetricsAsInsightsMetrics(metricInfo, hostName, "dutyCycle","containerGpuDutyCycle", metricTime)) + else + @Log.warn("Couldn't get Insights metrics information for host: #{hostName} os:#{operatingSystem}") + end + rescue => error + @Log.warn("CAdvisorMetricsAPIClient::getInsightsMetrics failed: #{error}") + return metricDataItems + end + return metricDataItems + end + + def getContainerGpuMetricsAsInsightsMetrics(metricJSON, hostName, metricNameToCollect, metricNametoReturn, metricPollTime) + metricItems = [] + clusterId = KubernetesApiClient.getClusterId + clusterName = KubernetesApiClient.getClusterName + begin + metricInfo = metricJSON + metricInfo["pods"].each do |pod| + podUid = pod["podRef"]["uid"] + podName = pod["podRef"]["name"] + podNamespace = pod["podRef"]["namespace"] + + if (!pod["containers"].nil?) + pod["containers"].each do |container| + #gpu metrics + if (!container["accelerators"].nil?) + container["accelerators"].each do |accelerator| + if (!accelerator[metricNameToCollect].nil?) 
#empty check is invalid for non-strings + containerName = container["name"] + metricValue = accelerator[metricNameToCollect] + + + metricItem = {} + metricItem["CollectionTime"] = metricPollTime + metricItem["Computer"] = hostName + metricItem["Name"] = metricNametoReturn + metricItem["Value"] = metricValue + metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN + metricItem["Namespace"] = Constants::INSIGHTSMETRICS_TAGS_GPU_NAMESPACE + + metricTags = {} + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID ] = clusterId + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = clusterName + metricTags[Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] = podUid + "/" + containerName + #metricTags[Constants::INSIGHTSMETRICS_TAGS_K8SNAMESPACE] = podNameSpace + + if (!accelerator["make"].nil? && !accelerator["make"].empty?) + metricTags[Constants::INSIGHTSMETRICS_TAGS_GPU_VENDOR] = accelerator["make"] + end + + if (!accelerator["model"].nil? && !accelerator["model"].empty?) + metricTags[Constants::INSIGHTSMETRICS_TAGS_GPU_MODEL] = accelerator["model"] + end + + if (!accelerator["id"].nil? && !accelerator["id"].empty?) 
+ metricTags[Constants::INSIGHTSMETRICS_TAGS_GPU_ID] = accelerator["id"] + end + + metricItem["Tags"] = metricTags + + metricItems.push(metricItem) + end + end + end + end + end + end + rescue => errorStr + @Log.warn("getContainerGpuMetricsAsInsightsMetrics failed: #{errorStr} for metric #{metricNameToCollect}") + return metricItems + end + return metricItems + end + def clearDeletedWinContainersFromCache() begin winCpuUsageNanoSecondsKeys = @@winContainerCpuUsageNanoSecondsLast.keys diff --git a/source/code/plugin/KubernetesApiClient.rb b/source/code/plugin/KubernetesApiClient.rb index 91b76bbf1..b864ef718 100644 --- a/source/code/plugin/KubernetesApiClient.rb +++ b/source/code/plugin/KubernetesApiClient.rb @@ -10,6 +10,7 @@ class KubernetesApiClient require "time" require_relative "oms_common" + require_relative "constants" @@ApiVersion = "v1" @@ApiVersionApps = "v1" @@ -470,6 +471,87 @@ def getContainerResourceRequestsAndLimits(metricJSON, metricCategory, metricName return metricItems end #getContainerResourceRequestAndLimits + def getContainerResourceRequestsAndLimitsAsInsightsMetrics(metricJSON, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601) + metricItems = [] + begin + clusterId = getClusterId + clusterName = getClusterName + + metricInfo = metricJSON + metricInfo["items"].each do |pod| + podNameSpace = pod["metadata"]["namespace"] + if podNameSpace.eql?("kube-system") && !pod["metadata"].key?("ownerReferences") + # The above case seems to be the only case where you have horizontal scaling of pods + # but no controller, in which case cAdvisor picks up kubernetes.io/config.hash + # instead of the actual poduid. Since this uid is not being surface into the UX + # its ok to use this. + # Use kubernetes.io/config.hash to be able to correlate with cadvisor data + if pod["metadata"]["annotations"].nil? 
+ next + else + podUid = pod["metadata"]["annotations"]["kubernetes.io/config.hash"] + end + else + podUid = pod["metadata"]["uid"] + end + + podContainers = [] + if !pod["spec"]["containers"].nil? && !pod["spec"]["containers"].empty? + podContainers = podContainers + pod["spec"]["containers"] + end + # Adding init containers to the record list as well. + if !pod["spec"]["initContainers"].nil? && !pod["spec"]["initContainers"].empty? + podContainers = podContainers + pod["spec"]["initContainers"] + end + + if (!podContainers.nil? && !podContainers.empty?) + if (!pod["spec"]["nodeName"].nil?) + nodeName = pod["spec"]["nodeName"] + else + nodeName = "" #unscheduled pod. We still want to collect limits & requests for GPU + end + podContainers.each do |container| + metricValue = nil + containerName = container["name"] + #metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z + if (!container["resources"].nil? && !container["resources"].empty? && !container["resources"][metricCategory].nil? && !container["resources"][metricCategory][metricNameToCollect].nil?) + metricValue = getMetricNumericValue(metricNameToCollect, container["resources"][metricCategory][metricNameToCollect]) + else + #No container level limit for the given metric, so default to node level limit for non-gpu metrics + if (metricNameToCollect.downcase != "nvidia.com/gpu") && (metricNameToCollect.downcase != "amd.com/gpu") + nodeMetricsHashKey = clusterId + "/" + nodeName + "_" + "allocatable" + "_" + metricNameToCollect + metricValue = @@NodeMetrics[nodeMetricsHashKey] + end + end + if (!metricValue.nil?) 
+ metricItem = {} + metricItem["CollectionTime"] = metricTime + metricItem["Computer"] = nodeName + metricItem["Name"] = metricNametoReturn + metricItem["Value"] = metricValue + metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN + metricItem["Namespace"] = Constants::INSIGHTSMETRICS_TAGS_GPU_NAMESPACE + + metricTags = {} + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID ] = clusterId + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = clusterName + metricTags[Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] = podUid + "/" + containerName + #metricTags[Constants::INSIGHTSMETRICS_TAGS_K8SNAMESPACE] = podNameSpace + + metricItem["Tags"] = metricTags + + metricItems.push(metricItem) + end + end + end + end + rescue => error + @Log.warn("getcontainerResourceRequestsAndLimitsAsInsightsMetrics failed: #{error} for metric #{metricCategory} #{metricNameToCollect}") + return metricItems + end + return metricItems + end #getContainerResourceRequestAndLimitsAsInsightsMetrics + def parseNodeLimits(metricJSON, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601) metricItems = [] begin @@ -513,6 +595,51 @@ def parseNodeLimits(metricJSON, metricCategory, metricNameToCollect, metricNamet return metricItems end #parseNodeLimits + def parseNodeLimitsAsInsightsMetrics(metricJSON, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601) + metricItems = [] + begin + metricInfo = metricJSON + clusterId = getClusterId + clusterName = getClusterName + #Since we are getting all node data at the same time and kubernetes doesnt specify a timestamp for the capacity and allocation metrics, + #if we are coming up with the time it should be same for all nodes + #metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z + metricInfo["items"].each do |node| + if (!node["status"][metricCategory].nil?) && (!node["status"][metricCategory][metricNameToCollect].nil?) 
+ + # metricCategory can be "capacity" or "allocatable" and metricNameToCollect can be "cpu" or "memory" or "amd.com/gpu" or "nvidia.com/gpu" + metricValue = getMetricNumericValue(metricNameToCollect, node["status"][metricCategory][metricNameToCollect]) + + metricItem = {} + metricItem["CollectionTime"] = metricTime + metricItem["Computer"] = node["metadata"]["name"] + metricItem["Name"] = metricNametoReturn + metricItem["Value"] = metricValue + metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN + metricItem["Namespace"] = Constants::INSIGHTSMETRICS_TAGS_GPU_NAMESPACE + + metricTags = {} + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID ] = clusterId + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = clusterName + metricTags[Constants::INSIGHTSMETRICS_TAGS_GPU_VENDOR] = metricNameToCollect + + metricItem["Tags"] = metricTags + + metricItems.push(metricItem) + #push node level metrics (except gpu ones) to a inmem hash so that we can use it looking up at container level. 
+ #Currently if container level cpu & memory limits are not defined we default to node level limits + if (metricNameToCollect.downcase != "nvidia.com/gpu") && (metricNameToCollect.downcase != "amd.com/gpu") + @@NodeMetrics[clusterId + "/" + node["metadata"]["name"] + "_" + metricCategory + "_" + metricNameToCollect] = metricValue + #@Log.info ("Node metric hash: #{@@NodeMetrics}") + end + end + end + rescue => error + @Log.warn("parseNodeLimitsAsInsightsMetrics failed: #{error} for metric #{metricCategory} #{metricNameToCollect}") + end + return metricItems + end + def getMetricNumericValue(metricName, metricVal) metricValue = metricVal.downcase begin @@ -578,6 +705,10 @@ def getMetricNumericValue(metricName, metricVal) else #assuming no units specified, it is cores that we are converting to nanocores (the below conversion will fail for other unsupported 'units') metricValue = Float(metricValue) * 1000.0 ** 3 end + when "nvidia.com/gpu" + metricValue = Float(metricValue) * 1.0 + when "amd.com/gpu" + metricValue = Float(metricValue) * 1.0 else @Log.warn("getMetricNumericValue: Unsupported metric #{metricName}. 
Returning 0 for metric value") metricValue = 0 diff --git a/source/code/plugin/constants.rb b/source/code/plugin/constants.rb new file mode 100644 index 000000000..20114ea2b --- /dev/null +++ b/source/code/plugin/constants.rb @@ -0,0 +1,15 @@ +class Constants + INSIGHTSMETRICS_TAGS_ORIGIN = "container.azm.ms" + INSIGHTSMETRICS_TAGS_CLUSTERID = "container.azm.ms/clusterId" + INSIGHTSMETRICS_TAGS_CLUSTERNAME = "container.azm.ms/clusterName" + INSIGHTSMETRICS_TAGS_GPU_VENDOR = "gpuVendor" + INSIGHTSMETRICS_TAGS_GPU_NAMESPACE = "container.azm.ms/gpu" + INSIGHTSMETRICS_TAGS_GPU_MODEL = "gpuModel" + INSIGHTSMETRICS_TAGS_GPU_ID = "gpuId" + INSIGHTSMETRICS_TAGS_CONTAINER_NAME = "containerName" + INSIGHTSMETRICS_TAGS_CONTAINER_ID = "containerId" + INSIGHTSMETRICS_TAGS_K8SNAMESPACE = "k8sNamespace" + INSIGHTSMETRICS_TAGS_CONTROLLER_NAME = "controllerName" + INSIGHTSMETRICS_TAGS_CONTROLLER_KIND = "controllerKind" + INSIGHTSMETRICS_FLUENT_TAG = "oms.api.InsightsMetrics" +end \ No newline at end of file diff --git a/source/code/plugin/in_cadvisor_perf.rb b/source/code/plugin/in_cadvisor_perf.rb index 96aa66aa1..a44365e9d 100644 --- a/source/code/plugin/in_cadvisor_perf.rb +++ b/source/code/plugin/in_cadvisor_perf.rb @@ -15,6 +15,7 @@ def initialize require_relative "CAdvisorMetricsAPIClient" require_relative "oms_common" require_relative "omslog" + require_relative "constants" end config_param :run_interval, :time, :default => 60 @@ -50,8 +51,10 @@ def enumerate() currentTime = Time.now time = currentTime.to_f batchTime = currentTime.utc.iso8601 + @@istestvar = ENV["ISTEST"] begin eventStream = MultiEventStream.new + insightsMetricsEventStream = MultiEventStream.new metricData = CAdvisorMetricsAPIClient.getMetrics(winNode: nil, metricTime: batchTime ) metricData.each do |record| record["DataType"] = "LINUX_PERF_BLOB" @@ -64,10 +67,38 @@ def enumerate() router.emit_stream(@containerhealthtag, eventStream) if eventStream router.emit_stream(@nodehealthtag, eventStream) if 
eventStream - @@istestvar = ENV["ISTEST"] + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && eventStream.count > 0) $log.info("cAdvisorPerfEmitStreamSuccess @ #{Time.now.utc.iso8601}") end + + #start GPU InsightsMetrics items + begin + containerGPUusageInsightsMetricsDataItems = [] + containerGPUusageInsightsMetricsDataItems.concat(CAdvisorMetricsAPIClient.getInsightsMetrics(winNode: nil, metricTime: batchTime)) + + + containerGPUusageInsightsMetricsDataItems.each do |insightsMetricsRecord| + wrapper = { + "DataType" => "INSIGHTS_METRICS_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [insightsMetricsRecord.each { |k, v| insightsMetricsRecord[k] = v }], + } + insightsMetricsEventStream.add(time, wrapper) if wrapper + end + + router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream + + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0) + $log.info("cAdvisorInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end + rescue => errorStr + $log.warn "Failed when processing GPU Usage metrics in_cadvisor_perf : #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + #end GPU InsightsMetrics items + rescue => errorStr $log.warn "Failed to retrieve cadvisor metric data: #{errorStr}" $log.debug_backtrace(errorStr.backtrace) diff --git a/source/code/plugin/in_kube_nodes.rb b/source/code/plugin/in_kube_nodes.rb index 706c3ad13..4242a8dba 100644 --- a/source/code/plugin/in_kube_nodes.rb +++ b/source/code/plugin/in_kube_nodes.rb @@ -31,6 +31,7 @@ def initialize require_relative "oms_common" require_relative "omslog" @NODES_CHUNK_SIZE = "400" + require_relative "constants" end config_param :run_interval, :time, :default => 60 @@ -105,6 +106,8 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) telemetrySent 
= false eventStream = MultiEventStream.new containerNodeInventoryEventStream = MultiEventStream.new + insightsMetricsEventStream = MultiEventStream.new + @@istestvar = ENV["ISTEST"] #get node inventory nodeInventory["items"].each do |items| record = {} @@ -193,6 +196,20 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) capacityInfo = items["status"]["capacity"] ApplicationInsightsUtility.sendMetricTelemetry("NodeMemory", capacityInfo["memory"], properties) + begin + if (!capacityInfo["nvidia.com/gpu"].nil?) && (!capacityInfo["nvidia.com/gpu"].empty?) + properties["nvigpus"] = capacityInfo["nvidia.com/gpu"] + end + + if (!capacityInfo["amd.com/gpu"].nil?) && (!capacityInfo["amd.com/gpu"].empty?) + properties["amdgpus"] = capacityInfo["amd.com/gpu"] + end + rescue => errorStr + $log.warn "Failed in getting GPU telemetry in_kube_nodes : #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + #telemetry about prometheus metric collections settings for replicaset if (File.file?(@@promConfigMountPath)) properties["rsPromInt"] = @@rsPromInterval @@ -213,7 +230,7 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) if telemetrySent == true @@nodeTelemetryTimeTracker = DateTime.now.to_time.to_i end - @@istestvar = ENV["ISTEST"] + if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0 && eventStream.count > 0) $log.info("kubeNodeInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") end @@ -237,6 +254,35 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) end #end router.emit_stream(@@kubeperfTag, kubePerfEventStream) if kubePerfEventStream + + #start GPU InsightsMetrics items + begin + nodeGPUInsightsMetricsDataItems = [] + nodeGPUInsightsMetricsDataItems.concat(KubernetesApiClient.parseNodeLimitsAsInsightsMetrics(nodeInventory, "allocatable", "nvidia.com/gpu", "nodeGpuAllocatable", batchTime)) + nodeGPUInsightsMetricsDataItems.concat(KubernetesApiClient.parseNodeLimitsAsInsightsMetrics(nodeInventory, "capacity", "nvidia.com/gpu", "nodeGpuCapacity", batchTime)) + + nodeGPUInsightsMetricsDataItems.concat(KubernetesApiClient.parseNodeLimitsAsInsightsMetrics(nodeInventory, "allocatable", "amd.com/gpu", "nodeGpuAllocatable", batchTime)) + nodeGPUInsightsMetricsDataItems.concat(KubernetesApiClient.parseNodeLimitsAsInsightsMetrics(nodeInventory, "capacity", "amd.com/gpu", "nodeGpuCapacity", batchTime)) + + nodeGPUInsightsMetricsDataItems.each do |insightsMetricsRecord| + wrapper = { + "DataType" => "INSIGHTS_METRICS_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [insightsMetricsRecord.each { |k, v| insightsMetricsRecord[k] = v }], + } + insightsMetricsEventStream.add(emitTime, wrapper) if wrapper + end + + router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream + if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0) + $log.info("kubeNodeInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end + rescue => errorStr + $log.warn "Failed when processing GPU metrics in_kube_nodes : #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + #end GPU InsightsMetrics items rescue => errorStr $log.warn "Failed in enumerate for KubePerf from in_kube_nodes : #{errorStr}" $log.debug_backtrace(errorStr.backtrace) diff --git a/source/code/plugin/in_kube_podinventory.rb b/source/code/plugin/in_kube_podinventory.rb index 3a78d4c05..29438d076 100644 --- a/source/code/plugin/in_kube_podinventory.rb +++ b/source/code/plugin/in_kube_podinventory.rb @@ -24,6 +24,7 @@ def initialize require_relative "ApplicationInsightsUtility" require_relative "oms_common" require_relative "omslog" + require_relative "constants" @PODS_CHUNK_SIZE = "1500" @podCount = 0 @@ -262,6 +263,7 @@ def parse_and_emit_records(podInventory, serviceList, continuationToken, batchTi emitTime = currentTime.to_f #batchTime = currentTime.utc.iso8601 eventStream = MultiEventStream.new + @@istestvar = ENV["ISTEST"] begin #begin block start # Getting windows nodes from kubeapi @@ -518,6 +520,7 @@ def parse_and_emit_records(podInventory, serviceList, continuationToken, batchTi containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(podInventory, "limits", "memory", "memoryLimitBytes", batchTime)) kubePerfEventStream = MultiEventStream.new + insightsMetricsEventStream = MultiEventStream.new containerMetricDataItems.each do |record| record["DataType"] = "LINUX_PERF_BLOB" @@ -526,6 +529,38 @@ def parse_and_emit_records(podInventory, serviceList, continuationToken, batchTi end #end router.emit_stream(@@kubeperfTag, kubePerfEventStream) if kubePerfEventStream + + begin + #start GPU InsightsMetrics items + + containerGPUInsightsMetricsDataItems = [] + 
containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(podInventory, "requests", "nvidia.com/gpu", "containerGpuRequests", batchTime)) + containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(podInventory, "limits", "nvidia.com/gpu", "containerGpuLimits", batchTime)) + + containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(podInventory, "requests", "amd.com/gpu", "containerGpuRequests", batchTime)) + containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(podInventory, "limits", "amd.com/gpu", "containerGpuLimits", batchTime)) + + containerGPUInsightsMetricsDataItems.each do |insightsMetricsRecord| + wrapper = { + "DataType" => "INSIGHTS_METRICS_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [insightsMetricsRecord.each { |k, v| insightsMetricsRecord[k] = v }], + } + insightsMetricsEventStream.add(emitTime, wrapper) if wrapper + + if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0) + $log.info("kubePodInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end + + end + + router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream + #end GPU InsightsMetrics items + rescue => errorStr + $log.warn "Failed when processing GPU metrics in_kube_podinventory : #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end rescue => errorStr $log.warn "Failed in parse_and_emit_record for KubePerf from in_kube_podinventory : #{errorStr}" $log.debug_backtrace(errorStr.backtrace) @@ -567,7 +602,7 @@ def parse_and_emit_records(podInventory, serviceList, continuationToken, batchTi #Updating value for AppInsights telemetry @podCount += podInventory["items"].length - @@istestvar = ENV["ISTEST"] + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && eventStream.count > 0) $log.info("kubePodInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") end diff --git a/source/code/plugin/in_win_cadvisor_perf.rb b/source/code/plugin/in_win_cadvisor_perf.rb index 695a686cf..38868f2f5 100644 --- a/source/code/plugin/in_win_cadvisor_perf.rb +++ b/source/code/plugin/in_win_cadvisor_perf.rb @@ -17,6 +17,7 @@ def initialize require_relative "KubernetesApiClient" require_relative "oms_common" require_relative "omslog" + require_relative "constants" end config_param :run_interval, :time, :default => 60 @@ -52,8 +53,10 @@ def enumerate() time = Time.now.to_f begin eventStream = MultiEventStream.new + insightsMetricsEventStream = MultiEventStream.new timeDifference = (DateTime.now.to_time.to_i - @@winNodeQueryTimeTracker).abs timeDifferenceInMinutes = timeDifference / 60 + @@istestvar = ENV["ISTEST"] #Resetting this cache so that it is populated with the current set of containers with every call CAdvisorMetricsAPIClient.resetWinContainerIdCache() @@ 
-78,10 +81,36 @@ def enumerate() router.emit_stream(@tag, eventStream) if eventStream router.emit_stream(@mdmtag, eventStream) if eventStream - @@istestvar = ENV["ISTEST"] + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && eventStream.count > 0) $log.info("winCAdvisorPerfEmitStreamSuccess @ #{Time.now.utc.iso8601}") end + + #start GPU InsightsMetrics items + begin + containerGPUusageInsightsMetricsDataItems = [] + containerGPUusageInsightsMetricsDataItems.concat(CAdvisorMetricsAPIClient.getInsightsMetrics(winNode: winNode, metricTime: Time.now.utc.iso8601)) + + containerGPUusageInsightsMetricsDataItems.each do |insightsMetricsRecord| + wrapper = { + "DataType" => "INSIGHTS_METRICS_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [insightsMetricsRecord.each { |k, v| insightsMetricsRecord[k] = v }], + } + insightsMetricsEventStream.add(time, wrapper) if wrapper + end + + router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0) + $log.info("winCAdvisorInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end + rescue => errorStr + $log.warn "Failed when processing GPU Usage metrics in_win_cadvisor_perf : #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + #end GPU InsightsMetrics items + end # Cleanup routine to clear deleted containers from cache