diff --git a/.github/workflows/pr-checker.yml b/.github/workflows/pr-checker.yml index ec6e623b8..ab03f260f 100644 --- a/.github/workflows/pr-checker.yml +++ b/.github/workflows/pr-checker.yml @@ -56,7 +56,7 @@ jobs: format: 'table' severity: 'CRITICAL,HIGH' vuln-type: 'os,library' - skip-dirs: 'opt/telegraf,usr/sbin/telegraf' + skip-dirs: '/usr/sbin' exit-code: '1' timeout: '5m0s' WINDOWS-build: @@ -94,4 +94,3 @@ jobs: cd ./kubernetes/windows/ && docker build . --file Dockerfile -t $env:IMAGETAG --build-arg IMAGE_TAG=$env:IMAGETAG_TELEMETRY - name: List-docker-images run: docker images --digests --all - diff --git a/.github/workflows/run_unit_tests.yml b/.github/workflows/run_unit_tests.yml index 94ac4371a..435de91e8 100644 --- a/.github/workflows/run_unit_tests.yml +++ b/.github/workflows/run_unit_tests.yml @@ -26,7 +26,7 @@ jobs: uses: actions/checkout@v2 - name: install fluent run: | - sudo gem install fluentd -v "1.12.2" --no-document + sudo gem install fluentd -v "1.14.2" --no-document sudo fluentd --setup ./fluent - name: Run unit tests run: | diff --git a/ReleaseNotes.md b/ReleaseNotes.md index ebbf172e4..efef55e43 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -11,6 +11,41 @@ additional questions or comments. 
Note : The agent version(s) below has dates (ciprod), which indicate the agent build dates (not release dates) +### 1/31/2022 - +##### Version microsoft/oms:ciprod01312022 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod01312022 (linux) +##### Version microsoft/oms:win-ciprod01312022 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod01312022 (windows) +##### Code change log +- Linux Agent + - Configurable DB name via configmap for ADX (default DB name:containerinsights) + - Default cAdvisor port to 10250 and container runtime to Containerd + - Update AgentVersion annotation in yamls (omsagent and chart) with released MDSD agent version + - Increasing windows agent CPU limits from 200m to 500m + - Ignore new disk path that comes from containerd starting with k8s version >= 1.19.x, which was adding unnecessary InsightsMetrics logs and increasing cost + - Route the AI SDK logs to log file instead of stdout + - Telemetry to collect ContainerLog Records with empty Timestamp + - FluentBit version upgrade from 1.6.8 to 1.7.8 +- Windows Agent + - Update to use FluentBit for container log collection and removed FluentD dependency for container log collection + - Telemetry to track if any of the variable fields of windows container inventory records has field size >= 64KB + - Add windows os check in in_cadvisor_perf plugin to avoid making call in MDSD in MSI auth mode + - Bug fix for placeholder_hostname in telegraf metrics + - FluentBit version upgrade from 1.4.0 to 1.7.8 +- Common + - Upgrade FluentD gem version from 1.12.2 to 1.14.2 + - Upgrade Telegraf version from 1.18.0 to 1.20.3 + - Fix for exception in node allocatable + - Telemetry to track nodeCount & containerCount +- Other changes + - Updates to Arc K8s Extension ARM Onboarding templates with GA API version + - Added ARM Templates for MSI Based Onboarding for AKS + - Conformance test updates related to sidecar container + - Troubleshooting script to detect issues 
related to Arc K8s Extension onboarding + - Remove the dependency SP for CDPX since configured to use MSI + - Linux Agent Image build improvements + - Update msys2 version to fix windows agent build + - Add explicit exit code 1 across all the PS scripts + + ### 10/13/2021 - ##### Version microsoft/oms:ciprod10132021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10132021 (linux) ##### Version microsoft/oms:win-ciprod10132021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod10132021 (windows) diff --git a/build/linux/installer/conf/azm-containers-parser.conf b/build/common/installer/conf/azm-containers-parser.conf similarity index 100% rename from build/linux/installer/conf/azm-containers-parser.conf rename to build/common/installer/conf/azm-containers-parser.conf diff --git a/build/common/installer/scripts/td-agent-bit-conf-customizer.rb b/build/common/installer/scripts/td-agent-bit-conf-customizer.rb index f29c87407..1d76007a4 100644 --- a/build/common/installer/scripts/td-agent-bit-conf-customizer.rb +++ b/build/common/installer/scripts/td-agent-bit-conf-customizer.rb @@ -3,6 +3,11 @@ @td_agent_bit_conf_path = "/etc/opt/microsoft/docker-cimprov/td-agent-bit.conf" +@os_type = ENV["OS_TYPE"] +if !@os_type.nil? && !@os_type.empty? && @os_type.strip.casecmp("windows") == 0 + @td_agent_bit_conf_path = "/etc/fluent-bit/fluent-bit.conf" +end + @default_service_interval = "15" @default_mem_buf_limit = "10" @@ -20,14 +25,14 @@ def substituteFluentBitPlaceHolders bufferMaxSize = ENV["FBIT_TAIL_BUFFER_MAX_SIZE"] memBufLimit = ENV["FBIT_TAIL_MEM_BUF_LIMIT"] - serviceInterval = (!interval.nil? && is_number?(interval) && interval.to_i > 0 ) ? interval : @default_service_interval + serviceInterval = (!interval.nil? && is_number?(interval) && interval.to_i > 0) ? interval : @default_service_interval serviceIntervalSetting = "Flush " + serviceInterval tailBufferChunkSize = (!bufferChunkSize.nil? 
&& is_number?(bufferChunkSize) && bufferChunkSize.to_i > 0) ? bufferChunkSize : nil tailBufferMaxSize = (!bufferMaxSize.nil? && is_number?(bufferMaxSize) && bufferMaxSize.to_i > 0) ? bufferMaxSize : nil - if ((!tailBufferChunkSize.nil? && tailBufferMaxSize.nil?) || (!tailBufferChunkSize.nil? && !tailBufferMaxSize.nil? && tailBufferChunkSize.to_i > tailBufferMaxSize.to_i)) + if ((!tailBufferChunkSize.nil? && tailBufferMaxSize.nil?) || (!tailBufferChunkSize.nil? && !tailBufferMaxSize.nil? && tailBufferChunkSize.to_i > tailBufferMaxSize.to_i)) puts "config:warn buffer max size must be greater or equal to chunk size" tailBufferMaxSize = tailBufferChunkSize end diff --git a/build/common/installer/scripts/tomlparser-agent-config.rb b/build/common/installer/scripts/tomlparser-agent-config.rb new file mode 100644 index 000000000..052bb5a5d --- /dev/null +++ b/build/common/installer/scripts/tomlparser-agent-config.rb @@ -0,0 +1,264 @@ +#!/usr/local/bin/ruby + +#this should be require relative in Linux and require in windows, since it is a gem install on windows +@os_type = ENV["OS_TYPE"] +if !@os_type.nil? && !@os_type.empty? 
&& @os_type.strip.casecmp("windows") == 0 + require "tomlrb" +else + require_relative "tomlrb" +end + +require_relative "ConfigParseErrorLogger" + +@configMapMountPath = "/etc/config/settings/agent-settings" +@configSchemaVersion = "" +@enable_health_model = false + +# 250 Node items (15KB per node) account to approximately 4MB +@nodesChunkSize = 250 +# 1000 pods (10KB per pod) account to approximately 10MB +@podsChunkSize = 1000 +# 4000 events (1KB per event) account to approximately 4MB +@eventsChunkSize = 4000 +# roughly each deployment is 8k +# 500 deployments account to approximately 4MB +@deploymentsChunkSize = 500 +# roughly each HPA is 3k +# 2000 HPAs account to approximately 6-7MB +@hpaChunkSize = 2000 +# stream batch sizes to avoid large file writes +# too low will consume higher disk iops +@podsEmitStreamBatchSize = 200 +@nodesEmitStreamBatchSize = 100 + +# higher the chunk size rs pod memory consumption higher and lower api latency +# similarly lower the value, helps on the memory consumption but incurrs additional round trip latency +# these needs to be tuned be based on the workload +# nodes +@nodesChunkSizeMin = 100 +@nodesChunkSizeMax = 400 +# pods +@podsChunkSizeMin = 250 +@podsChunkSizeMax = 1500 +# events +@eventsChunkSizeMin = 2000 +@eventsChunkSizeMax = 10000 +# deployments +@deploymentsChunkSizeMin = 500 +@deploymentsChunkSizeMax = 1000 +# hpa +@hpaChunkSizeMin = 500 +@hpaChunkSizeMax = 2000 + +# emit stream sizes to prevent lower values which costs disk i/o +# max will be upto the chunk size +@podsEmitStreamBatchSizeMin = 50 +@nodesEmitStreamBatchSizeMin = 50 + +# configmap settings related fbit config +@fbitFlushIntervalSecs = 0 +@fbitTailBufferChunkSizeMBs = 0 +@fbitTailBufferMaxSizeMBs = 0 +@fbitTailMemBufLimitMBs = 0 + + +def is_number?(value) + true if Integer(value) rescue false +end + +# Use parser to parse the configmap toml file to a ruby structure +def parseConfigMap + begin + # Check to see if config map is created + if 
(File.file?(@configMapMountPath)) + puts "config::configmap container-azm-ms-agentconfig for agent settings mounted, parsing values" + parsedConfig = Tomlrb.load_file(@configMapMountPath, symbolize_keys: true) + puts "config::Successfully parsed mounted config map" + return parsedConfig + else + puts "config::configmap container-azm-ms-agentconfig for agent settings not mounted, using defaults" + return nil + end + rescue => errorStr + ConfigParseErrorLogger.logError("Exception while parsing config map for agent settings : #{errorStr}, using defaults, please check config map for errors") + return nil + end +end + +# Use the ruby structure created after config parsing to set the right values to be used as environment variables +def populateSettingValuesFromConfigMap(parsedConfig) + begin + if !parsedConfig.nil? && !parsedConfig[:agent_settings].nil? + if !parsedConfig[:agent_settings][:health_model].nil? && !parsedConfig[:agent_settings][:health_model][:enabled].nil? + @enable_health_model = parsedConfig[:agent_settings][:health_model][:enabled] + puts "enable_health_model = #{@enable_health_model}" + end + chunk_config = parsedConfig[:agent_settings][:chunk_config] + if !chunk_config.nil? + nodesChunkSize = chunk_config[:NODES_CHUNK_SIZE] + if !nodesChunkSize.nil? && is_number?(nodesChunkSize) && (@nodesChunkSizeMin..@nodesChunkSizeMax) === nodesChunkSize.to_i + @nodesChunkSize = nodesChunkSize.to_i + puts "Using config map value: NODES_CHUNK_SIZE = #{@nodesChunkSize}" + end + + podsChunkSize = chunk_config[:PODS_CHUNK_SIZE] + if !podsChunkSize.nil? && is_number?(podsChunkSize) && (@podsChunkSizeMin..@podsChunkSizeMax) === podsChunkSize.to_i + @podsChunkSize = podsChunkSize.to_i + puts "Using config map value: PODS_CHUNK_SIZE = #{@podsChunkSize}" + end + + eventsChunkSize = chunk_config[:EVENTS_CHUNK_SIZE] + if !eventsChunkSize.nil? 
&& is_number?(eventsChunkSize) && (@eventsChunkSizeMin..@eventsChunkSizeMax) === eventsChunkSize.to_i + @eventsChunkSize = eventsChunkSize.to_i + puts "Using config map value: EVENTS_CHUNK_SIZE = #{@eventsChunkSize}" + end + + deploymentsChunkSize = chunk_config[:DEPLOYMENTS_CHUNK_SIZE] + if !deploymentsChunkSize.nil? && is_number?(deploymentsChunkSize) && (@deploymentsChunkSizeMin..@deploymentsChunkSizeMax) === deploymentsChunkSize.to_i + @deploymentsChunkSize = deploymentsChunkSize.to_i + puts "Using config map value: DEPLOYMENTS_CHUNK_SIZE = #{@deploymentsChunkSize}" + end + + hpaChunkSize = chunk_config[:HPA_CHUNK_SIZE] + if !hpaChunkSize.nil? && is_number?(hpaChunkSize) && (@hpaChunkSizeMin..@hpaChunkSizeMax) === hpaChunkSize.to_i + @hpaChunkSize = hpaChunkSize.to_i + puts "Using config map value: HPA_CHUNK_SIZE = #{@hpaChunkSize}" + end + + podsEmitStreamBatchSize = chunk_config[:PODS_EMIT_STREAM_BATCH_SIZE] + if !podsEmitStreamBatchSize.nil? && is_number?(podsEmitStreamBatchSize) && + podsEmitStreamBatchSize.to_i <= @podsChunkSize && podsEmitStreamBatchSize.to_i >= @podsEmitStreamBatchSizeMin + @podsEmitStreamBatchSize = podsEmitStreamBatchSize.to_i + puts "Using config map value: PODS_EMIT_STREAM_BATCH_SIZE = #{@podsEmitStreamBatchSize}" + end + nodesEmitStreamBatchSize = chunk_config[:NODES_EMIT_STREAM_BATCH_SIZE] + if !nodesEmitStreamBatchSize.nil? && is_number?(nodesEmitStreamBatchSize) && + nodesEmitStreamBatchSize.to_i <= @nodesChunkSize && nodesEmitStreamBatchSize.to_i >= @nodesEmitStreamBatchSizeMin + @nodesEmitStreamBatchSize = nodesEmitStreamBatchSize.to_i + puts "Using config map value: NODES_EMIT_STREAM_BATCH_SIZE = #{@nodesEmitStreamBatchSize}" + end + end + # fbit config settings + fbit_config = parsedConfig[:agent_settings][:fbit_config] + if !fbit_config.nil? + fbitFlushIntervalSecs = fbit_config[:log_flush_interval_secs] + if !fbitFlushIntervalSecs.nil? 
&& is_number?(fbitFlushIntervalSecs) && fbitFlushIntervalSecs.to_i > 0 + @fbitFlushIntervalSecs = fbitFlushIntervalSecs.to_i + puts "Using config map value: log_flush_interval_secs = #{@fbitFlushIntervalSecs}" + end + + fbitTailBufferChunkSizeMBs = fbit_config[:tail_buf_chunksize_megabytes] + if !fbitTailBufferChunkSizeMBs.nil? && is_number?(fbitTailBufferChunkSizeMBs) && fbitTailBufferChunkSizeMBs.to_i > 0 + @fbitTailBufferChunkSizeMBs = fbitTailBufferChunkSizeMBs.to_i + puts "Using config map value: tail_buf_chunksize_megabytes = #{@fbitTailBufferChunkSizeMBs}" + end + + fbitTailBufferMaxSizeMBs = fbit_config[:tail_buf_maxsize_megabytes] + if !fbitTailBufferMaxSizeMBs.nil? && is_number?(fbitTailBufferMaxSizeMBs) && fbitTailBufferMaxSizeMBs.to_i > 0 + if fbitTailBufferMaxSizeMBs.to_i >= @fbitTailBufferChunkSizeMBs + @fbitTailBufferMaxSizeMBs = fbitTailBufferMaxSizeMBs.to_i + puts "Using config map value: tail_buf_maxsize_megabytes = #{@fbitTailBufferMaxSizeMBs}" + else + # tail_buf_maxsize_megabytes has to be greater or equal to tail_buf_chunksize_megabytes + @fbitTailBufferMaxSizeMBs = @fbitTailBufferChunkSizeMBs + puts "config::warn: tail_buf_maxsize_megabytes must be greater or equal to value of tail_buf_chunksize_megabytes. Using tail_buf_maxsize_megabytes = #{@fbitTailBufferMaxSizeMBs} since provided config value not valid" + end + end + # in scenario - tail_buf_chunksize_megabytes provided but not tail_buf_maxsize_megabytes to prevent fbit crash + if @fbitTailBufferChunkSizeMBs > 0 && @fbitTailBufferMaxSizeMBs == 0 + @fbitTailBufferMaxSizeMBs = @fbitTailBufferChunkSizeMBs + puts "config::warn: since tail_buf_maxsize_megabytes not provided hence using tail_buf_maxsize_megabytes=#{@fbitTailBufferMaxSizeMBs} which is same as the value of tail_buf_chunksize_megabytes" + end + + fbitTailMemBufLimitMBs = fbit_config[:tail_mem_buf_limit_megabytes] + if !fbitTailMemBufLimitMBs.nil? 
&& is_number?(fbitTailMemBufLimitMBs) && fbitTailMemBufLimitMBs.to_i > 0 + @fbitTailMemBufLimitMBs = fbitTailMemBufLimitMBs.to_i + puts "Using config map value: tail_mem_buf_limit_megabytes = #{@fbitTailMemBufLimitMBs}" + end + end + end + rescue => errorStr + puts "config::error:Exception while reading config settings for agent configuration setting - #{errorStr}, using defaults" + @enable_health_model = false + end +end + +@configSchemaVersion = ENV["AZMON_AGENT_CFG_SCHEMA_VERSION"] +puts "****************Start Config Processing********************" +if !@configSchemaVersion.nil? && !@configSchemaVersion.empty? && @configSchemaVersion.strip.casecmp("v1") == 0 #note v1 is the only supported schema version , so hardcoding it + configMapSettings = parseConfigMap + if !configMapSettings.nil? + populateSettingValuesFromConfigMap(configMapSettings) + end +else + if (File.file?(@configMapMountPath)) + ConfigParseErrorLogger.logError("config::unsupported/missing config schema version - '#{@configSchemaVersion}' , using defaults, please use supported schema version") + end + @enable_health_model = false +end + +# Write the settings to file, so that they can be set as environment variables +file = File.open("agent_config_env_var", "w") + +if !file.nil? 
+ file.write("export AZMON_CLUSTER_ENABLE_HEALTH_MODEL=#{@enable_health_model}\n") + file.write("export NODES_CHUNK_SIZE=#{@nodesChunkSize}\n") + file.write("export PODS_CHUNK_SIZE=#{@podsChunkSize}\n") + file.write("export EVENTS_CHUNK_SIZE=#{@eventsChunkSize}\n") + file.write("export DEPLOYMENTS_CHUNK_SIZE=#{@deploymentsChunkSize}\n") + file.write("export HPA_CHUNK_SIZE=#{@hpaChunkSize}\n") + file.write("export PODS_EMIT_STREAM_BATCH_SIZE=#{@podsEmitStreamBatchSize}\n") + file.write("export NODES_EMIT_STREAM_BATCH_SIZE=#{@nodesEmitStreamBatchSize}\n") + # fbit settings + if @fbitFlushIntervalSecs > 0 + file.write("export FBIT_SERVICE_FLUSH_INTERVAL=#{@fbitFlushIntervalSecs}\n") + end + if @fbitTailBufferChunkSizeMBs > 0 + file.write("export FBIT_TAIL_BUFFER_CHUNK_SIZE=#{@fbitTailBufferChunkSizeMBs}\n") + end + if @fbitTailBufferMaxSizeMBs > 0 + file.write("export FBIT_TAIL_BUFFER_MAX_SIZE=#{@fbitTailBufferMaxSizeMBs}\n") + end + if @fbitTailMemBufLimitMBs > 0 + file.write("export FBIT_TAIL_MEM_BUF_LIMIT=#{@fbitTailMemBufLimitMBs}\n") + end + # Close file after writing all environment variables + file.close +else + puts "Exception while opening file for writing config environment variables" + puts "****************End Config Processing********************" +end + +def get_command_windows(env_variable_name, env_variable_value) + return "[System.Environment]::SetEnvironmentVariable(\"#{env_variable_name}\", \"#{env_variable_value}\", \"Process\")" + "\n" + "[System.Environment]::SetEnvironmentVariable(\"#{env_variable_name}\", \"#{env_variable_value}\", \"Machine\")" + "\n" +end + +if !@os_type.nil? && !@os_type.empty? && @os_type.strip.casecmp("windows") == 0 + # Write the settings to file, so that they can be set as environment variables + file = File.open("setagentenv.ps1", "w") + + if !file.nil? 
+ if @fbitFlushIntervalSecs > 0 + commands = get_command_windows('FBIT_SERVICE_FLUSH_INTERVAL', @fbitFlushIntervalSecs) + file.write(commands) + end + if @fbitTailBufferChunkSizeMBs > 0 + commands = get_command_windows('FBIT_TAIL_BUFFER_CHUNK_SIZE', @fbitTailBufferChunkSizeMBs) + file.write(commands) + end + if @fbitTailBufferMaxSizeMBs > 0 + commands = get_command_windows('FBIT_TAIL_BUFFER_MAX_SIZE', @fbitTailBufferMaxSizeMBs) + file.write(commands) + end + if @fbitTailMemBufLimitMBs > 0 + commands = get_command_windows('FBIT_TAIL_MEM_BUF_LIMIT', @fbitTailMemBufLimitMBs) + file.write(commands) + end + # Close file after writing all environment variables + file.close + puts "****************End Config Processing********************" + else + puts "Exception while opening file for writing config environment variables for WINDOWS LOG" + puts "****************End Config Processing********************" + end +end \ No newline at end of file diff --git a/build/common/installer/scripts/tomlparser.rb b/build/common/installer/scripts/tomlparser.rb index b173ecfe3..b38a65984 100644 --- a/build/common/installer/scripts/tomlparser.rb +++ b/build/common/installer/scripts/tomlparser.rb @@ -26,8 +26,11 @@ @containerLogSchemaVersion = "" @collectAllKubeEvents = false @containerLogsRoute = "v2" # default for linux +@adxDatabaseName = "containerinsights" # default for all configurations if !@os_type.nil? && !@os_type.empty? 
&& @os_type.strip.casecmp("windows") == 0 @containerLogsRoute = "v1" # default is v1 for windows until windows agent integrates windows ama + # This path format is necessary for fluent-bit in windows + @logTailPath = "C:\\var\\log\\containers\\*.log" end # Use parser to parse the configmap toml file to a ruby structure def parseConfigMap @@ -141,8 +144,8 @@ def populateSettingValuesFromConfigMap(parsedConfig) ConfigParseErrorLogger.logError("Exception while reading config map settings for cluster level container log enrichment - #{errorStr}, using defaults, please check config map for errors") end - #Get container log schema version setting - begin + #Get container log schema version setting + begin if !parsedConfig[:log_collection_settings][:schema].nil? && !parsedConfig[:log_collection_settings][:schema][:containerlog_schema_version].nil? @containerLogSchemaVersion = parsedConfig[:log_collection_settings][:schema][:containerlog_schema_version] puts "config::Using config map setting for container log schema version" @@ -165,16 +168,31 @@ def populateSettingValuesFromConfigMap(parsedConfig) begin if !parsedConfig[:log_collection_settings][:route_container_logs].nil? && !parsedConfig[:log_collection_settings][:route_container_logs][:version].nil? if !parsedConfig[:log_collection_settings][:route_container_logs][:version].empty? 
- @containerLogsRoute = parsedConfig[:log_collection_settings][:route_container_logs][:version] - puts "config::Using config map setting for container logs route: #{@containerLogsRoute}" - else - puts "config::Ignoring config map settings and using default value since provided container logs route value is empty" - end + @containerLogsRoute = parsedConfig[:log_collection_settings][:route_container_logs][:version] + puts "config::Using config map setting for container logs route: #{@containerLogsRoute}" + else + puts "config::Ignoring config map settings and using default value since provided container logs route value is empty" + end end rescue => errorStr ConfigParseErrorLogger.logError("Exception while reading config map settings for container logs route - #{errorStr}, using defaults, please check config map for errors") end + #Get ADX database name setting + begin + if !parsedConfig[:log_collection_settings][:adx_database].nil? && !parsedConfig[:log_collection_settings][:adx_database][:name].nil? + if !parsedConfig[:log_collection_settings][:adx_database][:name].empty? 
+ @adxDatabaseName = parsedConfig[:log_collection_settings][:adx_database][:name] + puts "config::Using config map setting for ADX database name : #{@adxDatabaseName}" + else + puts "config::Ignoring config map settings and using default value '#{@adxDatabaseName}' since provided adx database name value is empty" + end + else + puts "config::No ADX database name set, using default value : #{@adxDatabaseName}" + end + rescue => errorStr + ConfigParseErrorLogger.logError("Exception while reading config map settings for adx database name - #{errorStr}, using default #{@adxDatabaseName}, please check config map for errors") + end end end @@ -218,6 +236,7 @@ def populateSettingValuesFromConfigMap(parsedConfig) file.write("export AZMON_CLUSTER_COLLECT_ALL_KUBE_EVENTS=#{@collectAllKubeEvents}\n") file.write("export AZMON_CONTAINER_LOGS_ROUTE=#{@containerLogsRoute}\n") file.write("export AZMON_CONTAINER_LOG_SCHEMA_VERSION=#{@containerLogSchemaVersion}\n") + file.write("export AZMON_ADX_DATABASE_NAME=#{@adxDatabaseName}\n") # Close file after writing all environment variables file.close puts "Both stdout & stderr log collection are turned off for namespaces: '#{@excludePath}' " @@ -227,7 +246,6 @@ def populateSettingValuesFromConfigMap(parsedConfig) puts "****************End Config Processing********************" end - =begin This section generates the file that will set the environment variables for windows. This script will be called by the main.ps1 script which is the ENTRYPOINT script for the windows aks log container @@ -242,29 +260,31 @@ def get_command_windows(env_variable_name, env_variable_value) file = File.open("setenv.ps1", "w") if !file.nil? 
- commands = get_command_windows('AZMON_COLLECT_STDOUT_LOGS', @collectStdoutLogs) + commands = get_command_windows("AZMON_COLLECT_STDOUT_LOGS", @collectStdoutLogs) file.write(commands) - commands = get_command_windows('AZMON_LOG_TAIL_PATH', @logTailPath) + commands = get_command_windows("AZMON_LOG_TAIL_PATH", @logTailPath) file.write(commands) - commands = get_command_windows('AZMON_LOG_EXCLUSION_REGEX_PATTERN', @logExclusionRegexPattern) + commands = get_command_windows("AZMON_LOG_EXCLUSION_REGEX_PATTERN", @logExclusionRegexPattern) file.write(commands) - commands = get_command_windows('AZMON_STDOUT_EXCLUDED_NAMESPACES', @stdoutExcludeNamespaces) + commands = get_command_windows("AZMON_STDOUT_EXCLUDED_NAMESPACES", @stdoutExcludeNamespaces) file.write(commands) - commands = get_command_windows('AZMON_COLLECT_STDERR_LOGS', @collectStderrLogs) + commands = get_command_windows("AZMON_COLLECT_STDERR_LOGS", @collectStderrLogs) file.write(commands) - commands = get_command_windows('AZMON_STDERR_EXCLUDED_NAMESPACES', @stderrExcludeNamespaces) + commands = get_command_windows("AZMON_STDERR_EXCLUDED_NAMESPACES", @stderrExcludeNamespaces) file.write(commands) - commands = get_command_windows('AZMON_CLUSTER_COLLECT_ENV_VAR', @collectClusterEnvVariables) + commands = get_command_windows("AZMON_CLUSTER_COLLECT_ENV_VAR", @collectClusterEnvVariables) file.write(commands) - commands = get_command_windows('AZMON_CLUSTER_LOG_TAIL_EXCLUDE_PATH', @excludePath) + commands = get_command_windows("AZMON_CLUSTER_LOG_TAIL_EXCLUDE_PATH", @excludePath) file.write(commands) - commands = get_command_windows('AZMON_CLUSTER_CONTAINER_LOG_ENRICH', @enrichContainerLogs) + commands = get_command_windows("AZMON_CLUSTER_CONTAINER_LOG_ENRICH", @enrichContainerLogs) file.write(commands) - commands = get_command_windows('AZMON_CLUSTER_COLLECT_ALL_KUBE_EVENTS', @collectAllKubeEvents) + commands = get_command_windows("AZMON_CLUSTER_COLLECT_ALL_KUBE_EVENTS", @collectAllKubeEvents) file.write(commands) - 
commands = get_command_windows('AZMON_CONTAINER_LOGS_ROUTE', @containerLogsRoute) + commands = get_command_windows("AZMON_CONTAINER_LOGS_ROUTE", @containerLogsRoute) file.write(commands) - commands = get_command_windows('AZMON_CONTAINER_LOG_SCHEMA_VERSION', @containerLogSchemaVersion) + commands = get_command_windows("AZMON_CONTAINER_LOG_SCHEMA_VERSION", @containerLogSchemaVersion) + file.write(commands) + commands = get_command_windows("AZMON_ADX_DATABASE_NAME", @adxDatabaseName) file.write(commands) # Close file after writing all environment variables @@ -275,4 +295,4 @@ def get_command_windows(env_variable_name, env_variable_value) puts "Exception while opening file for writing config environment variables for WINDOWS LOG" puts "****************End Config Processing********************" end -end \ No newline at end of file +end diff --git a/build/linux/installer/conf/telegraf-prom-side-car.conf b/build/linux/installer/conf/telegraf-prom-side-car.conf index f5128d720..a94150fad 100644 --- a/build/linux/installer/conf/telegraf-prom-side-car.conf +++ b/build/linux/installer/conf/telegraf-prom-side-car.conf @@ -111,6 +111,26 @@ data_format = "json" namedrop = ["agent_telemetry", "file"] +# [[outputs.application_insights]] +# ## Instrumentation key of the Application Insights resource. +# instrumentation_key = "$TELEMETRY_APPLICATIONINSIGHTS_KEY" + +# ## Timeout for closing (default: 5s). +# # timeout = "5s" + +# ## Enable additional diagnostic logging. +# enable_diagnostic_logging = false + +# ## Context Tag Sources add Application Insights context tags to a tag value. 
+# ## +# ## For list of allowed context tag keys see: +# ## https://github.com/Microsoft/ApplicationInsights-Go/blob/master/appinsights/contracts/contexttagkeys.go +# # [outputs.application_insights.context_tag_sources] +# # "ai.cloud.role" = "kubernetes_container_name" +# # "ai.cloud.roleInstance" = "kubernetes_pod_name" +# namepass = ["t.azm.ms/agent_telemetry"] + #tagdrop = ["nodeName"] + ############################################################################### # PROCESSOR PLUGINS # ############################################################################### @@ -119,6 +139,20 @@ [processors.converter.fields] float = ["*"] +############################################################################### +# AGGREGATOR PLUGINS # +############################################################################### +# [[aggregators.quantile]] +# period = "30m" +# drop_original = true +# quantiles = [0.95] +# algorithm = "t-digest" +# compression = 100.0 +# namepass = ["agent_telemetry"] + +############################################################################### +# INPUT PLUGINS # +############################################################################### # Dummy plugin to test out toml parsing happens properly [[inputs.file]] interval = "24h" @@ -166,3 +200,20 @@ $AZMON_TELEGRAF_CUSTOM_PROM_PLUGINS_WITH_NAMESPACE_FILTER ## OSM Prometheus configuration $AZMON_TELEGRAF_OSM_PROM_PLUGINS + +# [[inputs.procstat]] +# name_prefix="t.azm.ms/" +# exe = "mdsd" +# interval = "60s" +# pid_finder = "native" +# pid_tag = true +# name_override = "agent_telemetry" +# fieldpass = ["cpu_usage", "memory_rss"] +# [inputs.procstat.tags] +# Computer = "$NODE_NAME" +# AgentVersion = "$AGENT_VERSION" +# ControllerType = "$CONTROLLER_TYPE" +# AKS_RESOURCE_ID = "$TELEMETRY_AKS_RESOURCE_ID" +# ACSResourceName = "$TELEMETRY_ACS_RESOURCE_NAME" +# Region = "$TELEMETRY_AKS_REGION" +# ContainerType = "$CONTAINER_TYPE" diff --git a/build/linux/installer/conf/telegraf-rs.conf 
b/build/linux/installer/conf/telegraf-rs.conf index 038b40bc2..72fc25451 100644 --- a/build/linux/installer/conf/telegraf-rs.conf +++ b/build/linux/installer/conf/telegraf-rs.conf @@ -124,6 +124,26 @@ namedrop = ["agent_telemetry", "file"] #tagdrop = ["AgentVersion","AKS_RESOURCE_ID", "ACS_RESOURCE_NAME", "Region","ClusterName","ClusterType", "Computer", "ControllerType"] +# [[outputs.application_insights]] +# ## Instrumentation key of the Application Insights resource. +# instrumentation_key = "$TELEMETRY_APPLICATIONINSIGHTS_KEY" + +# ## Timeout for closing (default: 5s). +# # timeout = "5s" + +# ## Enable additional diagnostic logging. +# enable_diagnostic_logging = false + +# ## Context Tag Sources add Application Insights context tags to a tag value. +# ## +# ## For list of allowed context tag keys see: +# ## https://github.com/Microsoft/ApplicationInsights-Go/blob/master/appinsights/contracts/contexttagkeys.go +# # [outputs.application_insights.context_tag_sources] +# # "ai.cloud.role" = "kubernetes_container_name" +# # "ai.cloud.roleInstance" = "kubernetes_pod_name" +# namepass = ["t.azm.ms/agent_telemetry"] + #tagdrop = ["nodeName"] + ############################################################################### # PROCESSOR PLUGINS # ############################################################################### @@ -293,6 +313,13 @@ ############################################################################### # AGGREGATOR PLUGINS # ############################################################################### +# [[aggregators.quantile]] +# period = "30m" +# drop_original = true +# quantiles = [0.95] +# algorithm = "t-digest" +# compression = 100.0 +# namepass = ["agent_telemetry"] # # Keep the aggregate basicstats of each metric passing through. 
# [[aggregators.basicstats]] @@ -648,3 +675,18 @@ $AZMON_TELEGRAF_OSM_PROM_PLUGINS #[inputs.prometheus.tagpass] # operation_type = ["create_container", "remove_container", "pull_image"] +# [[inputs.procstat]] +# name_prefix="t.azm.ms/" +# exe = "mdsd" +# interval = "60s" +# pid_finder = "native" +# pid_tag = true +# name_override = "agent_telemetry" +# fieldpass = ["cpu_usage", "memory_rss"] +# [inputs.procstat.tags] +# Computer = "$NODE_NAME" +# AgentVersion = "$AGENT_VERSION" +# ControllerType = "$CONTROLLER_TYPE" +# AKS_RESOURCE_ID = "$TELEMETRY_AKS_RESOURCE_ID" +# ACSResourceName = "$TELEMETRY_ACS_RESOURCE_NAME" +# Region = "$TELEMETRY_AKS_REGION" diff --git a/build/linux/installer/conf/telegraf.conf b/build/linux/installer/conf/telegraf.conf index 0e4824e70..9f213e3e8 100644 --- a/build/linux/installer/conf/telegraf.conf +++ b/build/linux/installer/conf/telegraf.conf @@ -158,6 +158,26 @@ namepass = ["container.azm.ms/disk"] #fieldpass = ["used_percent"] +# [[outputs.application_insights]] +# ## Instrumentation key of the Application Insights resource. +# instrumentation_key = "$TELEMETRY_APPLICATIONINSIGHTS_KEY" + +# ## Timeout for closing (default: 5s). +# # timeout = "5s" + +# ## Enable additional diagnostic logging. +# enable_diagnostic_logging = false + + ## Context Tag Sources add Application Insights context tags to a tag value. 
+ ## + ## For list of allowed context tag keys see: + ## https://github.com/Microsoft/ApplicationInsights-Go/blob/master/appinsights/contracts/contexttagkeys.go + # [outputs.application_insights.context_tag_sources] + # "ai.cloud.role" = "kubernetes_container_name" + # "ai.cloud.roleInstance" = "kubernetes_pod_name" + # namepass = ["agent_telemetry"] + #tagdrop = ["nodeName"] + ############################################################################### # PROCESSOR PLUGINS # ############################################################################### @@ -328,7 +348,13 @@ ############################################################################### # AGGREGATOR PLUGINS # ############################################################################### - +# [[aggregators.quantile]] +# period = "30m" +# drop_original = true +# quantiles = [0.95] +# algorithm = "t-digest" +# compression = 100.0 +# namepass = ["t.azm.ms/agent_telemetry"] # # Keep the aggregate basicstats of each metric passing through. # [[aggregators.basicstats]] # ## General Aggregator Arguments: @@ -407,7 +433,7 @@ # Dummy plugin to test out toml parsing happens properly [[inputs.file]] - interval = "24h" + interval = "24h" files = ["test.json"] data_format = "json" @@ -425,7 +451,7 @@ # Below due to Bug - https://github.com/influxdata/telegraf/issues/5615 # ORDER matters here!! 
- i.e the below should be the LAST modifier [inputs.disk.tagdrop] - path = ["/var/lib/kubelet*", "/dev/termination-log", "/var/log", "/etc/hosts", "/etc/resolv.conf", "/etc/hostname", "/etc/kubernetes/host", "/var/lib/docker/containers", "/etc/config/settings"] + path = ["/var/lib/kubelet*", "/dev/termination-log", "/var/log", "/etc/hosts", "/etc/resolv.conf", "/etc/hostname", "/etc/kubernetes/host", "/var/lib/docker/containers", "/etc/config/settings", "/run/host/containerd/io.containerd.runtime.v2.task/k8s.io/*"] # Read metrics about memory usage @@ -550,14 +576,14 @@ #fieldpass = ["numContainers", "numContainersRunning", "numContainersStopped", "numContainersPaused", "numContainerImages"] # taginclude = ["nodeName"] -#[[inputs.procstat]] -# #name_prefix="t.azm.ms/" +# [[inputs.procstat]] +# name_prefix="t.azm.ms/" # exe = "mdsd" -# interval = "10s" +# interval = "60s" # pid_finder = "native" # pid_tag = true # name_override = "agent_telemetry" -# fieldpass = ["cpu_usage", "memory_rss", "memory_swap", "memory_vms", "memory_stack"] +# fieldpass = ["cpu_usage", "memory_rss"] # [inputs.procstat.tags] # Computer = "$NODE_NAME" # AgentVersion = "$AGENT_VERSION" diff --git a/build/linux/installer/datafiles/base_container.data b/build/linux/installer/datafiles/base_container.data index 4ed413028..9fc7ce08f 100644 --- a/build/linux/installer/datafiles/base_container.data +++ b/build/linux/installer/datafiles/base_container.data @@ -34,7 +34,7 @@ MAINTAINER: 'Microsoft Corporation' /etc/opt/microsoft/docker-cimprov/td-agent-bit.conf; build/linux/installer/conf/td-agent-bit.conf; 644; root; root /etc/opt/microsoft/docker-cimprov/td-agent-bit-prom-side-car.conf; build/linux/installer/conf/td-agent-bit-prom-side-car.conf; 644; root; root /etc/opt/microsoft/docker-cimprov/td-agent-bit-rs.conf; build/linux/installer/conf/td-agent-bit-rs.conf; 644; root; root -/etc/opt/microsoft/docker-cimprov/azm-containers-parser.conf; build/linux/installer/conf/azm-containers-parser.conf; 
644; root; root +/etc/opt/microsoft/docker-cimprov/azm-containers-parser.conf; build/common/installer/conf/azm-containers-parser.conf; 644; root; root /etc/opt/microsoft/docker-cimprov/out_oms.conf; build/linux/installer/conf/out_oms.conf; 644; root; root /etc/opt/microsoft/docker-cimprov/test.json; build/linux/installer/conf/test.json; 644; root; root /etc/opt/microsoft/docker-cimprov/telegraf.conf; build/linux/installer/conf/telegraf.conf; 644; root; root @@ -48,7 +48,7 @@ MAINTAINER: 'Microsoft Corporation' /opt/tomlparser-metric-collection-config.rb; build/linux/installer/scripts/tomlparser-metric-collection-config.rb; 755; root; root -/opt/tomlparser-agent-config.rb; build/linux/installer/scripts/tomlparser-agent-config.rb; 755; root; root +/opt/tomlparser-agent-config.rb; build/common/installer/scripts/tomlparser-agent-config.rb; 755; root; root /opt/tomlparser.rb; build/common/installer/scripts/tomlparser.rb; 755; root; root /opt/td-agent-bit-conf-customizer.rb; build/common/installer/scripts/td-agent-bit-conf-customizer.rb; 755; root; root /opt/ConfigParseErrorLogger.rb; build/common/installer/scripts/ConfigParseErrorLogger.rb; 755; root; root @@ -286,6 +286,8 @@ chmod 666 /var/opt/microsoft/docker-cimprov/log/fluent_forward_failed.log touch /var/opt/microsoft/docker-cimprov/log/arc_k8s_cluster_identity.log chmod 666 /var/opt/microsoft/docker-cimprov/log/arc_k8s_cluster_identity.log +touch /var/opt/microsoft/docker-cimprov/log/appinsights_error.log +chmod 666 /var/opt/microsoft/docker-cimprov/log/appinsights_error.log touch /var/opt/microsoft/docker-cimprov/log/fluentd.log chmod 666 /var/opt/microsoft/docker-cimprov/log/fluentd.log diff --git a/build/windows/Makefile.ps1 b/build/windows/Makefile.ps1 index 9f3c438b0..52abbb071 100644 --- a/build/windows/Makefile.ps1 +++ b/build/windows/Makefile.ps1 @@ -13,21 +13,21 @@ Write-Host("current script dir : " + $currentdir + " ") if ($false -eq (Test-Path -Path $currentdir)) { Write-Host("Invalid current dir : " + 
$currentdir + " ") -ForegroundColor Red - exit + exit 1 } $builddir = Split-Path -Path $currentdir Write-Host("builddir dir : " + $builddir + " ") if ($false -eq (Test-Path -Path $builddir)) { Write-Host("Invalid build dir : " + $builddir + " ") -ForegroundColor Red - exit + exit 1 } $versionFilePath = Join-Path -Path $builddir -child "version" Write-Host("versionFilePath : " + $versionFilePath + " ") if ($false -eq (Test-Path -Path $versionFilePath)) { Write-Host("Version file path incorrect or doesnt exist : " + $versionFilePath + " ") -ForegroundColor Red - exit + exit 1 } # read the version info @@ -36,7 +36,7 @@ foreach($line in Get-Content -Path $versionFilePath) { $parts = $line.split("=") if ($parts.length -lt 2 ) { Write-Host("Invalid content in version file : " + $versionFilePath + " ") -ForegroundColor Red - exit + exit 1 } switch ($parts[0]) { "CONTAINER_BUILDVERSION_MAJOR" { $BuildVersionMajor = $parts[1] } @@ -57,7 +57,7 @@ if ([string]::IsNullOrEmpty($BuildVersionMajor) -or [string]::IsNullOrEmpty($BuildVersionDate) -or [string]::IsNullOrEmpty($BuildVersionStatus)) { Write-Host("Expected version info doesnt exist in this version file : " + $versionFilePath + " ") -ForegroundColor Red - exit + exit 1 } # build version format will be [major].[minior].[patch]-[revision] $buildVersionString = $BuildVersionMajor + "." + $BuildVersionMinor + "." 
+ $BuildVersionPatch + "-" + $BuildVersionBuildNR @@ -68,7 +68,7 @@ $certsrcdir = Join-Path -Path $builddir -ChildPath "windows\installer\certificat Write-Host("certsrc dir : " + $certsrcdir + " ") if ($false -eq (Test-Path -Path $certsrcdir)) { Write-Host("Invalid certificate generator source dir : " + $certsrcdir + " ") -ForegroundColor Red - exit + exit 1 } Write-Host("set the cerificate generator source code directory : " + $certsrcdir + " ...") Set-Location -Path $certsrcdir @@ -100,13 +100,13 @@ Write-Host("Successfully published certificate generator code binaries") -Foregr $certreleasebinpath = Join-Path -PATH $certsrcdir -ChildPath "bin\Release\$dotnetcoreframework\win10-x64\publish\*.*" if ($false -eq (Test-Path -Path $certreleasebinpath)) { Write-Host("certificate release bin path doesnt exist : " + $certreleasebinpath + " ") -ForegroundColor Red - exit + exit 1 } $rootdir = Split-Path -Path $builddir if ($false -eq (Test-Path -Path $rootdir)) { Write-Host("Invalid docker provider root source dir : " + $rootdir + " ") -ForegroundColor Red - exit + exit 1 } $publishdir = Join-Path -Path $rootdir -ChildPath "kubernetes\windows\omsagentwindows" @@ -128,7 +128,7 @@ $outomsgoplugindir = Join-Path -Path $rootdir -ChildPath "source\plugins\go\src" Write-Host("Building Out_OMS go plugin code...") if ($false -eq (Test-Path -Path $outomsgoplugindir)) { Write-Host("Invalid Out oms go plugin code dir : " + $outomsgoplugindir + " ") -ForegroundColor Red - exit + exit 1 } Set-Location -Path $outomsgoplugindir @@ -178,7 +178,7 @@ if (Test-Path -Path $livenessprobeexepath){ Write-Host("livenessprobe.exe exists which indicates cpp build step succeeded") -ForegroundColor Green } else { Write-Host("livenessprobe.exe doesnt exist which indicates cpp build step failed") -ForegroundColor Red - exit + exit 1 } $installerdir = Join-Path -Path $builddir -ChildPath "common\installer" diff --git a/build/windows/installer/conf/fluent-bit.conf 
b/build/windows/installer/conf/fluent-bit.conf index 1eebe5fd6..c8f0eb373 100644 --- a/build/windows/installer/conf/fluent-bit.conf +++ b/build/windows/installer/conf/fluent-bit.conf @@ -1,16 +1,54 @@ [SERVICE] - Flush 15 - Daemon Off - Log_Level info - Log_File /etc/fluent-bit/fluent-bit.log + #Default service flush interval is 15 seconds + ${SERVICE_FLUSH_INTERVAL} + Daemon Off + storage.path /etc/fluent-bit/flbstore/ + storage.sync normal + storage.checksum off + storage.backlog.mem_limit 10M + Log_Level info + Parsers_File /etc/fluent-bit/azm-containers-parser.conf + Log_File /etc/fluent-bit/fluent-bit.log [INPUT] - Name forward - Listen 127.0.0.1 - Port 25230 - Mem_Buf_Limit 10m - Chunk_Size 32 - Buffer_Size 64 + Name tail + Tag oms.container.log.la.* + Path ${AZMON_LOG_TAIL_PATH} + Read_from_Head true + DB C:\\var\\log\\omsagent-fblogs.db + DB.Sync Off + Parser docker + ${TAIL_MEM_BUF_LIMIT} + ${TAIL_BUFFER_CHUNK_SIZE} + ${TAIL_BUFFER_MAX_SIZE} + Rotate_Wait 20 + Refresh_Interval 30 + Path_Key filepath + Skip_Long_Lines On + Ignore_Older 5m + Exclude_Path ${AZMON_CLUSTER_LOG_TAIL_EXCLUDE_PATH} + +[INPUT] + Name tail + Tag oms.container.log.flbplugin.* + Path C:\\var\\log\\containers\\omsagent*.log + Read_from_Head true + DB C:\\var\\log\\omsagent-fluentbit-containers.db + DB.Sync Off + Parser docker + Mem_Buf_Limit 1m + Path_Key filepath + Skip_Long_Lines On + Ignore_Older 2m + +[INPUT] + Name tcp + Tag oms.container.perf.telegraf.* + Listen 0.0.0.0 + Port 25229 + Chunk_Size 32 + Buffer_Size 64 + Mem_Buf_Limit 5m [INPUT] Name tcp diff --git a/build/windows/installer/conf/fluent.conf b/build/windows/installer/conf/fluent.conf index 741e5ce19..54a1c9cea 100644 --- a/build/windows/installer/conf/fluent.conf +++ b/build/windows/installer/conf/fluent.conf @@ -11,31 +11,6 @@ @log_level debug - - @type tail - path "#{ENV['AZMON_LOG_TAIL_PATH']}" - exclude_path "#{ENV['AZMON_CLUSTER_LOG_TAIL_EXCLUDE_PATH']}" - pos_file 
/var/opt/microsoft/fluent/fluentd-containers.log.pos - tag oms.container.log.la - @log_level trace - path_key tailed_path - limit_recently_modified 5m - # if the container runtime is non docker then this will be updated to fluent-cri-parser.conf during container startup - @include fluent-docker-parser.conf - - - - @type tail - path /var/log/containers/omsagent*.log - pos_file /opt/microsoft/fluent/omsagent-fluentd-containers.log.pos - tag oms.container.log.flbplugin - @log_level trace - path_key tailed_path - read_from_head true - # if the container runtime is non docker then this will be updated to fluent-cri-parser.conf during container startup - @include fluent-docker-parser.conf - - #custom_metrics_mdm filter plugin @type cadvisor2mdm @@ -44,23 +19,6 @@ @log_level info - - @type grep - - key stream - pattern "#{ENV['AZMON_LOG_EXCLUSION_REGEX_PATTERN']}" - - - - - @type record_transformer - # fluent-plugin-record-modifier more light-weight but needs to be installed (dependency worth it?) 
- remove_keys tailed_path - - filepath ${record["tailed_path"]} - - - @type mdm @log_level debug @@ -73,33 +31,7 @@ retry_max_times 10 retry_wait 5s retry_max_interval 5m - flush_thread_count 5 + flush_thread_count 5 retry_mdm_post_wait_minutes 30 - - - @type forward - send_timeout 60s - recover_wait 10s - hard_timeout 60s - heartbeat_type none - ignore_network_errors_at_startup true - - name logaggregationserver - host 127.0.0.1 - port 25230 - weight 60 - - - - overflow_action throw_exception - chunk_limit_size 32k - queued_chunks_limit_size 256 - flush_interval 1 - flush_thread_interval 0.5 - flush_thread_burst_interval 0.01 - flush_thread_count 4 - retry_forever true - - diff --git a/charts/azuremonitor-containers/values.yaml b/charts/azuremonitor-containers/values.yaml index 3ca313d38..0456eb625 100644 --- a/charts/azuremonitor-containers/values.yaml +++ b/charts/azuremonitor-containers/values.yaml @@ -21,8 +21,8 @@ Azure: omsagent: image: repo: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod" - tag: "ciprod10132021" - tagWindows: "win-ciprod10132021" + tag: "ciprod01312022" + tagWindows: "win-ciprod01312022" pullPolicy: IfNotPresent dockerProviderVersion: "16.0.0-0" agentVersion: "1.10.0.1" @@ -178,7 +178,7 @@ omsagent: memory: 750Mi daemonsetwindows: limits: - cpu: 200m + cpu: 500m memory: 600Mi deployment: requests: diff --git a/deployment/arc-k8s-extension/ServiceGroupRoot/Scripts/pushChartToAcr.sh b/deployment/arc-k8s-extension/ServiceGroupRoot/Scripts/pushChartToAcr.sh index 0451a038b..18d5eef5a 100644 --- a/deployment/arc-k8s-extension/ServiceGroupRoot/Scripts/pushChartToAcr.sh +++ b/deployment/arc-k8s-extension/ServiceGroupRoot/Scripts/pushChartToAcr.sh @@ -35,7 +35,6 @@ pull_chart_from_source_mcr_to_push_to_dest_acr() { echo "-e error dest acr path must be provided " exit 1 fi - echo "Pulling chart from MCR:${srcMcrFullPath} ..." helm chart pull ${srcMcrFullPath} if [ $? 
-eq 0 ]; then diff --git a/kubernetes/container-azm-ms-agentconfig.yaml b/kubernetes/container-azm-ms-agentconfig.yaml index dff8223ad..00ee4628e 100644 --- a/kubernetes/container-azm-ms-agentconfig.yaml +++ b/kubernetes/container-azm-ms-agentconfig.yaml @@ -144,7 +144,6 @@ data: tcp_listener_chunk_size = 10 tcp_listener_buffer_size = 10 tcp_listener_mem_buf_limit = 200 - # The following settings are "undocumented", we don't recommend uncommenting them unless directed by Microsoft. # They increase the maximum stdout/stderr log collection rate but will also cause higher cpu/memory usage. # [agent_settings.fbit_config] diff --git a/kubernetes/linux/Dockerfile b/kubernetes/linux/Dockerfile index 90acb4959..f3a9efd7a 100644 --- a/kubernetes/linux/Dockerfile +++ b/kubernetes/linux/Dockerfile @@ -2,8 +2,6 @@ FROM ubuntu:18.04 MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="Azure Monitor for containers" -ARG IMAGE_TAG=ciprod10132021 -ENV AGENT_VERSION ${IMAGE_TAG} ENV tmpdir /opt ENV APPLICATIONINSIGHTS_AUTH NzAwZGM5OGYtYTdhZC00NThkLWI5NWMtMjA3ZjM3NmM3YmRi ENV MALLOC_ARENA_MAX 2 @@ -18,6 +16,10 @@ ENV KUBE_CLIENT_BACKOFF_DURATION 0 ENV RUBY_GC_HEAP_OLDOBJECT_LIMIT_FACTOR 0.9 RUN /usr/bin/apt-get update && /usr/bin/apt-get install -y libc-bin wget openssl curl sudo python-ctypes init-system-helpers net-tools rsyslog cron vim dmidecode apt-transport-https gnupg && rm -rf /var/lib/apt/lists/* COPY setup.sh main.sh defaultpromenvvariables defaultpromenvvariables-rs defaultpromenvvariables-sidecar mdsd.xml envmdsd logrotate.conf $tmpdir/ + +ARG IMAGE_TAG=ciprod01312022 +ENV AGENT_VERSION ${IMAGE_TAG} + WORKDIR ${tmpdir} # copy docker provider shell bundle to use the agent image @@ -27,3 +29,4 @@ COPY ./Linux_ULINUX_1.0_x64_64_Release/docker-cimprov-*.*.*-*.x86_64.sh . 
RUN chmod 775 $tmpdir/*.sh; sync; $tmpdir/setup.sh CMD [ "/opt/main.sh" ] + diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index a9184ab53..2834bf972 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -206,7 +206,6 @@ if [ -e "/etc/omsagent-secret/WSID" ]; then echo "export MDSD_PROXY_USERNAME=$MDSD_PROXY_USERNAME" >> ~/.bashrc export MDSD_PROXY_PASSWORD_FILE=/opt/microsoft/docker-cimprov/proxy_password echo "export MDSD_PROXY_PASSWORD_FILE=$MDSD_PROXY_PASSWORD_FILE" >> ~/.bashrc - #TODO: Compression + proxy creates a deserialization error in ODS. This needs a fix in MDSD export MDSD_ODS_COMPRESSION_LEVEL=0 echo "export MDSD_ODS_COMPRESSION_LEVEL=$MDSD_ODS_COMPRESSION_LEVEL" >> ~/.bashrc @@ -425,19 +424,24 @@ fi #Setting environment variable for CAdvisor metrics to use port 10255/10250 based on curl request echo "Making wget request to cadvisor endpoint with port 10250" -#Defaults to use port 10255 -cAdvisorIsSecure=false -RET_CODE=`wget --server-response https://$NODE_IP:10250/stats/summary --no-check-certificate --header="Authorization: Bearer $(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" 2>&1 | awk '/^ HTTP/{print $2}'` -if [ $RET_CODE -eq 200 ]; then - cAdvisorIsSecure=true +#Defaults to use secure port: 10250 +cAdvisorIsSecure=true +RET_CODE=$(wget --server-response https://$NODE_IP:10250/stats/summary --no-check-certificate --header="Authorization: Bearer $(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" 2>&1 | awk '/^ HTTP/{print $2}') +if [ -z "$RET_CODE" ] || [ $RET_CODE -ne 200 ]; then + echo "Making wget request to cadvisor endpoint with port 10255 since failed with port 10250" + RET_CODE=$(wget --server-response http://$NODE_IP:10255/stats/summary 2>&1 | awk '/^ HTTP/{print $2}') + if [ ! 
-z "$RET_CODE" ] && [ $RET_CODE -eq 200 ]; then + cAdvisorIsSecure=false + fi fi -# default to docker since this is default in AKS as of now and change to containerd once this becomes default in AKS -export CONTAINER_RUNTIME="docker" +# default to containerd since this is common default in AKS and non-AKS +export CONTAINER_RUNTIME="containerd" export NODE_NAME="" + if [ "$cAdvisorIsSecure" = true ]; then - echo "Wget request using port 10250 succeeded. Using 10250" + echo "Using port 10250" export IS_SECURE_CADVISOR_PORT=true echo "export IS_SECURE_CADVISOR_PORT=true" >> ~/.bashrc export CADVISOR_METRICS_URL="https://$NODE_IP:10250/metrics" @@ -445,7 +449,7 @@ if [ "$cAdvisorIsSecure" = true ]; then echo "Making curl request to cadvisor endpoint /pods with port 10250 to get the configured container runtime on kubelet" podWithValidContainerId=$(curl -s -k -H "Authorization: Bearer $(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" https://$NODE_IP:10250/pods | jq -R 'fromjson? | [ .items[] | select( any(.status.phase; contains("Running")) ) ] | .[0]') else - echo "Wget request using port 10250 failed. Using port 10255" + echo "Using port 10255" export IS_SECURE_CADVISOR_PORT=false echo "export IS_SECURE_CADVISOR_PORT=false" >> ~/.bashrc export CADVISOR_METRICS_URL="http://$NODE_IP:10255/metrics" @@ -460,10 +464,10 @@ if [ ! 
-z "$podWithValidContainerId" ]; then # convert to lower case so that everywhere else can be used in lowercase containerRuntime=$(echo $containerRuntime | tr "[:upper:]" "[:lower:]") nodeName=$(echo $nodeName | tr "[:upper:]" "[:lower:]") - # update runtime only if its not empty, not null and not startswith docker + # use default container runtime if obtained runtime value is either empty or null if [ -z "$containerRuntime" -o "$containerRuntime" == null ]; then echo "using default container runtime as $CONTAINER_RUNTIME since got containeRuntime as empty or null" - elif [[ $containerRuntime != docker* ]]; then + else export CONTAINER_RUNTIME=$containerRuntime fi diff --git a/kubernetes/linux/setup.sh b/kubernetes/linux/setup.sh index 243677dd0..872ac99cf 100644 --- a/kubernetes/linux/setup.sh +++ b/kubernetes/linux/setup.sh @@ -30,10 +30,10 @@ sudo apt-get install jq=1.5+dfsg-2 -y #used to setcaps for ruby process to read /proc/env sudo apt-get install libcap2-bin -y -wget https://dl.influxdata.com/telegraf/releases/telegraf-1.18.0_linux_amd64.tar.gz -tar -zxvf telegraf-1.18.0_linux_amd64.tar.gz +wget https://dl.influxdata.com/telegraf/releases/telegraf-1.20.3_linux_amd64.tar.gz +tar -zxvf telegraf-1.20.3_linux_amd64.tar.gz -mv /opt/telegraf-1.18.0/usr/bin/telegraf /opt/telegraf +mv /opt/telegraf-1.20.3/usr/bin/telegraf /opt/telegraf chmod 777 /opt/telegraf @@ -44,7 +44,7 @@ chmod 777 /opt/telegraf wget -qO - https://packages.fluentbit.io/fluentbit.key | sudo apt-key add - sudo echo "deb https://packages.fluentbit.io/ubuntu/xenial xenial main" >> /etc/apt/sources.list sudo apt-get update -sudo apt-get install td-agent-bit=1.6.8 -y +sudo apt-get install td-agent-bit=1.7.8 -y # install ruby2.6 sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys F5DA5F09C3173AA6 @@ -52,7 +52,7 @@ sudo echo "deb http://ppa.launchpad.net/brightbox/ruby-ng/ubuntu bionic main" >> sudo apt-get update sudo apt-get install ruby2.6 ruby2.6-dev gcc make -y # fluentd v1 gem -gem 
install fluentd -v "1.12.2" --no-document +gem install fluentd -v "1.14.2" --no-document fluentd --setup ./fluent gem install gyoku iso8601 --no-doc @@ -61,6 +61,7 @@ rm -f $TMPDIR/docker-cimprov*.sh rm -f $TMPDIR/azure-mdsd*.deb rm -f $TMPDIR/mdsd.xml rm -f $TMPDIR/envmdsd +rm -f $TMPDIR/telegraf-*.tar.gz # remove build dependencies sudo apt-get remove ruby2.6-dev gcc make -y diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 66f8c4010..28c8803c6 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -357,7 +357,7 @@ spec: component: oms-agent tier: node annotations: - agentVersion: "1.10.0.1" + agentVersion: "azure-mdsd-1.14.2" dockerProviderVersion: "16.0.0-0" schema-versions: "v1" spec: @@ -368,7 +368,7 @@ spec: value: "3" containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10132021" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod01312022" imagePullPolicy: IfNotPresent resources: limits: @@ -454,7 +454,7 @@ spec: timeoutSeconds: 15 #Only in sidecar scraping mode # - name: omsagent-prometheus - # image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10132021" + # image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod01312022" # imagePullPolicy: IfNotPresent # resources: # limits: @@ -596,14 +596,14 @@ spec: labels: rsName: "omsagent-rs" annotations: - agentVersion: "1.10.0.1" + agentVersion: "azure-mdsd-1.14.2" dockerProviderVersion: "16.0.0-0" schema-versions: "v1" spec: serviceAccountName: omsagent containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10132021" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod01312022" imagePullPolicy: IfNotPresent resources: limits: @@ -765,7 +765,7 @@ spec: component: oms-agent-win tier: node-win annotations: - agentVersion: "1.10.0.1" + agentVersion: "0.0.0-0" dockerProviderVersion: "16.0.0-0" schema-versions: "v1" 
spec: @@ -776,13 +776,19 @@ spec: value: "3" containers: - name: omsagent-win - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod10132021" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod01312022" imagePullPolicy: IfNotPresent resources: limits: - cpu: 200m + cpu: 500m memory: 600Mi env: + - name: FBIT_SERVICE_FLUSH_INTERVAL + value: "15" + - name: FBIT_TAIL_BUFFER_CHUNK_SIZE + value: "1" + - name: FBIT_TAIL_BUFFER_MAX_SIZE + value: "1" # azure devops pipeline uses AKS_RESOURCE_ID and AKS_REGION hence ensure to uncomment these - name: AKS_RESOURCE_ID value: "VALUE_AKS_RESOURCE_ID_VALUE" diff --git a/kubernetes/windows/Dockerfile b/kubernetes/windows/Dockerfile index 0ddf67ab2..8296da480 100644 --- a/kubernetes/windows/Dockerfile +++ b/kubernetes/windows/Dockerfile @@ -3,14 +3,14 @@ MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="Azure Monitor for containers" -ARG IMAGE_TAG=win-ciprod10132021 +ARG IMAGE_TAG=win-ciprod01312022 # Do not split this into multiple RUN! 
# Docker creates a layer for every RUN-Statement RUN powershell -Command "Set-ExecutionPolicy Bypass -Scope Process -Force; iex ((New-Object System.Net.WebClient).DownloadString('https://chocolatey.org/install.ps1'))" # Fluentd depends on cool.io whose fat gem is only available for Ruby < 2.5, so need to specify --platform ruby when install Ruby > 2.5 and install msys2 to get dev tools RUN choco install -y ruby --version 2.6.5.1 --params "'/InstallDir:C:\ruby26'" \ -&& choco install -y msys2 --version 20210604.0.0 --params "'/NoPath /NoUpdate /InstallDir:C:\ruby26\msys64'" \ +&& choco install -y msys2 --version 20211130.0.0 --params "'/NoPath /NoUpdate /InstallDir:C:\ruby26\msys64'" \ && choco install -y vim # gangams - optional MSYS2 update via ridk failing in merged docker file so skipping that since we dont need optional update @@ -20,7 +20,7 @@ RUN refreshenv \ && gem install cool.io -v 1.5.4 --platform ruby \ && gem install oj -v 3.3.10 \ && gem install json -v 2.2.0 \ -&& gem install fluentd -v 1.12.2 \ +&& gem install fluentd -v 1.14.2 \ && gem install win32-service -v 1.0.1 \ && gem install win32-ipc -v 0.7.0 \ && gem install win32-event -v 0.6.3 \ @@ -61,6 +61,7 @@ COPY ./omsagentwindows/installer/conf/fluent.conf /etc/fluent/ COPY ./omsagentwindows/installer/conf/fluent-cri-parser.conf /etc/fluent/ COPY ./omsagentwindows/installer/conf/fluent-docker-parser.conf /etc/fluent/ COPY ./omsagentwindows/installer/conf/fluent-bit.conf /etc/fluent-bit +COPY ./omsagentwindows/installer/conf/azm-containers-parser.conf /etc/fluent-bit/ COPY ./omsagentwindows/installer/conf/out_oms.conf /etc/omsagentwindows # copy telegraf conf file diff --git a/kubernetes/windows/Dockerfile-dev-base-image b/kubernetes/windows/Dockerfile-dev-base-image index 0081f9c53..501fead89 100644 --- a/kubernetes/windows/Dockerfile-dev-base-image +++ b/kubernetes/windows/Dockerfile-dev-base-image @@ -18,7 +18,7 @@ RUN refreshenv \ && gem install cool.io -v 1.5.4 --platform ruby \ && gem install 
oj -v 3.3.10 \ && gem install json -v 2.2.0 \ -&& gem install fluentd -v 1.12.2 \ +&& gem install fluentd -v 1.14.2 \ && gem install win32-service -v 1.0.1 \ && gem install win32-ipc -v 0.7.0 \ && gem install win32-event -v 0.6.3 \ diff --git a/kubernetes/windows/Dockerfile-dev-image b/kubernetes/windows/Dockerfile-dev-image index 35aa83bd9..c38889f7b 100644 --- a/kubernetes/windows/Dockerfile-dev-image +++ b/kubernetes/windows/Dockerfile-dev-image @@ -19,10 +19,8 @@ COPY ./omsagentwindows/out_oms.so /opt/omsagentwindows/out_oms.so # copy fluent, fluent-bit and out_oms conf files COPY ./omsagentwindows/installer/conf/fluent.conf /etc/fluent/ -# copy fluent docker and cri parser conf files -COPY ./omsagentwindows/installer/conf/fluent-cri-parser.conf /etc/fluent/ -COPY ./omsagentwindows/installer/conf/fluent-docker-parser.conf /etc/fluent/ COPY ./omsagentwindows/installer/conf/fluent-bit.conf /etc/fluent-bit +COPY ./omsagentwindows/installer/conf/azm-containers-parser.conf /etc/fluent-bit/ COPY ./omsagentwindows/installer/conf/out_oms.conf /etc/omsagentwindows # copy telegraf conf file diff --git a/kubernetes/windows/dockerbuild/build-and-publish-dev-docker-image.ps1 b/kubernetes/windows/dockerbuild/build-and-publish-dev-docker-image.ps1 index 0fde7f379..b87132218 100644 --- a/kubernetes/windows/dockerbuild/build-and-publish-dev-docker-image.ps1 +++ b/kubernetes/windows/dockerbuild/build-and-publish-dev-docker-image.ps1 @@ -15,18 +15,18 @@ Write-Host("current script dir : " + $currentdir + " ") if ($false -eq (Test-Path -Path $currentdir)) { Write-Host("Invalid current dir : " + $currentdir + " ") -ForegroundColor Red - exit + exit 1 } if ([string]::IsNullOrEmpty($image)) { Write-Host "Image parameter shouldnt be null or empty" -ForegroundColor Red - exit + exit 1 } $imageparts = $image.split(":") if (($imageparts.Length -ne 2)){ Write-Host "Image not in valid format. 
Expected format should be /:" -ForegroundColor Red - exit + exit 1 } $imagetag = $imageparts[1].ToLower() @@ -48,7 +48,7 @@ $dockerFileDir = Split-Path -Path $currentdir Write-Host("builddir dir : " + $dockerFileDir + " ") if ($false -eq (Test-Path -Path $dockerFileDir)) { Write-Host("Invalid dockerFile Dir : " + $dockerFileDir + " ") -ForegroundColor Red - exit + exit 1 } Write-Host "changing directory to DockerFile dir: $dockerFileDir" diff --git a/kubernetes/windows/dockerbuild/build-and-publish-docker-image.ps1 b/kubernetes/windows/dockerbuild/build-and-publish-docker-image.ps1 index dbcfa6097..c1f655882 100644 --- a/kubernetes/windows/dockerbuild/build-and-publish-docker-image.ps1 +++ b/kubernetes/windows/dockerbuild/build-and-publish-docker-image.ps1 @@ -15,18 +15,18 @@ Write-Host("current script dir : " + $currentdir + " ") if ($false -eq (Test-Path -Path $currentdir)) { Write-Host("Invalid current dir : " + $currentdir + " ") -ForegroundColor Red - exit + exit 1 } if ([string]::IsNullOrEmpty($image)) { Write-Host "Image parameter shouldnt be null or empty" -ForegroundColor Red - exit + exit 1 } $imageparts = $image.split(":") if (($imageparts.Length -ne 2)){ Write-Host "Image not in valid format. 
Expected format should be /:" -ForegroundColor Red - exit + exit 1 } $imagetag = $imageparts[1].ToLower() @@ -48,7 +48,7 @@ $dockerFileDir = Split-Path -Path $currentdir Write-Host("builddir dir : " + $dockerFileDir + " ") if ($false -eq (Test-Path -Path $dockerFileDir)) { Write-Host("Invalid dockerFile Dir : " + $dockerFileDir + " ") -ForegroundColor Red - exit + exit 1 } Write-Host "changing directory to DockerFile dir: $dockerFileDir" diff --git a/kubernetes/windows/dockerbuild/build-dev-base-image.ps1 b/kubernetes/windows/dockerbuild/build-dev-base-image.ps1 index 142e20c3f..4b17239d2 100644 --- a/kubernetes/windows/dockerbuild/build-dev-base-image.ps1 +++ b/kubernetes/windows/dockerbuild/build-dev-base-image.ps1 @@ -1,6 +1,6 @@ <# .DESCRIPTION - Builds the Docker Image locally for the server core ltsc base and installs dependencies + Builds the Docker Image locally for the server core ltsc base and installs dependencies #> @@ -9,7 +9,7 @@ Write-Host("current script dir : " + $currentdir + " ") if ($false -eq (Test-Path -Path $currentdir)) { Write-Host("Invalid current dir : " + $currentdir + " ") -ForegroundColor Red - exit + exit 1 } Write-Host "start:Building the cert generator and out oms code via Makefile.ps1" @@ -20,7 +20,7 @@ $dockerFileDir = Split-Path -Path $currentdir Write-Host("builddir dir : " + $dockerFileDir + " ") if ($false -eq (Test-Path -Path $dockerFileDir)) { Write-Host("Invalid dockerFile Dir : " + $dockerFileDir + " ") -ForegroundColor Red - exit + exit 1 } Write-Host "changing directory to DockerFile dir: $dockerFileDir" diff --git a/kubernetes/windows/main.ps1 b/kubernetes/windows/main.ps1 index 7f41c860f..106e94c0f 100644 --- a/kubernetes/windows/main.ps1 +++ b/kubernetes/windows/main.ps1 @@ -52,23 +52,29 @@ function Set-EnvironmentVariables { if ($domain -eq "opinsights.azure.com") { $cloud_environment = "azurepubliccloud" $mcs_endpoint = "monitor.azure.com" - } elseif ($domain -eq "opinsights.azure.cn") { + } + elseif ($domain -eq 
"opinsights.azure.cn") { $cloud_environment = "azurechinacloud" $mcs_endpoint = "monitor.azure.cn" - } elseif ($domain -eq "opinsights.azure.us") { + } + elseif ($domain -eq "opinsights.azure.us") { $cloud_environment = "azureusgovernmentcloud" $mcs_endpoint = "monitor.azure.us" - } elseif ($domain -eq "opinsights.azure.eaglex.ic.gov") { + } + elseif ($domain -eq "opinsights.azure.eaglex.ic.gov") { $cloud_environment = "usnat" $mcs_endpoint = "monitor.azure.eaglex.ic.gov" - } elseif ($domain -eq "opinsights.azure.microsoft.scloud") { + } + elseif ($domain -eq "opinsights.azure.microsoft.scloud") { $cloud_environment = "ussec" $mcs_endpoint = "monitor.azure.microsoft.scloud" - } else { + } + else { Write-Host "Invalid or Unsupported domain name $($domain). EXITING....." exit 1 } - } else { + } + else { Write-Host "Domain name either null or empty. EXITING....." exit 1 } @@ -297,6 +303,13 @@ function Set-EnvironmentVariables { ruby /opt/omsagentwindows/scripts/ruby/tomlparser.rb .\setenv.ps1 + #Parse the configmap to set the right environment variables for agent config. + ruby /opt/omsagentwindows/scripts/ruby/tomlparser-agent-config.rb + .\setagentenv.ps1 + + #Replace placeholders in fluent-bit.conf + ruby /opt/omsagentwindows/scripts/ruby/td-agent-bit-conf-customizer.rb + # run mdm config parser ruby /opt/omsagentwindows/scripts/ruby/tomlparser-mdm-metrics-config.rb .\setmdmenv.ps1 @@ -417,7 +430,9 @@ function Get-ContainerRuntime { function Start-Fluent-Telegraf { - # Run fluent-bit service first so that we do not miss any logs being forwarded by the fluentd service and telegraf service. + $containerRuntime = Get-ContainerRuntime + + # Run fluent-bit service first so that we do not miss any logs being forwarded by the telegraf service. # Run fluent-bit as a background job. 
Switch this to a windows service once fluent-bit supports natively running as a windows service Start-Job -ScriptBlock { Start-Process -NoNewWindow -FilePath "C:\opt\fluent-bit\bin\fluent-bit.exe" -ArgumentList @("-c", "C:\etc\fluent-bit\fluent-bit.conf", "-e", "C:\opt\omsagentwindows\out_oms.so") } @@ -428,7 +443,7 @@ function Start-Fluent-Telegraf { if (![string]::IsNullOrEmpty($containerRuntime) -and [string]$containerRuntime.StartsWith('docker') -eq $false) { # change parser from docker to cri if the container runtime is not docker Write-Host "changing parser from Docker to CRI since container runtime : $($containerRuntime) and which is non-docker" - (Get-Content -Path C:/etc/fluent/fluent.conf -Raw) -replace 'fluent-docker-parser.conf', 'fluent-cri-parser.conf' | Set-Content C:/etc/fluent/fluent.conf + (Get-Content -Path C:/etc/fluent-bit/fluent-bit.conf -Raw) -replace 'docker', 'cri' | Set-Content C:/etc/fluent-bit/fluent-bit.conf } # Start telegraf only in sidecar scraping mode @@ -482,6 +497,11 @@ function Start-Telegraf { Write-Host "Failed to set environment variable NODE_IP for target 'machine' since it is either null or empty" } + $hostName = [System.Environment]::GetEnvironmentVariable("HOSTNAME", "process") + Write-Host "nodename: $($hostName)" + Write-Host "replacing nodename in telegraf config" + (Get-Content "C:\etc\telegraf\telegraf.conf").replace('placeholder_hostname', $hostName) | Set-Content "C:\etc\telegraf\telegraf.conf" + Write-Host "Installing telegraf service" C:\opt\telegraf\telegraf.exe --service install --config "C:\etc\telegraf\telegraf.conf" @@ -581,10 +601,12 @@ if (![string]::IsNullOrEmpty($requiresCertBootstrap) -and ` $isAADMSIAuth = [System.Environment]::GetEnvironmentVariable("USING_AAD_MSI_AUTH") if (![string]::IsNullOrEmpty($isAADMSIAuth) -and $isAADMSIAuth.ToLower() -eq 'true') { Write-Host "skipping agent onboarding via cert since AAD MSI Auth configured" -} else { +} +else { Generate-Certificates Test-CertificatePath } + 
Start-Fluent-Telegraf # List all powershell processes running. This should have main.ps1 and filesystemwatcher.ps1 diff --git a/kubernetes/windows/setup.ps1 b/kubernetes/windows/setup.ps1 index 3e47b7eb2..857f9f690 100644 --- a/kubernetes/windows/setup.ps1 +++ b/kubernetes/windows/setup.ps1 @@ -1,8 +1,3 @@ -# -################# Dangerous to use appveyor links - the builds are removed after 6 months -# -#ARG FLUENTBIT_URL=https://ci.appveyor.com/api/buildjobs/37lho3xf8j5i6crj/artifacts/build%2Ftd-agent-bit-1.4.0-win64.zip - Write-Host ('Creating folder structure') New-Item -Type Directory -Path /installation -ErrorAction SilentlyContinue @@ -21,7 +16,7 @@ Write-Host ('Creating folder structure') Write-Host ('Installing Fluent Bit'); try { - $fluentBitUri='https://github.com/microsoft/OMS-docker/releases/download/winakslogagent/td-agent-bit-1.4.0-win64.zip' + $fluentBitUri='https://fluentbit.io/releases/1.7/td-agent-bit-1.7.8-win64.zip' Invoke-WebRequest -Uri $fluentBitUri -OutFile /installation/td-agent-bit.zip Expand-Archive -Path /installation/td-agent-bit.zip -Destination /installation/fluent-bit Move-Item -Path /installation/fluent-bit/*/* -Destination /opt/fluent-bit/ -ErrorAction SilentlyContinue @@ -36,7 +31,7 @@ Write-Host ('Finished Installing Fluentbit') Write-Host ('Installing Telegraf'); try { - $telegrafUri='https://dl.influxdata.com/telegraf/releases/telegraf-1.18.0_windows_amd64.zip' + $telegrafUri='https://dl.influxdata.com/telegraf/releases/telegraf-1.20.3_windows_amd64.zip' Invoke-WebRequest -Uri $telegrafUri -OutFile /installation/telegraf.zip Expand-Archive -Path /installation/telegraf.zip -Destination /installation/telegraf Move-Item -Path /installation/telegraf/*/* -Destination /opt/telegraf/ -ErrorAction SilentlyContinue diff --git a/scripts/build/windows/install-build-pre-requisites.ps1 b/scripts/build/windows/install-build-pre-requisites.ps1 index 7f1c9b54f..1ea316798 100755 --- a/scripts/build/windows/install-build-pre-requisites.ps1 +++ 
b/scripts/build/windows/install-build-pre-requisites.ps1 @@ -2,7 +2,7 @@ function Install-Go { $tempDir = $env:TEMP if ($false -eq (Test-Path -Path $tempDir)) { Write-Host("Invalid TEMP dir PATH : " + $tempDir + " ") -ForegroundColor Red - exit + exit 1 } $tempGo = Join-Path -Path $tempDir -ChildPath "gotemp" @@ -10,7 +10,7 @@ function Install-Go { New-Item -Path $tempGo -ItemType "directory" -Force -ErrorAction Stop if ($false -eq (Test-Path -Path $tempGo)) { Write-Host("Invalid tempGo : " + $tempGo + " ") -ForegroundColor Red - exit + exit 1 } $url = "https://dl.google.com/go/go1.15.14.windows-amd64.msi" @@ -35,7 +35,7 @@ function Build-Dependencies { $tempDir = $env:TEMP if ($false -eq (Test-Path -Path $tempDir)) { Write-Host("Invalid TEMP dir PATH : " + $tempDir + " ") -ForegroundColor Red - exit + exit 1 } $tempDependencies = Join-Path -Path $tempDir -ChildPath "gcctemp" @@ -43,7 +43,7 @@ function Build-Dependencies { New-Item -Path $tempDependencies -ItemType "directory" -Force -ErrorAction Stop if ($false -eq (Test-Path -Path $tempDependencies)) { Write-Host("Invalid temp Dir : " + $tempDependencies + " ") -ForegroundColor Red - exit + exit 1 } @@ -82,7 +82,7 @@ function Install-DotNetCoreSDK() { $tempDir = $env:TEMP if ($false -eq (Test-Path -Path $tempDir)) { Write-Host("Invalid TEMP dir : " + $tempDir + " ") -ForegroundColor Red - exit + exit 1 } $dotNetSdkTemp = Join-Path -Path $tempDir -ChildPath "dotNetSdk" @@ -90,7 +90,7 @@ function Install-DotNetCoreSDK() { New-Item -Path $dotNetSdkTemp -ItemType "directory" -Force -ErrorAction Stop if ($false -eq (Test-Path -Path $dotNetSdkTemp)) { Write-Host("Invalid dotNetSdkTemp : " + $tempDir + " ") -ForegroundColor Red - exit + exit 1 } $url = "https://download.visualstudio.microsoft.com/download/pr/4e88f517-196e-4b17-a40c-2692c689661d/eed3f5fca28262f764d8b650585a7278/dotnet-sdk-3.1.301-win-x64.exe" @@ -110,7 +110,7 @@ function Install-Docker() { $tempDir = $env:TEMP if ($false -eq (Test-Path -Path $tempDir)) { 
Write-Host("Invalid TEMP dir PATH : " + $tempDir + " ") -ForegroundColor Red - exit + exit 1 } $dockerTemp = Join-Path -Path $tempDir -ChildPath "docker" @@ -118,7 +118,7 @@ function Install-Docker() { New-Item -Path $dockerTemp -ItemType "directory" -Force -ErrorAction Stop if ($false -eq (Test-Path -Path $dockerTemp)) { Write-Host("Invalid dockerTemp : " + $tempDir + " ") -ForegroundColor Red - exit + exit 1 } $url = "https://download.docker.com/win/stable/Docker%20Desktop%20Installer.exe" diff --git a/scripts/onboarding/aks/mdmonboarding/mdm_onboarding.ps1 b/scripts/onboarding/aks/mdmonboarding/mdm_onboarding.ps1 index dcf73f098..a5d95c31e 100644 --- a/scripts/onboarding/aks/mdmonboarding/mdm_onboarding.ps1 +++ b/scripts/onboarding/aks/mdmonboarding/mdm_onboarding.ps1 @@ -39,7 +39,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azAksModule) -or ($null -eq $az else { Write-Host("Please run the script as an administrator") -ForegroundColor Red Stop-Transcript - exit + exit 1 } @@ -66,7 +66,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azAksModule) -or ($null -eq $az } catch { Write-Host("Close other powershell logins and try installing the latest modules forAz.Accounts in a new powershell window: eg. 'Install-Module Az.Accounts -Repository PSGallery -Force'") -ForegroundColor Red - exit + exit 1 } } @@ -77,7 +77,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azAksModule) -or ($null -eq $az } catch { Write-Host("Close other powershell logins and try installing the latest modules forAz.Accounts in a new powershell window: eg. 'Install-Module Az.Accounts -Repository PSGallery -Force'") -ForegroundColor Red - exit + exit 1 } } @@ -88,7 +88,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azAksModule) -or ($null -eq $az } catch { Write-Host("Close other powershell logins and try installing the latest modules for Az.Aks in a new powershell window: eg. 
'Install-Module Az.Aks -Repository PSGallery -Force'") -ForegroundColor Red - exit + exit 1 } } @@ -103,7 +103,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azAksModule) -or ($null -eq $az Write-Host("Could not import Az.Resources...") -ForegroundColor Red Write-Host("Close other powershell logins and try installing the latest modules for Az.Resources in a new powershell window: eg. 'Install-Module Az.Resources -Repository PSGallery -Force'") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } if ($null -eq $azAccountModule) { @@ -114,7 +114,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azAksModule) -or ($null -eq $az Write-Host("Could not import Az.Accounts...") -ForegroundColor Red Write-Host("Close other powershell logins and try installing the latest modules for Az.Accounts in a new powershell window: eg. 'Install-Module Az.Accounts -Repository PSGallery -Force'") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } if ($null -eq $azAksModule) { @@ -124,7 +124,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azAksModule) -or ($null -eq $az catch { Write-Host("Could not import Az.Aks... Please reinstall this Module") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } @@ -132,7 +132,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azAksModule) -or ($null -eq $az 2 { Write-Host("") Stop-Transcript - exit + exit 1 } } } @@ -161,7 +161,7 @@ if ($account.Account -eq $null) { Write-Host("Could not select subscription with ID : " + $SubscriptionId + ". Please make sure the ID you entered is correct and you have access to the cluster" ) -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } } else { @@ -181,7 +181,7 @@ else { Write-Host("Could not select subscription with ID : " + $SubscriptionId + ". 
Please make sure the ID you entered is correct and you have access to the cluster" ) -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } } } @@ -196,7 +196,7 @@ if ($notPresent) { Write-Host("Could not find Aks cluster. Please make sure that specified cluster exists: '" + $clusterName + "'is correct and you have access to the cluster") -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } Write-Host("Successfully checked specified cluster exists details...") -ForegroundColor Green diff --git a/scripts/onboarding/aks/mdmonboarding/mdm_onboarding_atscale.ps1 b/scripts/onboarding/aks/mdmonboarding/mdm_onboarding_atscale.ps1 index a791bb18e..32311ca61 100644 --- a/scripts/onboarding/aks/mdmonboarding/mdm_onboarding_atscale.ps1 +++ b/scripts/onboarding/aks/mdmonboarding/mdm_onboarding_atscale.ps1 @@ -30,7 +30,7 @@ if (($null -eq $azAccountModule) -or ( $null -eq $azAksModule ) -or ($null -eq $ else { Write-Host("Please run the script as an administrator") -ForegroundColor Red Stop-Transcript - exit + exit 1 } @@ -57,7 +57,7 @@ if (($null -eq $azAccountModule) -or ( $null -eq $azAksModule ) -or ($null -eq $ } catch { Write-Host("Close other powershell logins and try installing the latest modules forAz.Accounts in a new powershell window: eg. 'Install-Module Az.Accounts -Repository PSGallery -Force'") -ForegroundColor Red - exit + exit 1 } } @@ -68,7 +68,7 @@ if (($null -eq $azAccountModule) -or ( $null -eq $azAksModule ) -or ($null -eq $ } catch { Write-Host("Close other powershell logins and try installing the latest modules forAz.Accounts in a new powershell window: eg. 'Install-Module Az.Accounts -Repository PSGallery -Force'") -ForegroundColor Red - exit + exit 1 } } @@ -79,7 +79,7 @@ if (($null -eq $azAccountModule) -or ( $null -eq $azAksModule ) -or ($null -eq $ } catch { Write-Host("Close other powershell logins and try installing the latest modules for Az.Aks in a new powershell window: eg. 
'Install-Module Az.Aks -Repository PSGallery -Force'") -ForegroundColor Red - exit + exit 1 } } @@ -94,7 +94,7 @@ if (($null -eq $azAccountModule) -or ( $null -eq $azAksModule ) -or ($null -eq $ Write-Host("Could not import Az.Resources...") -ForegroundColor Red Write-Host("Close other powershell logins and try installing the latest modules for Az.Resources in a new powershell window: eg. 'Install-Module Az.Resources -Repository PSGallery -Force'") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } if ($null -eq $azAccountModule) { @@ -105,7 +105,7 @@ if (($null -eq $azAccountModule) -or ( $null -eq $azAksModule ) -or ($null -eq $ Write-Host("Could not import Az.Accounts...") -ForegroundColor Red Write-Host("Close other powershell logins and try installing the latest modules for Az.Accounts in a new powershell window: eg. 'Install-Module Az.Accounts -Repository PSGallery -Force'") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } if ($null -eq $azAksModule) { @@ -115,7 +115,7 @@ if (($null -eq $azAccountModule) -or ( $null -eq $azAksModule ) -or ($null -eq $ catch { Write-Host("Could not import Az.Aks... Please reinstall this Module") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } @@ -123,7 +123,7 @@ if (($null -eq $azAccountModule) -or ( $null -eq $azAksModule ) -or ($null -eq $ 2 { Write-Host("") Stop-Transcript - exit + exit 1 } } } @@ -152,7 +152,7 @@ if ($account.Account -eq $null) { Write-Host("Could not select subscription with ID : " + $SubscriptionId + ". Please make sure the ID you entered is correct and you have access to the cluster" ) -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } } else { @@ -172,7 +172,7 @@ else { Write-Host("Could not select subscription with ID : " + $SubscriptionId + ". 
Please make sure the ID you entered is correct and you have access to the cluster" ) -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } } } @@ -187,7 +187,7 @@ if ($notPresent) { Write-Host("Failed to get Aks clusters in specified subscription. Please make sure that you have access to the existing clusters") -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } Write-Host("Successfully got all aks clusters ...") -ForegroundColor Green diff --git a/scripts/onboarding/aks/onboarding-using-msi-auth/existingClusterOnboarding.json b/scripts/onboarding/aks/onboarding-using-msi-auth/existingClusterOnboarding.json new file mode 100644 index 000000000..c77e3203d --- /dev/null +++ b/scripts/onboarding/aks/onboarding-using-msi-auth/existingClusterOnboarding.json @@ -0,0 +1,210 @@ +{ + "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "parameters": { + "aksResourceId": { + "type": "string", + "metadata": { + "description": "AKS Cluster Resource ID" + } + }, + "aksResourceLocation": { + "type": "string", + "metadata": { + "description": "Location of the AKS resource e.g. \"East US\"" + } + }, + "aksResourceTagValues": { + "type": "object", + "metadata": { + "description": "All existing tags on the AKS Cluster Resource" + } + }, + "workspaceLocation": { + "type": "string", + "metadata": { + "description": "Workspace Location for data collection rule" + } + }, + "workspaceResourceId": { + "type": "string", + "metadata": { + "description": "Full Resource ID of the log analytics workspace that will be used for data destination.
For example /subscriptions/00000000-0000-0000-0000-0000-00000000/resourceGroups/ResourceGroupName/providers/Microsoft.operationalinsights/workspaces/ws_xyz" + } + }, + "dcrResourceTagValues": { + "type": "object", + "metadata": { + "description": "Existing or new tags on DCR Cluster Resource" + } + } + }, + "variables": { + "clusterSubscriptionId": "[split(parameters('aksResourceId'),'/')[2]]", + "clusterResourceGroup": "[split(parameters('aksResourceId'),'/')[4]]", + "clusterName": "[split(parameters('aksResourceId'),'/')[8]]", + "workspaceSubscriptionId": "[split(parameters('workspaceResourceId'),'/')[2]]", + "workspaceResourceGroup": "[split(parameters('workspaceResourceId'),'/')[4]]", + "dcrName": "[Concat('MSCI', '-', split(parameters('workspaceResourceId'),'/')[8])]", + "associationName": "ContainerInsightsExtension", + "dataCollectionRuleId": "[resourceId(variables('workspaceSubscriptionId'), variables('workspaceResourceGroup'), 'Microsoft.Insights/dataCollectionRules', variables('dcrName'))]" + }, + "resources": [ + { + "type": "Microsoft.Resources/deployments", + "name": "[Concat('aks-monitoring-msi-dcr', '-', uniqueString(variables('dcrName')))]", + "apiVersion": "2017-05-10", + "subscriptionId": "[variables('workspaceSubscriptionId')]", + "resourceGroup": "[variables('workspaceResourceGroup')]", + "properties": { + "mode": "Incremental", + "template": { + "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "parameters": {}, + "variables": {}, + "resources": [ + { + "type": "Microsoft.Insights/dataCollectionRules", + "apiVersion": "2019-11-01-preview", + "name": "[variables('dcrName')]", + "location": "[parameters('workspaceLocation')]", + "tags": "[parameters('dcrResourceTagValues')]", + "kind": "Linux", + "properties": { + "dataSources": { + "extensions": [ + { + "name": "ContainerInsightsExtension", + "streams": [ + "Microsoft-Perf", + "Microsoft-ContainerInventory", + 
"Microsoft-ContainerLog", + "Microsoft-ContainerLogV2", + "Microsoft-ContainerNodeInventory", + "Microsoft-KubeEvents", + "Microsoft-KubeHealth", + "Microsoft-KubeMonAgentEvents", + "Microsoft-KubeNodeInventory", + "Microsoft-KubePodInventory", + "Microsoft-KubePVInventory", + "Microsoft-KubeServices", + "Microsoft-InsightsMetrics" + ], + "extensionName": "ContainerInsights" + } + ] + }, + "destinations": { + "logAnalytics": [ + { + "workspaceResourceId": "[parameters('workspaceResourceId')]", + "name": "ciworkspace" + } + ] + }, + "dataFlows": [ + { + "streams": [ + "Microsoft-Perf", + "Microsoft-ContainerInventory", + "Microsoft-ContainerLog", + "Microsoft-ContainerLogV2", + "Microsoft-ContainerNodeInventory", + "Microsoft-KubeEvents", + "Microsoft-KubeHealth", + "Microsoft-KubeMonAgentEvents", + "Microsoft-KubeNodeInventory", + "Microsoft-KubePodInventory", + "Microsoft-KubePVInventory", + "Microsoft-KubeServices", + "Microsoft-InsightsMetrics" + ], + "destinations": [ + "ciworkspace" + ] + } + ] + } + } + ] + }, + "parameters": {} + } + }, + { + "type": "Microsoft.Resources/deployments", + "name": "[Concat('aks-monitoring-msi-dcra', '-', uniqueString(parameters('aksResourceId')))]", + "apiVersion": "2017-05-10", + "subscriptionId": "[variables('clusterSubscriptionId')]", + "resourceGroup": "[variables('clusterResourceGroup')]", + "dependsOn": [ + "[Concat('aks-monitoring-msi-dcr', '-', uniqueString(variables('dcrName')))]" + ], + "properties": { + "mode": "Incremental", + "template": { + "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "parameters": {}, + "variables": {}, + "resources": [ + { + "type": "Microsoft.ContainerService/managedClusters/providers/dataCollectionRuleAssociations", + "name": "[concat(variables('clusterName'),'/microsoft.insights/', variables('associationName'))]", + "apiVersion": "2019-11-01-preview", + "properties": { + "description": "Association of data 
collection rule. Deleting this association will break the data collection for this AKS Cluster.", + "dataCollectionRuleId": "[variables('dataCollectionRuleId')]" + } + } + + ] + }, + "parameters": {} + } + }, + { + "type": "Microsoft.Resources/deployments", + "name": "[Concat('aks-monitoring-msi-addon', '-', uniqueString(parameters('aksResourceId')))]", + "apiVersion": "2017-05-10", + "subscriptionId": "[variables('clusterSubscriptionId')]", + "resourceGroup": "[variables('clusterResourceGroup')]", + "dependsOn": [ + "[Concat('aks-monitoring-msi-dcra', '-', uniqueString(parameters('aksResourceId')))]" + ], + "properties": { + "mode": "Incremental", + "template": { + "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "parameters": {}, + "variables": {}, + "resources": [ + { + "name": "[variables('clusterName')]", + "type": "Microsoft.ContainerService/managedClusters", + "location": "[parameters('aksResourceLocation')]", + "tags": "[parameters('aksResourceTagValues')]", + "apiVersion": "2018-03-31", + "properties": { + "mode": "Incremental", + "id": "[parameters('aksResourceId')]", + "addonProfiles": { + "omsagent": { + "enabled": true, + "config": { + "logAnalyticsWorkspaceResourceID": "[parameters('workspaceResourceId')]", + "useAADAuth": "true" + } + } + } + } + } + ] + }, + "parameters": {} + } + } + ] +} diff --git a/scripts/onboarding/aks/onboarding-using-msi-auth/existingClusterParam.json b/scripts/onboarding/aks/onboarding-using-msi-auth/existingClusterParam.json new file mode 100644 index 000000000..31f0f9c49 --- /dev/null +++ b/scripts/onboarding/aks/onboarding-using-msi-auth/existingClusterParam.json @@ -0,0 +1,32 @@ +{ + "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentParameters.json#", + "contentVersion": "1.0.0.0", + "parameters": { + "aksResourceId": { + "value": "/subscriptions//resourcegroups//providers/Microsoft.ContainerService/managedClusters/" 
+ }, + "aksResourceLocation": { + "value": "" + }, + "aksResourceTagValues": { + "value": { + "": "", + "": "", + "": "" + } + }, + "workspaceResourceId": { + "value": "/subscriptions//resourceGroups//providers/Microsoft.OperationalInsights/workspaces/" + }, + "workspaceLocation": { + "value": "" + }, + "dcrResourceTagValues": { + "value": { + "": "", + "": "", + "": "" + } + } + } + } diff --git a/scripts/onboarding/aksengine/kubernetes/AddMonitoringWorkspaceTags.ps1 b/scripts/onboarding/aksengine/kubernetes/AddMonitoringWorkspaceTags.ps1 index 29f629878..a0965f960 100644 --- a/scripts/onboarding/aksengine/kubernetes/AddMonitoringWorkspaceTags.ps1 +++ b/scripts/onboarding/aksengine/kubernetes/AddMonitoringWorkspaceTags.ps1 @@ -64,7 +64,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule)) { else { Write-Host("Please run the script as an administrator") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } @@ -89,7 +89,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule)) { } catch { Write-Host("Close other powershell logins and try installing the latest modules for Az.Resources in a new powershell window: eg. 'Install-Module Az.Resources -Repository PSGallery -Force'") -ForegroundColor Red - exit + exit 1 } try { Write-Host("Installing Az.Accounts...") @@ -97,7 +97,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule)) { } catch { Write-Host("Close other powershell logins and try installing the latest modules for Az.Accounts in a new powershell window: eg. 'Install-Module Az.Accounts -Repository PSGallery -Force'") -ForegroundColor Red - exit + exit 1 } } @@ -109,7 +109,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule)) { Write-Host("Could not import Az.Resources ...") -ForegroundColor Red Write-Host("Close other powershell logins and try installing the latest modules for Az.Resources in a new powershell window: eg. 
'Install-Module Az.Resources -Repository PSGallery -Force'") -ForegroundColor Red Stop-Transcript - exit + exit 1 } try { Import-Module Az.Accounts @@ -117,14 +117,14 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule)) { catch { Write-Host("Could not import Az.Accounts... Please reinstall this Module") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } 2 { Write-Host("") Stop-Transcript - exit + exit 1 } } } @@ -138,7 +138,7 @@ if ($NameoftheCloud -like "AzureCloud" -or } else { Write-Host("Error: Monitoring not supported in this cloud: $NameoftheCloud") -ForegroundColor Red - exit + exit 1 } # @@ -151,7 +151,7 @@ if ($notPresent) { Write-Host("Could not find RG. Please make sure that the resource group name: '" + $ResourceGroupName + "'is correct and you have access to the aks-engine cluster") -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } Write-Host("Successfully checked resource groups details...") -ForegroundColor Green @@ -179,20 +179,20 @@ foreach ($k8MasterVM in $k8sMasterVMsOrVMSSes) { } else { Write-Host("Resource group name: '" + $ResourceGroupName + "'is doesnt have the aks-engine resources") -ForegroundColor Red - exit + exit 1 } } if ($isKubernetesCluster -eq $false) { Write-Host("Resource group name: '" + $ResourceGroupName + "' doesnt have the aks-engine or acs-engine resources") -ForegroundColor Red - exit + exit 1 } # validate specified logAnalytics workspace exists or not $workspaceResource = Get-AzResource -ResourceId $LogAnalyticsWorkspaceResourceId if ($null -eq $workspaceResource) { Write-Host("Specified Log Analytics workspace ResourceId: '" + $LogAnalyticsWorkspaceResourceId + "' doesnt exist or don't have access to it") -ForegroundColor Red - exit + exit 1 } # @@ -202,11 +202,11 @@ foreach ($k8MasterVM in $k8sMasterVMsOrVMSSes) { $r = Get-AzResource -ResourceGroupName $ResourceGroupName -ResourceName $k8MasterVM.Name if ($null -eq $r) { Write-Host("Get-AzResource for Resource Group: " + 
$ResourceGroupName + "Resource Name :" + $k8MasterVM.Name + " failed" ) -ForegroundColor Red - exit + exit 1 } if ($null -eq $r.Tags) { Write-Host("K8s master VM should have the tags" ) -ForegroundColor Red - exit + exit 1 } if ($r.Tags.ContainsKey("logAnalyticsWorkspaceResourceId")) { $existingLogAnalyticsWorkspaceResourceId = $r.Tags["logAnalyticsWorkspaceResourceId"] @@ -225,7 +225,7 @@ foreach ($k8MasterVM in $k8sMasterVMsOrVMSSes) { $existingclusterName = $r.Tags["clusterName"] if ($existingclusterName -eq $ClusterName) { Write-Host("Ignoring attaching clusterName tag to K8s master VM :" + $k8MasterVM.Name + " since it has already with same tag value" ) -ForegroundColor Yellow - exit + exit 1 } Write-Host("K8s master VM :" + $k8MasterVM.Name + " has the existing tag for clusterName with different from specified one" ) -ForegroundColor Green $r.Tags.Remove("clusterName") diff --git a/scripts/onboarding/managed/disable-monitoring.ps1 b/scripts/onboarding/managed/disable-monitoring.ps1 index bcd135dba..8be60c50d 100644 --- a/scripts/onboarding/managed/disable-monitoring.ps1 +++ b/scripts/onboarding/managed/disable-monitoring.ps1 @@ -61,7 +61,7 @@ if ([string]::IsNullOrEmpty($azureCloudName) -eq $true) { } else { Write-Host("Specified Azure Cloud name is : $azureCloudName") Write-Host("Only supported Azure clouds are : AzureCloud and AzureUSGovernment") - exit + exit 1 } } @@ -89,7 +89,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule) -or ($null - else { Write-Host("Please re-launch the script with elevated administrator") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } @@ -116,7 +116,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule) -or ($null - } catch { Write-Host("Close other powershell logins and try installing the latest modules forAz.Accounts in a new powershell window: eg. 
'Install-Module Az.Accounts -Repository PSGallery -Force'") -ForegroundColor Red - exit + exit 1 } } @@ -127,7 +127,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule) -or ($null - } catch { Write-Host("Close other powershell logins and try installing the latest modules forAz.Accounts in a new powershell window: eg. 'Install-Module Az.Accounts -Repository PSGallery -Force'") -ForegroundColor Red - exit + exit 1 } } @@ -139,7 +139,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule) -or ($null - } catch { Write-Host("Close other powershell logins and try installing the latest modules for Az.OperationalInsights in a new powershell window: eg. 'Install-Module Az.OperationalInsights -Repository PSGallery -Force'") -ForegroundColor Red - exit + exit 1 } } @@ -154,7 +154,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule) -or ($null - Write-Host("Could not import Az.Resources...") -ForegroundColor Red Write-Host("Close other powershell logins and try installing the latest modules for Az.Resources in a new powershell window: eg. 'Install-Module Az.Resources -Repository PSGallery -Force'") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } if ($null -eq $azAccountModule) { @@ -165,7 +165,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule) -or ($null - Write-Host("Could not import Az.Accounts...") -ForegroundColor Red Write-Host("Close other powershell logins and try installing the latest modules for Az.Accounts in a new powershell window: eg. 'Install-Module Az.Accounts -Repository PSGallery -Force'") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } @@ -176,7 +176,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule) -or ($null - catch { Write-Host("Could not import Az.OperationalInsights... 
Please reinstall this Module") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } @@ -184,14 +184,14 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule) -or ($null - 2 { Write-Host("") Stop-Transcript - exit + exit 1 } } } if ([string]::IsNullOrEmpty($clusterResourceId)) { Write-Host("Specified Azure ClusterResourceId should not be NULL or empty") -ForegroundColor Red - exit + exit 1 } if ([string]::IsNullOrEmpty($kubeContext)) { @@ -211,7 +211,7 @@ if ($clusterResourceId.StartsWith("/") -eq $false) { if ($clusterResourceId.Split("/").Length -ne 9){ Write-Host("Provided Cluster Resource Id is not in expected format") -ForegroundColor Red - exit + exit 1 } if (($clusterResourceId.ToLower().Contains("microsoft.kubernetes/connectedclusters") -ne $true) -and @@ -219,7 +219,7 @@ if (($clusterResourceId.ToLower().Contains("microsoft.kubernetes/connectedcluste ($clusterResourceId.ToLower().Contains("microsoft.containerservice/managedclusters") -ne $true) ) { Write-Host("Provided cluster ResourceId is not supported cluster type: $clusterResourceId") -ForegroundColor Red - exit + exit 1 } if ($clusterResourceId.ToLower().Contains("microsoft.kubernetes/connectedclusters") -eq $true) { @@ -284,7 +284,7 @@ if ($null -eq $account.Account) { Write-Host("Could not select subscription with ID : " + $clusterSubscriptionId + ". Please make sure the ID you entered is correct and you have access to the cluster" ) -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } } else { @@ -304,7 +304,7 @@ else { Write-Host("Could not select subscription with ID : " + $clusterSubscriptionId + ". 
Please make sure the ID you entered is correct and you have access to the cluster" ) -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } } } @@ -314,7 +314,7 @@ Write-Host("Checking specified Azure Managed cluster resource exists and got acc $clusterResource = Get-AzResource -ResourceId $clusterResourceId if ($null -eq $clusterResource) { Write-Host("specified Azure Managed cluster resource id either you dont have access or doesnt exist") -ForegroundColor Red - exit + exit 1 } $clusterRegion = $clusterResource.Location.ToLower() @@ -323,7 +323,7 @@ if ($isArcK8sCluster -eq $true) { $clusterIdentity = $clusterResource.identity.type.ToString().ToLower() if ($clusterIdentity.Contains("systemassigned") -eq $false) { Write-Host("Identity of Azure Arc enabled Kubernetes cluster should be systemassigned but it has identity: $clusterIdentity") -ForegroundColor Red - exit + exit 1 } } @@ -345,7 +345,7 @@ try { $releases = helm list --filter $helmChartReleaseName if ($releases.Count -lt 2) { Write-Host("There is no existing release with name : $helmChartReleaseName") -ForegroundColor Yellow - exit + exit 1 } for($index =0 ; $index -lt $releases.Count ; $index ++ ) { @@ -360,7 +360,7 @@ try { $releases = helm list --filter $helmChartReleaseName --kube-context $kubeContext if ($releases.Count -lt 2) { Write-Host("There is no existing release with name : $helmChartReleaseName") -ForegroundColor Yellow - exit + exit 1 } for($index =0 ; $index -lt $releases.Count ; $index ++ ) { @@ -374,7 +374,7 @@ try { } catch { Write-Host ("Failed to delete Azure Monitor for containers HELM chart : '" + $Error[0] + "' ") -ForegroundColor Red - exit + exit 1 } Write-Host("Successfully disabled Azure Monitor for containers for cluster: $clusteResourceId") -ForegroundColor Green diff --git a/scripts/onboarding/managed/enable-monitoring.ps1 b/scripts/onboarding/managed/enable-monitoring.ps1 index e79ef2138..b72e10e02 100644 --- a/scripts/onboarding/managed/enable-monitoring.ps1 
+++ b/scripts/onboarding/managed/enable-monitoring.ps1 @@ -81,7 +81,7 @@ if ([string]::IsNullOrEmpty($azureCloudName) -eq $true) { } else { Write-Host("Specified Azure Cloud name is : $azureCloudName") Write-Host("Only supported azure clouds are : AzureCloud and AzureUSGovernment") - exit + exit 1 } } @@ -109,7 +109,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule) -or ($null - else { Write-Host("Please re-launch the script with elevated administrator") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } @@ -136,7 +136,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule) -or ($null - } catch { Write-Host("Close other powershell logins and try installing the latest modules forAz.Accounts in a new powershell window: eg. 'Install-Module Az.Accounts -Repository PSGallery -Force'") -ForegroundColor Red - exit + exit 1 } } @@ -147,7 +147,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule) -or ($null - } catch { Write-Host("Close other powershell logins and try installing the latest modules forAz.Accounts in a new powershell window: eg. 'Install-Module Az.Accounts -Repository PSGallery -Force'") -ForegroundColor Red - exit + exit 1 } } @@ -159,7 +159,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule) -or ($null - } catch { Write-Host("Close other powershell logins and try installing the latest modules for Az.OperationalInsights in a new powershell window: eg. 'Install-Module Az.OperationalInsights -Repository PSGallery -Force'") -ForegroundColor Red - exit + exit 1 } } @@ -174,7 +174,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule) -or ($null - Write-Host("Could not import Az.Resources...") -ForegroundColor Red Write-Host("Close other powershell logins and try installing the latest modules for Az.Resources in a new powershell window: eg. 
'Install-Module Az.Resources -Repository PSGallery -Force'") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } if ($null -eq $azAccountModule) { @@ -185,7 +185,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule) -or ($null - Write-Host("Could not import Az.Accounts...") -ForegroundColor Red Write-Host("Close other powershell logins and try installing the latest modules for Az.Accounts in a new powershell window: eg. 'Install-Module Az.Accounts -Repository PSGallery -Force'") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } @@ -196,7 +196,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule) -or ($null - catch { Write-Host("Could not import Az.OperationalInsights... Please reinstall this Module") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } @@ -204,14 +204,14 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule) -or ($null - 2 { Write-Host("") Stop-Transcript - exit + exit 1 } } } if ([string]::IsNullOrEmpty($clusterResourceId)) { Write-Host("Specified Azure Arc enabled Kubernetes ClusterResourceId should not be NULL or empty") -ForegroundColor Red - exit + exit 1 } if ([string]::IsNullOrEmpty($kubeContext)) { @@ -232,7 +232,7 @@ if ($clusterResourceId.StartsWith("/") -eq $false) { if ($clusterResourceId.Split("/").Length -ne 9) { Write-Host("Provided Cluster Resource Id is not in expected format") -ForegroundColor Red - exit + exit 1 } if (($clusterResourceId.ToLower().Contains("microsoft.kubernetes/connectedclusters") -ne $true) -and @@ -240,7 +240,14 @@ if (($clusterResourceId.ToLower().Contains("microsoft.kubernetes/connectedcluste ($clusterResourceId.ToLower().Contains("microsoft.containerservice/managedclusters") -ne $true) ) { Write-Host("Provided cluster ResourceId is not supported cluster type: $clusterResourceId") -ForegroundColor Red - exit + exit 1 +} + +if (([string]::IsNullOrEmpty($servicePrincipalClientId) -eq $false) -and + 
([string]::IsNullOrEmpty($servicePrincipalClientSecret) -eq $false) -and + ([string]::IsNullOrEmpty($tenantId) -eq $false)) { + Write-Host("Using service principal creds for the azure login since these provided.") + $isUsingServicePrincipal = $true } if (([string]::IsNullOrEmpty($servicePrincipalClientId) -eq $false) -and @@ -305,7 +312,7 @@ if ($null -eq $account.Account) { Write-Host("Could not select subscription with ID : " + $clusterSubscriptionId + ". Please make sure the ID you entered is correct and you have access to the cluster" ) -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } } else { @@ -325,7 +332,7 @@ else { Write-Host("Could not select subscription with ID : " + $clusterSubscriptionId + ". Please make sure the ID you entered is correct and you have access to the cluster" ) -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } } } @@ -335,7 +342,7 @@ Write-Host("Checking specified Azure Managed cluster resource exists and got acc $clusterResource = Get-AzResource -ResourceId $clusterResourceId if ($null -eq $clusterResource) { Write-Host("specified Azure Managed cluster resource id either you dont have access or doesnt exist") -ForegroundColor Red - exit + exit 1 } $clusterRegion = $clusterResource.Location.ToLower() @@ -344,7 +351,7 @@ if ($isArcK8sCluster -eq $true) { $clusterIdentity = $clusterResource.identity.type.ToString().ToLower() if ($clusterIdentity.contains("systemassigned") -eq $false) { Write-Host("Identity of Azure Arc enabled Kubernetes cluster should be systemassigned but it has identity: $clusterIdentity") -ForegroundColor Red - exit + exit 1 } } @@ -450,7 +457,7 @@ else { Write-Host("using specified Log Analytics Workspace ResourceId: '" + $workspaceResourceId + "' ") if ([string]::IsNullOrEmpty($workspaceResourceId)) { Write-Host("Specified workspaceResourceId should not be NULL or empty") -ForegroundColor Red - exit + exit 1 } $workspaceResourceId = $workspaceResourceId.Trim() if 
($workspaceResourceId.EndsWith("/")) { @@ -465,7 +472,7 @@ else { if (($workspaceResourceId.ToLower().Contains("microsoft.operationalinsights/workspaces") -ne $true) -or ($workspaceResourceId.Split("/").Length -ne 9)) { Write-Host("Provided workspace resource id should be in this format /subscriptions//resourceGroups//providers/Microsoft.OperationalInsights/workspaces/") -ForegroundColor Red - exit + exit 1 } $workspaceResourceParts = $workspaceResourceId.Split("/") @@ -482,7 +489,7 @@ else { $WorkspaceInformation = Get-AzOperationalInsightsWorkspace -ResourceGroupName $workspaceResourceGroup -Name $workspaceName -ErrorAction SilentlyContinue if ($null -eq $WorkspaceInformation) { Write-Host("Specified Log Analytics Workspace: '" + $workspaceName + "' in Resource Group: '" + $workspaceResourceGroup + "' in Subscription: '" + $workspaceSubscriptionId + "' does not exist") -ForegroundColor Red - exit + exit 1 } } @@ -520,7 +527,7 @@ try { } catch { Write-Host ("Failed to workspace details. Please validate whether you have Log Analytics Contributor role on the workspace error: '" + $Error[0] + "' ") -ForegroundColor Red - exit + exit 1 } diff --git a/scripts/onboarding/templates/arc-k8s-extension/existingClusterOnboarding.json b/scripts/onboarding/templates/arc-k8s-extension/existingClusterOnboarding.json index 95e7ba5d0..b2b61f4ab 100644 --- a/scripts/onboarding/templates/arc-k8s-extension/existingClusterOnboarding.json +++ b/scripts/onboarding/templates/arc-k8s-extension/existingClusterOnboarding.json @@ -13,7 +13,7 @@ "metadata": { "description": "Location of the Azure Arc Connected Cluster Resource e.g. 
\"eastus\"" } - }, + }, "workspaceResourceId": { "type": "string", "metadata": { @@ -83,7 +83,7 @@ "subscriptionId": "[split(parameters('clusterResourceId'),'/')[2]]", "resourceGroup": "[split(parameters('clusterResourceId'),'/')[4]]", "dependsOn": [ - "[Concat('ContainerInsights', '-', uniqueString(parameters('workspaceResourceId')))]" + "[Concat('ContainerInsights', '-', uniqueString(parameters('workspaceResourceId')))]" ], "properties": { "mode": "Incremental", @@ -95,7 +95,7 @@ "resources": [ { "type": "Microsoft.KubernetesConfiguration/extensions", - "apiVersion": "2020-07-01-preview", + "apiVersion": "2021-09-01", "name": "azuremonitor-containers", "location": "[parameters('clusterRegion')]", "identity": {"type": "systemassigned"}, @@ -107,7 +107,7 @@ }, "configurationProtectedSettings": { "omsagent.secret.wsid": "[reference(parameters('workspaceResourceId'), '2015-03-20').customerId]", - "omsagent.secret.key": "[listKeys(parameters('workspaceResourceId'), '2015-03-20').primarySharedKey]" + "omsagent.secret.key": "[listKeys(parameters('workspaceResourceId'), '2015-03-20').primarySharedKey]" }, "autoUpgradeMinorVersion": true, "releaseTrain": "Stable", diff --git a/scripts/troubleshoot/README.md b/scripts/troubleshoot/README.md index 5ffa07639..650a5df6f 100644 --- a/scripts/troubleshoot/README.md +++ b/scripts/troubleshoot/README.md @@ -1,5 +1,14 @@ # Troubleshoot Guide for Azure Monitor for containers +# Azure Arc-enabled Kubernetes +The table below summarizes known issues you may face while using Azure Monitor for containers . + +| Issues and Error Messages | Action | +| ---- | --- | +| Error Message `No data for selected filters` | It may take some time to establish monitoring data flow for newly created clusters. Please allow at least 10-15 minutes for data to appear for your cluster. 
| +| Error Message `Error retrieving data` | While Azure Arc-enabled Kubernetes cluster is setting up for health and performance monitoring, a connection is established between the cluster and Azure Log Analytics workspace. Log Analytics workspace is used to store all monitoring data for your cluster. This error may occur when your Log Analytics workspace has been deleted or lost. Please check whether your Log Analytics workspace is available. To find your Log Analytics workspace go [here.](https://docs.microsoft.com/en-us/azure/log-analytics/log-analytics-manage-access) and your workspace is available. If the workspace is missing, you will have to delete and create Microsoft.AzureMonitor.Containers extension https://docs.microsoft.com/en-us/azure/azure-monitor/containers/container-insights-enable-arc-enabled-clusters?toc=/azure/azure-arc/kubernetes/toc.json. | + + # Azure Kubernetes Service (AKS) The table below summarizes known issues you may face while using Azure Monitor for containers . @@ -67,5 +76,26 @@ Please send this file to [AskCoin](mailto:askcoin@microsoft.com). We will respon For more details on Azure Resource Manager template deployment via cli refer to [this documentation](https://docs.microsoft.com/en-us/azure/azure-resource-manager/resource-group-template-deploy-cli). If steps above did not help to resolve your issue, you can use either of the following methods to contact us for help: -* File a [GitHub Issue](https://github.com/Microsoft/OMS-docker/issues) +* File a [GitHub Issue](https://github.com/microsoft/Docker-Provider/issues) * Email [AskCoin](mailto:askcoin@microsoft.com) : Please attach the TroubleshootErrorDump.txt in the email generated by the troubleshooting script if you had tried running the script to solve your problem. 
+ +# Azure Arc-enabled Kubernetes + +You can use the troubleshooting script provided [here](https://raw.githubusercontent.com/microsoft/Docker-Provider/ci_dev/scripts/troubleshoot/troubleshooterrors.sh) to diagnose the problem. + +Steps: +- Before executing the Troubleshooting script, please install the following prerequisites if you don't have them already + - Install [Azure-CLI](https://docs.microsoft.com/en-us/cli/azure/install-azure-cli) + - Install [kubectl](https://kubernetes.io/docs/tasks/tools/#kubectl) + - Install [jq](https://stedolan.github.io/jq/download/) +- Download and execute the script + ``` bash + curl -LO https://raw.githubusercontent.com/microsoft/Docker-Provider/ci_dev/scripts/troubleshoot/troubleshooterrors.sh + bash troubleshooterrors.sh --resource-id --kube-context + ``` +- This script will generate a TroubleshootDump.log which collects detailed information about container health onboarding. +Please send this file to [AskCoin](mailto:askcoin@microsoft.com). We will respond back to you. + +If steps above did not help to resolve your issue, you can use either of the following methods to contact us for help: +* File a [GitHub Issue](https://github.com/microsoft/Docker-Provider/issues) +* Email [AskCoin](mailto:askcoin@microsoft.com) : Please attach the TroubleshootErrorDump.log in the email generated by the troubleshooting script if you had tried running the script to solve your problem. 
\ No newline at end of file diff --git a/scripts/troubleshoot/TroubleshootError.ps1 b/scripts/troubleshoot/TroubleshootError.ps1 index 4c2d95ac6..6d97c53d5 100644 --- a/scripts/troubleshoot/TroubleshootError.ps1 +++ b/scripts/troubleshoot/TroubleshootError.ps1 @@ -35,7 +35,7 @@ if (($null -eq $ClusterResourceId) -or ($ClusterResourceId.Split("/").Length -ne Write-Host("Resource Id Format for AKS cluster is : /subscriptions//resourceGroups//providers/Microsoft.ContainerService/managedClusters/") -ForegroundColor Red Write-Host("Resource Id Format for ARO cluster is : /subscriptions//resourceGroups//providers/Microsoft.ContainerService/openShiftManagedClusters/") -ForegroundColor Red Stop-Transcript - exit + exit 1 } $isClusterAndWorkspaceInDifferentSubs = $false @@ -70,7 +70,7 @@ if (($null -eq $azAksModule) -or ($null -eq $azARGModule) -or ($null -eq $azAcco else { Write-Host("Please re-launch the script with elevated administrator") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } @@ -97,7 +97,7 @@ if (($null -eq $azAksModule) -or ($null -eq $azARGModule) -or ($null -eq $azAcco catch { Write-Host("Close other powershell logins and try installing the latest modules for Az.ResourceGraph in a new powershell window: eg. 'Install-Module Az.ResourceGraph -Force'") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } if ($null -eq $azAksModule) { @@ -108,7 +108,7 @@ if (($null -eq $azAksModule) -or ($null -eq $azARGModule) -or ($null -eq $azAcco catch { Write-Host("Close other powershell logins and try installing the latest modules for Az.Aks in a new powershell window: eg. 'Install-Module Az.Aks -Force'") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } @@ -120,7 +120,7 @@ if (($null -eq $azAksModule) -or ($null -eq $azARGModule) -or ($null -eq $azAcco catch { Write-Host("Close other powershell logins and try installing the latest modules forAz.Accounts in a new powershell window: eg. 
'Install-Module Az.Accounts -Repository PSGallery -Force'") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } @@ -132,7 +132,7 @@ if (($null -eq $azAksModule) -or ($null -eq $azARGModule) -or ($null -eq $azAcco catch { Write-Host("Close other powershell logins and try installing the latest modules forAz.Accounts in a new powershell window: eg. 'Install-Module Az.Accounts -Repository PSGallery -Force'") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } @@ -145,7 +145,7 @@ if (($null -eq $azAksModule) -or ($null -eq $azARGModule) -or ($null -eq $azAcco catch { Write-Host("Close other powershell logins and try installing the latest modules for Az.OperationalInsights in a new powershell window: eg. 'Install-Module Az.OperationalInsights -Repository PSGallery -Force'") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } @@ -159,7 +159,7 @@ if (($null -eq $azAksModule) -or ($null -eq $azARGModule) -or ($null -eq $azAcco Write-Host("Could not Import Az.ResourceGraph...") -ForegroundColor Red Write-Host("Close other powershell logins and try installing the latest modules for Az.ResourceGraph in a new powershell window: eg. 'Install-Module Az.ResourceGraph -Force'") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } @@ -171,7 +171,7 @@ if (($null -eq $azAksModule) -or ($null -eq $azARGModule) -or ($null -eq $azAcco Write-Host("Could not Import Az.Aks...") -ForegroundColor Red Write-Host("Close other powershell logins and try installing the latest modules for Az.Aks in a new powershell window: eg. 'Install-Module Az.Aks -Force'") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } @@ -183,7 +183,7 @@ if (($null -eq $azAksModule) -or ($null -eq $azARGModule) -or ($null -eq $azAcco Write-Host("Could not import Az.Resources...") -ForegroundColor Red Write-Host("Close other powershell logins and try installing the latest modules for Az.Resources in a new powershell window: eg. 
'Install-Module Az.Resources -Repository PSGallery -Force'") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } if ($null -eq $azAccountModule) { @@ -194,7 +194,7 @@ if (($null -eq $azAksModule) -or ($null -eq $azARGModule) -or ($null -eq $azAcco Write-Host("Could not import Az.Accounts...") -ForegroundColor Red Write-Host("Close other powershell logins and try installing the latest modules for Az.Accounts in a new powershell window: eg. 'Install-Module Az.Accounts -Repository PSGallery -Force'") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } @@ -205,7 +205,7 @@ if (($null -eq $azAksModule) -or ($null -eq $azARGModule) -or ($null -eq $azAcco catch { Write-Host("Could not import Az.OperationalInsights... Please reinstall this Module") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } @@ -213,7 +213,7 @@ if (($null -eq $azAksModule) -or ($null -eq $azARGModule) -or ($null -eq $azAcco 2 { Write-Host("") Stop-Transcript - exit + exit 1 } } } @@ -277,7 +277,7 @@ if ($null -eq $account.Account) { Write-Host("Could not select subscription with ID : " + $ClusterSubscriptionId + ". Please make sure the SubscriptionId you entered is correct and you have access to the Subscription" ) -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } } else { @@ -297,7 +297,7 @@ else { Write-Host("Could not select subscription with ID : " + $ClusterSubscriptionId + ". Please make sure the SubscriptionId you entered is correct and you have access to the Subscription" ) -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } } } @@ -313,7 +313,7 @@ if ($notPresent) { Write-Host("Could not find RG. 
Please make sure that the resource group name: '" + $ResourceGroupName + "'is correct and you have access to the Resource Group") -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } Write-Host("Successfully checked resource groups details...") -ForegroundColor Green @@ -327,7 +327,7 @@ try { Write-Host("Could not fetch cluster details: Please make sure that the '" + $ClusterType + "' Cluster name: '" + $ClusterName + "' is correct and you have access to the cluster") -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } else { Write-Host("Successfully checked '" + $ClusterType + "' Cluster details...") -ForegroundColor Green @@ -342,7 +342,7 @@ try { Write-Host($AksOptInLink) -ForegroundColor Red; Write-Host(""); Stop-Transcript - exit + exit 1 } $omsagentconfig = $props.addonprofiles.omsagent.config; @@ -364,7 +364,7 @@ try { Write-Host("Could not fetch cluster details: Please make sure that the '" + $ClusterType + "' Cluster name: '" + $ClusterName + "' is correct and you have access to the cluster") -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } $monitorProfile = $ResourceDetail.aroproperties.monitorprofile @@ -373,7 +373,7 @@ try { Write-Host($AksOptInLink) -ForegroundColor Red; Write-Host(""); Stop-Transcript - exit + exit 1 } $LogAnalyticsWorkspaceResourceID = $monitorProfile.workspaceresourceid @@ -385,7 +385,7 @@ catch { Write-Host("Could not fetch cluster details: Please make sure that the '" + $ClusterType + "' Cluster name: '" + $ClusterName + "' is correct and you have access to the cluster") -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } @@ -511,7 +511,7 @@ if ($null -eq $LogAnalyticsWorkspaceResourceID) { } Write-Host("") Stop-Transcript - exit + exit 1 } else { @@ -532,7 +532,7 @@ else { Write-Host("Could not change to Workspace subscriptionId : '" + $workspaceSubscriptionId + "'." 
) -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } @@ -557,7 +557,7 @@ else { } Write-Host("") Stop-Transcript - exit + exit 1 } Write-Host("Successfully fetched workspace subcription details...") -ForegroundColor Green Write-Host("") @@ -581,7 +581,7 @@ else { Write-Host("Opt-in - " + $AksOptInLink) -ForegroundColor Red } Stop-Transcript - exit + exit 1 } Write-Host("Successfully fetched workspace resource group...") -ForegroundColor Green Write-Host("") @@ -610,7 +610,7 @@ else { } Write-Host("") Stop-Transcript - exit + exit 1 } $WorkspaceLocation = $WorkspaceInformation.Location @@ -619,7 +619,7 @@ else { Write-Host("Cannot fetch workspace location. Please try again...") -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } $WorkspacePricingTier = $WorkspaceInformation.sku @@ -635,7 +635,7 @@ else { Write-Host("Failed to get the list of solutions onboarded to the workspace. Please make sure that it hasn't been deleted and you have access to it.") -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } try { @@ -647,7 +647,7 @@ else { Write-Host("Failed to get ContainerInsights solution details from the workspace") -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } $isSolutionOnboarded = $WorkspaceIPDetails.Enabled[$ContainerInsightsIndex] @@ -711,7 +711,7 @@ try { if ($WorkspaceUsage.CurrentValue -ge $WorkspaceUsage.Limit) { Write-Host("Workspace usage has reached or over the configured daily cap. 
Please increase the daily cap limits or wait for next reset interval") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } Write-Host("Workspace doesnt have daily cap configured") -ForegroundColor Green @@ -720,7 +720,7 @@ catch { Write-Host("Failed to get usage details of the workspace") -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } @@ -757,7 +757,7 @@ if ("AKS" -eq $ClusterType ) { Write-Host($AksOptInLink) -ForegroundColor Red Write-Host($contactUSMessage) Stop-Transcript - exit + exit 1 } $rsPodStatus = $rsPod.status @@ -778,7 +778,7 @@ if ("AKS" -eq $ClusterType ) { Write-Host($AksOptInLink) -ForegroundColor Red Write-Host($contactUSMessage) Stop-Transcript - exit + exit 1 } Write-Host( "omsagent replicaset pod running OK.") -ForegroundColor Green @@ -786,7 +786,7 @@ if ("AKS" -eq $ClusterType ) { catch { Write-Host ("Failed to get omsagent replicatset pod info using kubectl get rs : '" + $Error[0] + "' ") -ForegroundColor Red Stop-Transcript - exit + exit 1 } Write-Host("Checking whether the omsagent daemonset pod running correctly ...") @@ -795,7 +795,7 @@ if ("AKS" -eq $ClusterType ) { if (($null -eq $ds) -or ($null -eq $ds.Items) -or ($ds.Items.Length -ne 1)) { Write-Host( "omsagent replicaset pod not scheduled or failed to schedule." 
+ $contactUSMessage) Stop-Transcript - exit + exit 1 } $dsStatus = $ds.Items[0].status @@ -809,7 +809,7 @@ if ("AKS" -eq $ClusterType ) { Write-Host($dsStatus) Write-Host($contactUSMessage) Stop-Transcript - exit + exit 1 } Write-Host( "omsagent daemonset pod running OK.") -ForegroundColor Green @@ -817,7 +817,7 @@ if ("AKS" -eq $ClusterType ) { catch { Write-Host ("Failed to execute the script : '" + $Error[0] + "' ") -ForegroundColor Red Stop-Transcript - exit + exit 1 } Write-Host("Checking whether the omsagent heatlhservice running correctly ...") @@ -826,7 +826,7 @@ if ("AKS" -eq $ClusterType ) { if ($healthservice.Items.Length -ne 1) { Write-Host( "omsagent healthservice not scheduled or failed to schedule." + $contactUSMessage) Stop-Transcript - exit + exit 1 } Write-Host( "omsagent healthservice running OK.") -ForegroundColor Green @@ -834,7 +834,7 @@ if ("AKS" -eq $ClusterType ) { catch { Write-Host ("Failed to execute kubectl get services command : '" + $Error[0] + "' ") -ForegroundColor Red Stop-Transcript - exit + exit 1 } if ($isClusterAndWorkspaceInDifferentSubs) { @@ -851,7 +851,7 @@ if ("AKS" -eq $ClusterType ) { catch { Write-Host ("Failed to get workspace details. Please validate whether you have Log Analytics Contributor role on the workspace error: '" + $Error[0] + "' ") -ForegroundColor Red Stop-Transcript - exit + exit 1 } Write-Host("Checking whether the WorkspaceGuid and key matching with configured log analytics workspace ...") @@ -862,7 +862,7 @@ if ("AKS" -eq $ClusterType ) { if ((($workspaceGuidConfiguredOnAgent -eq $workspaceGUID) -and ($workspaceKeyConfiguredOnAgent -eq $workspacePrimarySharedKey)) -eq $false) { Write-Host ("Error - Log Analytics Workspace Guid and key configured on the agent not matching with details of the Workspace. 
Please verify and fix with the correct workspace Guid and Key") -ForegroundColor Red Stop-Transcript - exit + exit 1 } Write-Host("Workspace Guid and Key on the agent matching with the Workspace") -ForegroundColor Green @@ -870,7 +870,7 @@ if ("AKS" -eq $ClusterType ) { catch { Write-Host ("Failed to execute the script : '" + $Error[0] + "' ") -ForegroundColor Red Stop-Transcript - exit + exit 1 } Write-Host("Checking agent version...") @@ -885,7 +885,7 @@ if ("AKS" -eq $ClusterType ) { } catch { Write-Host ("Failed to execute the script : '" + $Error[0] + "' ") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } diff --git a/scripts/troubleshoot/TroubleshootError_AcsEngine.ps1 b/scripts/troubleshoot/TroubleshootError_AcsEngine.ps1 index 1f1e1ba5d..5662d3f79 100644 --- a/scripts/troubleshoot/TroubleshootError_AcsEngine.ps1 +++ b/scripts/troubleshoot/TroubleshootError_AcsEngine.ps1 @@ -45,7 +45,7 @@ if (($null -eq $azureRmProfileModule) -or ($null -eq $azureRmResourcesModule) -o else { Write-Host("Please run the script as an administrator") -ForegroundColor Red Stop-Transcript - exit + exit 1 } $message = "This script will try to install the latest versions of the following Modules : ` @@ -69,7 +69,7 @@ if (($null -eq $azureRmProfileModule) -or ($null -eq $azureRmResourcesModule) -o } catch { Write-Host("Close other powershell logins and try installing the latest modules for AzureRM.profile in a new powershell window: eg. 'Install-Module AzureRM.profile -Repository PSGallery -Force'") -ForegroundColor Red - exit + exit 1 } try { Write-Host("Installing AzureRM.Resources...") @@ -77,7 +77,7 @@ if (($null -eq $azureRmProfileModule) -or ($null -eq $azureRmResourcesModule) -o } catch { Write-Host("Close other powershell logins and try installing the latest modules for AzureRM.Resoureces in a new powershell window: eg. 
'Install-Module AzureRM.Resoureces -Repository PSGallery -Force'") -ForegroundColor Red - exit + exit 1 } try { @@ -86,7 +86,7 @@ if (($null -eq $azureRmProfileModule) -or ($null -eq $azureRmResourcesModule) -o } catch { Write-Host("Close other powershell logins and try installing the latest modules for AzureRM.OperationalInsights in a new powershell window: eg. 'Install-Module AzureRM.OperationalInsights -Repository PSGallery -Force'") -ForegroundColor Red - exit + exit 1 } } 1 { @@ -97,7 +97,7 @@ if (($null -eq $azureRmProfileModule) -or ($null -eq $azureRmResourcesModule) -o Write-Host("Could not import AzureRM.profile...") -ForegroundColor Red Write-Host("Close other powershell logins and try installing the latest modules for AzureRM.profile in a new powershell window: eg. 'Install-Module AzureRM.profile -Repository PSGallery -Force'") -ForegroundColor Red Stop-Transcript - exit + exit 1 } try { Import-Module AzureRM.Resources @@ -105,7 +105,7 @@ if (($null -eq $azureRmProfileModule) -or ($null -eq $azureRmResourcesModule) -o catch { Write-Host("Could not import AzureRM.Resources... Please reinstall this Module") -ForegroundColor Red Stop-Transcript - exit + exit 1 } try { Import-Module AzureRM.OperationalInsights @@ -113,7 +113,7 @@ if (($null -eq $azureRmProfileModule) -or ($null -eq $azureRmResourcesModule) -o catch { Write-Host("Could not import AzureRM.OperationalInsights... Please reinstall this Module") -ForegroundColor Red Stop-Transcript - exit + exit 1 } Write-Host("Running troubleshooting script... Please reinstall this Module") Write-Host("") @@ -121,7 +121,7 @@ if (($null -eq $azureRmProfileModule) -or ($null -eq $azureRmResourcesModule) -o 2 { Write-Host("") Stop-Transcript - exit + exit 1 } } } @@ -151,7 +151,7 @@ if ($null -eq $account.Account) { Write-Host("Could not select subscription with ID : " + $SubscriptionId + ". 
Please make sure the SubscriptionId you entered is correct and you have access to the Subscription" ) -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } } else { @@ -171,7 +171,7 @@ else { Write-Host("Could not select subscription with ID : " + $SubscriptionId + ". Please make sure the SubscriptionId you entered is correct and you have access to the Subscription" ) -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } } } @@ -187,7 +187,7 @@ if ($notPresent) { Write-Host("Could not find RG. Please make sure that the resource group name: '" + $ResourceGroupName + "'is correct and you have access to the Resource Group") -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } Write-Host("Successfully checked resource groups details...") -ForegroundColor Green @@ -197,13 +197,13 @@ Write-Host("Successfully checked resource groups details...") -ForegroundColor G if ([string]::IsNullOrEmpty($KubeConfig)) { Write-Host("KubeConfig should not be NULL or empty") -ForegroundColor Red Stop-Transcript - exit + exit 1 } if ((Test-Path $KubeConfig -PathType Leaf) -ne $true) { Write-Host("provided KubeConfig path : '" + $KubeConfig + "' doesnt exist or you dont have read access") -ForegroundColor Red Stop-Transcript - exit + exit 1 } # @@ -249,13 +249,13 @@ foreach ($k8MasterVM in $k8sMasterVMsOrVMSSes) { } else { Write-Host("This Resource group : '" + $ResourceGroupName + "'does not have the AKS-engine or ACS-Engine Kubernetes resources") -ForegroundColor Red - exit + exit 1 } } if ($isKubernetesCluster -eq $false) { Write-Host("Monitoring only supported for AKS-Engine or ACS-Engine with Kubernetes") -ForegroundColor Red - exit + exit 1 } Write-Host("Successfully checked the AKS-Engine or ACS-Engine Kuberentes cluster resources in specified resource group") -ForegroundColor Green @@ -270,7 +270,7 @@ foreach ($k8MasterVM in $k8sMasterVMsOrVMSSes) { if ($null -eq $r) { Write-Host("Get-AzureRmResource for Resource Group: " + 
$ResourceGroupName + "Resource Name :" + $k8MasterVM.Name + " failed" ) -ForegroundColor Red - exit + exit 1 } if ($null -eq $r.Tags) { @@ -279,7 +279,7 @@ foreach ($k8MasterVM in $k8sMasterVMsOrVMSSes) { Write-Host("Please try to opt out of monitoring and opt-in using the following links:") -ForegroundColor Red Write-Host("Opt-out - " + $OptOutLink) -ForegroundColor Red Write-Host("Opt-in - " + $OptInLink) -ForegroundColor Red - exit + exit 1 } if ($r.Tags.ContainsKey("logAnalyticsWorkspaceResourceId")) { @@ -300,7 +300,7 @@ if ($null -eq $LogAnalyticsWorkspaceResourceID) { Write-Host("There is no existing logAnalyticsWorkspaceResourceId tag on AKS-Engine k8 master nodes or VMSSes so this indicates this cluster not enabled monitoring or tags have been removed" ) -ForegroundColor Red Write-Host("Please try to opt-in for monitoring using the following links:") -ForegroundColor Red Write-Host("Opt-in - " + $OptInLink) -ForegroundColor Red - exit + exit 1 } else { @@ -309,7 +309,7 @@ else { Write-Host("Please add the clusterName tag with the value of clusterName used during the omsagent agent onboarding. Refer below link for details:") -ForegroundColor Red Write-Host("Opt-in - " + $OptInLink) -ForegroundColor Red - exit + exit 1 } Write-Host("Configured LogAnalyticsWorkspaceResourceId: : '" + $LogAnalyticsWorkspaceResourceID + "' ") @@ -328,7 +328,7 @@ else { Write-Host("Could not change to Workspace subscriptionId : '" + $workspaceSubscriptionId + "'." 
) -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } @@ -347,7 +347,7 @@ else { Write-Host("Opt-in - " + $OptInLink) -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } Write-Host("Successfully fetched workspace subcription details...") -ForegroundColor Green Write-Host("") @@ -364,7 +364,7 @@ else { Write-Host("Opt-out - " + $OptOutLink) -ForegroundColor Red Write-Host("Opt-in - " + $OptInLink) -ForegroundColor Red Stop-Transcript - exit + exit 1 } Write-Host("Successfully fetched workspace resource group...") -ForegroundColor Green Write-Host("") @@ -386,7 +386,7 @@ else { Write-Host("Opt-in - " + $OptInLink) -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } $WorkspaceLocation = $WorkspaceInformation.Location @@ -396,7 +396,7 @@ else { Write-Host("Cannot fetch workspace location. Please try again...") -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } $WorkspacePricingTier = $WorkspaceInformation.sku @@ -413,7 +413,7 @@ else { Write-Host("Failed to get the list of solutions onboarded to the workspace. 
Please make sure that it hasn't been deleted and you have access to it.") -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } try { @@ -425,7 +425,7 @@ else { Write-Host("Failed to get ContainerInsights solution details from the workspace") -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } $isSolutionOnboarded = $WorkspaceIPDetails.Enabled[$ContainerInsightsIndex] @@ -498,7 +498,7 @@ try { } catch { Write-Host ("Failed to execute the script : '" + $Error[0] + "' ") -ForegroundColor Red Stop-Transcript - exit + exit 1 } Write-Host("") diff --git a/scripts/troubleshoot/TroubleshootError_nonAzureK8s.ps1 b/scripts/troubleshoot/TroubleshootError_nonAzureK8s.ps1 index 14b080b23..76bbad16c 100644 --- a/scripts/troubleshoot/TroubleshootError_nonAzureK8s.ps1 +++ b/scripts/troubleshoot/TroubleshootError_nonAzureK8s.ps1 @@ -47,25 +47,25 @@ Write-Host("LogAnalyticsWorkspaceResourceId: : '" + $azureLogAnalyticsWorkspaceR if (($azureLogAnalyticsWorkspaceResourceId.ToLower().Contains("microsoft.operationalinsights/workspaces") -ne $true) -or ($azureLogAnalyticsWorkspaceResourceId.Split("/").Length -ne 9)) { Write-Host("Provided Azure Log Analytics resource id should be in this format /subscriptions//resourceGroups//providers/Microsoft.OperationalInsights/workspaces/") -ForegroundColor Red Stop-Transcript - exit + exit 1 } if ([string]::IsNullOrEmpty($kubeConfig)) { Write-Host("kubeConfig should not be NULL or empty") -ForegroundColor Red Stop-Transcript - exit + exit 1 } if ((Test-Path $kubeConfig -PathType Leaf) -ne $true) { Write-Host("provided kubeConfig path : '" + $kubeConfig + "' doesnt exist or you dont have read access") -ForegroundColor Red Stop-Transcript - exit + exit 1 } if ([string]::IsNullOrEmpty($clusterContextInKubeconfig)) { Write-Host("provide clusterContext should be valid context in the provided kubeconfig") -ForegroundColor Red Stop-Transcript - exit + exit 1 } # checks the all required Powershell modules exist and if not 
exists, request the user permission to install @@ -92,7 +92,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule) -or ($null - else { Write-Host("Please re-launch the script with elevated administrator") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } @@ -120,7 +120,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule) -or ($null - catch { Write-Host("Close other powershell logins and try installing the latest modules forAz.Accounts in a new powershell window: eg. 'Install-Module Az.Accounts -Repository PSGallery -Force'") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } @@ -132,7 +132,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule) -or ($null - catch { Write-Host("Close other powershell logins and try installing the latest modules forAz.Accounts in a new powershell window: eg. 'Install-Module Az.Accounts -Repository PSGallery -Force'") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } @@ -145,7 +145,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule) -or ($null - catch { Write-Host("Close other powershell logins and try installing the latest modules for Az.OperationalInsights in a new powershell window: eg. 'Install-Module Az.OperationalInsights -Repository PSGallery -Force'") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } @@ -160,7 +160,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule) -or ($null - Write-Host("Could not import Az.Resources...") -ForegroundColor Red Write-Host("Close other powershell logins and try installing the latest modules for Az.Resources in a new powershell window: eg. 
'Install-Module Az.Resources -Repository PSGallery -Force'") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } if ($null -eq $azAccountModule) { @@ -171,7 +171,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule) -or ($null - Write-Host("Could not import Az.Accounts...") -ForegroundColor Red Write-Host("Close other powershell logins and try installing the latest modules for Az.Accounts in a new powershell window: eg. 'Install-Module Az.Accounts -Repository PSGallery -Force'") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } @@ -182,7 +182,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule) -or ($null - catch { Write-Host("Could not import Az.OperationalInsights... Please reinstall this Module") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } @@ -190,7 +190,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule) -or ($null - 2 { Write-Host("") Stop-Transcript - exit + exit 1 } } } @@ -222,7 +222,7 @@ if ($null -eq $account.Account) { Write-Host("Could not select subscription with ID : " + $workspaceSubscriptionId + ". Please make sure the SubscriptionId you entered is correct and you have access to the cluster" ) -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } } else { @@ -242,7 +242,7 @@ else { Write-Host("Could not select subscription with ID : " + $workspaceSubscriptionId + ". Please make sure the ID you entered is correct and you have access to the cluster" ) -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } } } @@ -253,7 +253,7 @@ $workspaceResource = Get-AzResource -ResourceId $azureLogAnalyticsWorkspaceResou if ($null -eq $workspaceResource) { Write-Host("specified Azure Log Analytics resource id: " + $azureLogAnalyticsWorkspaceResourceId + ". 
either you dont have access or doesnt exist") -ForegroundColor Red Stop-Transcript - exit + exit 1 } # @@ -272,7 +272,7 @@ catch { Write-Host("Opt-in - " + $OptInLink) -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } $WorkspaceLocation = $WorkspaceInformation.Location @@ -281,7 +281,7 @@ if ($null -eq $WorkspaceLocation) { Write-Host("Cannot fetch workspace location. Please try again...") -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } $WorkspacePricingTier = $WorkspaceInformation.sku @@ -297,7 +297,7 @@ catch { Write-Host("Failed to get the list of solutions onboarded to the workspace. Please make sure that it hasn't been deleted and you have access to it.") -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } try { @@ -309,7 +309,7 @@ catch { Write-Host("Failed to get ContainerInsights solution details from the workspace") -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } $isSolutionOnboarded = $WorkspaceIPDetails.Enabled[$ContainerInsightsIndex] @@ -317,7 +317,7 @@ if ($isSolutionOnboarded) { if ($WorkspacePricingTier -eq "Free") { Write-Host("Pricing tier of the configured LogAnalytics workspace is Free so you may need to upgrade to pricing tier to non-Free") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } else { @@ -356,13 +356,13 @@ else { Write-Host ("Template deployment failed with an error: '" + $Error[0] + "' ") -ForegroundColor Red Write-Host($contactUSMessage) -ForegroundColor Red Stop-Transcript - exit + exit 1 } } else { Write-Host("The container health solution isn't onboarded to your cluster. This required for the monitoring to work.") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } @@ -382,7 +382,7 @@ try { if ($null -eq $rsPod) { Write-Host( "omsagent replicaset pod not scheduled or failed to scheduled." 
+ $contactUSMessage) -ForegroundColor Red Stop-Transcript - exit + exit 1 } $rsPodStatus = $rsPod.status if ((($rsPodStatus.availableReplicas -eq 1) -and @@ -393,7 +393,7 @@ try { Write-Host($rsPodStatus) Write-Host($contactUSMessage) Stop-Transcript - exit + exit 1 } Write-Host( "omsagent replicaset pod running OK.") -ForegroundColor Green @@ -401,7 +401,7 @@ try { catch { Write-Host ("Failed to get omsagent replicatset pod info using kubectl get rs : '" + $Error[0] + "' ") -ForegroundColor Red Stop-Transcript - exit + exit 1 } Write-Host("Checking whether the omsagent daemonset pod running correctly ...") @@ -410,7 +410,7 @@ try { if ($ds.Items.Length -ne 1) { Write-Host( "omsagent replicaset pod not scheduled or failed to schedule." + $contactUSMessage) -ForegroundColor Red Stop-Transcript - exit + exit 1 } $dsStatus = $ds.Items[0].status @@ -424,7 +424,7 @@ try { Write-Host($rsPodStatus) Write-Host($contactUSMessage) Stop-Transcript - exit + exit 1 } Write-Host( "omsagent daemonset pod running OK.") -ForegroundColor Green @@ -432,7 +432,7 @@ try { catch { Write-Host ("Failed to execute the script : '" + $Error[0] + "' ") -ForegroundColor Red Stop-Transcript - exit + exit 1 } Write-Host("Checking whether the omsagent heatlhservice running correctly ...") @@ -441,7 +441,7 @@ try { if ($healthservice.Items.Length -ne 1) { Write-Host( "omsagent healthservice not scheduled or failed to schedule." + $contactUSMessage) Stop-Transcript - exit + exit 1 } Write-Host( "omsagent healthservice pod running OK.") -ForegroundColor Green @@ -449,7 +449,7 @@ try { catch { Write-Host ("Failed to execute kubectl get services command : '" + $Error[0] + "' ") -ForegroundColor Red Stop-Transcript - exit + exit 1 } Write-Host("Retrieving WorkspaceGUID and WorkspacePrimaryKey of the workspace : " + $WorkspaceInformation.Name) @@ -462,7 +462,7 @@ try { catch { Write-Host ("Failed to workspace details. 
Please validate whether you have Log Analytics Contributor role on the workspace error: '" + $Error[0] + "' ") -ForegroundColor Red Stop-Transcript - exit + exit 1 } Write-Host("Checking whether the WorkspaceGuid and key matching with configured log analytics workspace ...") @@ -473,7 +473,7 @@ try { if ((($workspaceGuidConfiguredOnAgent -eq $workspaceGUID) -and ($workspaceKeyConfiguredOnAgent -eq $workspacePrimarySharedKey)) -eq $false) { Write-Host ("Error - Log Analytics Workspace Guid and key configured on the agent not matching with details of the Workspace. Please verify and fix with the correct workspace Guid and Key") -ForegroundColor Red Stop-Transcript - exit + exit 1 } Write-Host("Workspace Guid and Key on the agent matching with the Workspace") -ForegroundColor Green @@ -481,7 +481,7 @@ try { catch { Write-Host ("Failed to execute the script : '" + $Error[0] + "' ") -ForegroundColor Red Stop-Transcript - exit + exit 1 } Write-Host("Checking agent version...") @@ -497,7 +497,7 @@ try { catch { Write-Host ("Failed to execute the script : '" + $Error[0] + "' ") -ForegroundColor Red Stop-Transcript - exit + exit 1 } Write-Host("resetting cluster context back, what it was before") diff --git a/scripts/troubleshoot/troubleshooterrors.sh b/scripts/troubleshoot/troubleshooterrors.sh new file mode 100644 index 000000000..ac08d7afc --- /dev/null +++ b/scripts/troubleshoot/troubleshooterrors.sh @@ -0,0 +1,485 @@ +#!/bin/bash +# +# This script troubleshoots errors related to onboarding of Azure Monitor for containers to Kubernetes cluster hosted outside and connected to Azure via Azure Arc cluster +# Prerequisites : +# Azure CLI: https://docs.microsoft.com/en-us/cli/azure/install-azure-cli?view=azure-cli-latest + +# bash troubelshooterror.sh --resource-id --kube-context + +set -e +set -o pipefail + +logFile="TroubleshootDump.log" +clusterType="connectedClusters" +extensionInstanceName="azuremonitor-containers" +# resource type for azure log analytics workspace 
+workspaceResourceProvider="Microsoft.OperationalInsights/workspaces" +workspaceSolutionResourceProvider="Microsoft.OperationsManagement/solutions" +agentK8sNamespace="kube-system" +azureArcK8sNamespace="azure-arc" +agentK8sSecretName="omsagent-secret" +agentK8sDeploymentName="omsagent-rs" +agentK8sLinuxDaemonsetName="omsagent" +agentArcK8sIdentityCRDName="container-insights-clusteridentityrequest" +workspaceId="" +workspacePrimarySharedKey="" +contactUSMessage="Please contact us by emailing askcoin@microsoft.com if you need any help with TroubleshootDump.log generated by this script" +dataCapHelpMessage="Please review and increase data cap https://docs.microsoft.com/en-us/azure/azure-monitor/logs/manage-cost-storage" +workspacePrivateLinkMessage="Please review this doc https://docs.microsoft.com/en-us/azure/azure-monitor/logs/private-link-security" +azureCLIInstallLinkMessage="Please install Azure-CLI as per the instructions https://docs.microsoft.com/en-us/cli/azure/install-azure-cli and rerun the troubleshooting script" +kubectlInstallLinkMessage="Please install kubectl as per the instructions https://kubernetes.io/docs/tasks/tools/#kubectl and rerun the troubleshooting script" +jqInstallLinkMessage="Please install jq as per instructions https://stedolan.github.io/jq/download/ and rerun the troubleshooting script" +ciExtensionReOnboarding="Please reinstall extension as per instructions https://docs.microsoft.com/en-us/azure/azure-monitor/containers/container-insights-enable-arc-enabled-clusters?toc=/azure/azure-arc/kubernetes/toc.json" +timesyncHelpMessage="Please check if you have any timesync issues on your cluster nodes" + +log_message() { + echo "$@" + echo "" + echo "$@" >> $logFile +} + + +login_to_azure() { + if [ "$isUsingServicePrincipal" = true ]; then + log_message "login to the azure using provided service principal creds" + az login --service-principal --username="$servicePrincipalClientId" --password="$servicePrincipalClientSecret" 
--tenant="$servicePrincipalTenantId" + else + log_message "login to the azure interactively" + az login --use-device-code + fi +} + +set_azure_subscription() { + local subscriptionId="$(echo ${1})" + log_message "setting the subscription id: ${subscriptionId} as current subscription for the azure cli" + az account set -s ${subscriptionId} + log_message "successfully configured subscription id: ${subscriptionId} as current subscription for the azure cli" +} + +usage() { + local basename=$(basename $0) + echo + echo "Troubleshooting Errors related to Azure Monitor for containers:" + echo "$basename --resource-id [--kube-context ]" +} + +parse_args() { + + if [ $# -le 1 ]; then + usage + exit 1 + fi + + # Transform long options to short ones + for arg in "$@"; do + shift + case "$arg" in + "--resource-id") set -- "$@" "-r" ;; + "--kube-context") set -- "$@" "-k" ;; + "--"*) usage ;; + *) set -- "$@" "$arg" ;; + esac + done + + local OPTIND opt + + while getopts 'hk:r:' opt; do + case "$opt" in + h) + usage + ;; + + k) + kubeconfigContext="$OPTARG" + log_message "name of kube-context is $OPTARG" + ;; + + r) + clusterResourceId="$OPTARG" + log_message "clusterResourceId is $OPTARG" + ;; + + ?) 
+ usage + exit 1 + ;; + esac + done + shift "$(($OPTIND - 1))" + + local subscriptionId="$(echo ${clusterResourceId} | cut -d'/' -f3)" + local resourceGroup="$(echo ${clusterResourceId} | cut -d'/' -f5)" + + # get resource parts and join back to get the provider name + local providerNameResourcePart1="$(echo ${clusterResourceId} | cut -d'/' -f7)" + local providerNameResourcePart2="$(echo ${clusterResourceId} | cut -d'/' -f8)" + local providerName="$(echo ${providerNameResourcePart1}/${providerNameResourcePart2})" + + local clusterName="$(echo ${clusterResourceId} | cut -d'/' -f9)" + + # convert to lowercase for validation + providerName=$(echo $providerName | tr "[:upper:]" "[:lower:]") + + log_message "cluster SubscriptionId:" $subscriptionId + log_message "cluster ResourceGroup:" $resourceGroup + log_message "cluster ProviderName:" $providerName + log_message "cluster Name:" $clusterName + + if [ -z "$subscriptionId" -o -z "$resourceGroup" -o -z "$providerName" -o -z "$clusterName" ]; then + log_message "-e invalid cluster resource id. Please try with valid fully qualified resource id of the cluster" + exit 1 + fi + + if [[ $providerName != microsoft.* ]]; then + log_message "-e invalid azure cluster resource id format." + exit 1 + fi + + # detect the resource provider from the provider name in the cluster resource id + if [ $providerName = "microsoft.kubernetes/connectedclusters" ]; then + log_message "provider cluster resource is of Azure Arc enabled Kubernetes cluster type" + isArcK8sCluster=true + resourceProvider=$arcK8sResourceProvider + else + log_message "-e not valid azure arc enabled kubernetes cluster resource id" + exit 1 + fi + + if [ -z "$kubeconfigContext" ]; then + log_message "using or getting current kube config context since --kube-context parameter not set " + fi + + if [ ! -z "$servicePrincipalClientId" -a ! -z "$servicePrincipalClientSecret" -a ! 
-z "$servicePrincipalTenantId" ]; then + log_message "using service principal creds (clientId, secret and tenantId) for azure login since provided" + isUsingServicePrincipal=true + fi +} + +command_exists() { + command -v "$@" > /dev/null 2>&1 +} + +validate_ci_extension() { + log_message "START:validate_ci_extension" + extension=$(az k8s-extension show -c ${4} -g ${3} -t $clusterType -n $extensionInstanceName) + log_message $extension + configurationSettings=$(az k8s-extension show -c ${4} -g ${3} -t $clusterType -n $extensionInstanceName --query "configurationSettings") + if [ -z "$configurationSettings" ]; then + log_message "-e error configurationSettings either null or empty" + log_message ${contactUSMessage} + exit 1 + fi + logAnalyticsWorkspaceResourceID=$(az k8s-extension show -c ${4} -g ${3} -t $clusterType -n $extensionInstanceName --query "configurationSettings.logAnalyticsWorkspaceResourceID" -o tsv | tr "[:upper:]" "[:lower:]" | tr -d "[:space:]") + log_message "Extension logAnalyticsWorkspaceResourceID: ${logAnalyticsWorkspaceResourceID}" + if [ -z "$logAnalyticsWorkspaceResourceID" ]; then + log_message "-e error logAnalyticsWorkspaceResourceID either null or empty in the config settings" + log_message ${contactUSMessage} + exit 1 + fi + + provisioningState=$(az k8s-extension show -c ${4} -g ${3} -t $clusterType -n $extensionInstanceName --query "provisioningState" -o tsv | tr "[:upper:]" "[:lower:]" | tr -d "[:space:]") + log_message "Extension provisioningState: ${provisioningState}" + if [ -z "$provisioningState" ]; then + log_message "-e error provisioningState either null or empty in the config settings" + log_message ${contactUSMessage} + exit 1 + fi + if [ "$provisioningState" != "succeeded" ]; then + log_message "-e error expected state of extension provisioningState MUST be succeeded state but actual state is ${provisioningState}" + log_message ${contactUSMessage} + exit 1 + fi + logAnalyticsWorkspaceDomain=$(az k8s-extension show -c ${4} -g 
${3} -t $clusterType -n $extensionInstanceName --query 'configurationSettings."omsagent.domain"') + log_message "Extension logAnalyticsWorkspaceDomain: ${logAnalyticsWorkspaceDomain}" + if [ -z "$logAnalyticsWorkspaceDomain" ]; then + log_message "-e error logAnalyticsWorkspaceDomain either null or empty in the config settings" + log_message ${contactUSMessage} + exit 1 + fi + azureCloudName=${1} + if [ "$azureCloudName" = "azureusgovernment" ]; then + log_message "az cli configured cloud name:$azureCloudName" + if [ $logAnalyticsWorkspaceDomain = "opinsights.azure.us" ]; then + log_message "-e error expected value of logAnalyticsWorkspaceDomain MUST opinsights.azure.us but actual value is ${logAnalyticsWorkspaceDomain}" + log_message ${contactUSMessage} + exit 1 + fi + elif [ "$azureCloudName" = "azurecloud" ]; then + log_message "az cli configured cloud name:$azureCloudName" + if [ $logAnalyticsWorkspaceDomain = "opinsights.azure.com" ]; then + log_message "-e error expected value of logAnalyticsWorkspaceDomain MUST opinsights.azure.com but actual value is ${logAnalyticsWorkspaceDomain}" + log_message ${contactUSMessage} + exit 1 + fi + elif [ "$azureCloudName" = "azurechinacloud" ]; then + log_message "az cli configured cloud name:$azureCloudName" + if [ $logAnalyticsWorkspaceDomain = "opinsights.azure.cn" ]; then + log_message "-e error expected value of logAnalyticsWorkspaceDomain MUST opinsights.azure.cn but actual value is ${logAnalyticsWorkspaceDomain}" + log_message ${contactUSMessage} + exit 1 + fi + fi + + workspaceSubscriptionId="$(echo ${logAnalyticsWorkspaceResourceID} | cut -d'/' -f3 | tr "[:upper:]" "[:lower:]")" + workspaceResourceGroup="$(echo ${logAnalyticsWorkspaceResourceID} | cut -d'/' -f5)" + workspaceName="$(echo ${logAnalyticsWorkspaceResourceID} | cut -d'/' -f9)" + log_message "workspaceSubscriptionId:${workspaceSubscriptionId} workspaceResourceGroup:${workspaceResourceGroup} workspaceName:${workspaceName}" + + clusterSubscriptionId=${2} + 
# set the azure subscription to azure cli if the workspace in different sub than cluster + if [[ "$clusterSubscriptionId" != "$workspaceSubscriptionId" ]]; then + log_message "switch subscription id of workspace as active subscription for azure cli since workspace in different subscription than cluster: ${workspaceSubscriptionId}" + isClusterAndWorkspaceInSameSubscription=false + set_azure_subscription $workspaceSubscriptionId + fi + workspaceList=$(az resource list -g "$workspaceResourceGroup" -n "$workspaceName" --resource-type $workspaceResourceProvider) + log_message "workspace info:${workspaceList}" + if [ "$workspaceList" = "[]" ]; then + log_message "-e error workspace:${logAnalyticsWorkspaceResourceID} doesnt exist" + exit 1 + fi + + ciSolutionResourceName="ContainerInsights(${workspaceName})" + workspaceSolutionList=$(az resource list -g $workspaceResourceGroup -n $ciSolutionResourceName --resource-type $workspaceSolutionResourceProvider) + log_message "workspace solution info:${workspaceSolutionList}" + if [ "$workspaceSolutionList" = "[]" ]; then + log_message "-e error ContainerInsights solution on workspace:${logAnalyticsWorkspaceResourceID} doesnt exist" + exit 1 + fi + + privateLinkScopedResources=$(az resource show --ids ${logAnalyticsWorkspaceResourceID} --query properties.privateLinkScopedResources -o tsv | tr "[:upper:]" "[:lower:]" | tr -d "[:space:]") + log_message "workspace privateLinkScopedResources:${privateLinkScopedResources}" + + publicNetworkAccessForIngestion=$(az resource show --ids ${logAnalyticsWorkspaceResourceID} --query properties.publicNetworkAccessForIngestion -o tsv | tr "[:upper:]" "[:lower:]" | tr -d "[:space:]") + log_message "workspace publicNetworkAccessForIngestion:${publicNetworkAccessForIngestion}" + if [ -z "$privateLinkScopedResources" ]; then + if [ "$publicNetworkAccessForIngestion" != "enabled" ]; then + log_message "-e error Unless private link configuration, publicNetworkAccessForIngestion MUST be enabled for 
data ingestion" + log_message ${workspacePrivateLinkMessage} + exit 1 + fi + fi + publicNetworkAccessForQuery=$(az resource show --ids ${logAnalyticsWorkspaceResourceID} --query properties.publicNetworkAccessForQuery -o tsv | tr "[:upper:]" "[:lower:]" | tr -d "[:space:]") + log_message "workspace publicNetworkAccessForQuery:${publicNetworkAccessForQuery}" + if [ -z "$privateLinkScopedResources" ]; then + if [ "$publicNetworkAccessForQuery" != "enabled" ]; then + log_message "-e error Unless private link configuration, publicNetworkAccessForQuery MUST be enabled for data query" + log_message ${workspacePrivateLinkMessage} + exit 1 + fi + fi + + workspaceCappingDailyQuotaGb=$(az resource show --ids ${logAnalyticsWorkspaceResourceID} --query properties.workspaceCapping.dailyQuotaGb -o tsv | tr -d "[:space:]") + log_message "workspaceCapping dailyQuotaGb:${workspaceCappingDailyQuotaGb}" + if [ "$workspaceCappingDailyQuotaGb" != "-1.0" ]; then + log_message "-e error workspace configured daily quota and verify ingestion data reaching over the quota:${workspaceCappingDailyQuotaGb}" + log_message ${dataCapHelpMessage} + exit 1 + fi + + workspaceId=$(az resource show --ids ${logAnalyticsWorkspaceResourceID} --query properties.customerId -o tsv | tr -d "[:space:]") + log_message "workspaceId: ${workspaceId}" + + workspaceKey=$(az rest --method post --uri $logAnalyticsWorkspaceResourceID/sharedKeys?api-version=2015-11-01-preview --query primarySharedKey -o json) + workspacePrimarySharedKey=$(echo $workspaceKey | tr -d '"') + + log_message "END:validate_ci_extension:SUCCESS" +} + +validate_az_cli_installed_or_not() { + if command_exists az; then + log_message "detected azure cli installed" + azCLIVersion=$(az -v) + log_message "azure-cli version: ${azCLIVersion}" + azCLIExtension=$(az extension list --query "[?name=='k8s-extension'].name | [0]") + if [ "$azCLIExtension" = "k8s-extension" ]; then + azCLIExtensionVersion=$(az extension list --query 
"[?name=='k8s-extension'].version | [0]") + log_message "detected k8s-extension and current installed version: ${azCLIExtensionVersion}" + log_message "updating the k8s-extension version to latest available one" + az extension update --name 'k8s-extension' + else + log_message "adding k8s-extension since k8s-extension doesnt exist as installed" + az extension add --name 'k8s-extension' + fi + azCLIExtensionVersion=$(az extension list --query "[?name=='k8s-extension'].version | [0]") + log_message "current installed k8s-extension version: ${azCLIExtensionVersion}" + else + log_message "-e error azure cli doesnt exist as installed" + log_message ${azureCLIInstallLinkMessage} + exit 1 + fi +} + +validate_ci_agent_pods() { + log_message "START:validate_ci_agent_pods" + # verify the id and key of the workspace matches with workspace key value in the secret + wsID=$(kubectl get secrets ${agentK8sSecretName} -n ${agentK8sNamespace} -o json | jq -r ".data.WSID") + wsID=$(echo $wsID | base64 -d) + log_message "workspaceId: ${wsID} value in the ${agentK8sSecretName}" + + wsKEY=$(kubectl get secrets ${agentK8sSecretName} -n ${agentK8sNamespace} -o json | jq -r ".data.KEY") + wsKEY=$(echo $wsKEY | base64 -d) + + if [[ "$workspaceId" != "$wsID" ]]; then + log_message "-e error workspaceId: ${workspaceID} of the workspace doesnt match with workspaceId: ${wsID} value in the omsagent secret" + log_message $ciExtensionReOnboarding + exit 1 + fi + if [[ "$workspacePrimarySharedKey" != "$wsKEY" ]]; then + log_message "-e error workspacePrimarySharedKey of the workspace doesnt match with workspacekey value value in the omsagent secret" + log_message $ciExtensionReOnboarding + exit 1 + fi + + # verify state of agent deployment + readyReplicas=$(kubectl get deployments -n ${agentK8sNamespace} ${agentK8sDeploymentName} -o json | jq '.status.readyReplicas') + log_message "number of deployment ready replicas:${readyReplicas}" + if [[ "$readyReplicas" != "1" ]]; then + log_message "-e error 
number of readyReplicas of agent deployment MUST be 1" + exit 1 + fi + replicas=$(kubectl get deployments -n ${agentK8sNamespace} ${agentK8sDeploymentName} -o json | jq '.status.replicas') + log_message "number of deployment replicas:${replicas}" + if [[ "$replicas" != "1" ]]; then + log_message "-e error number of replicas of agent deployment MUST be 1" + exit 1 + fi + + # verify state of agent ds + currentNumberScheduled=$(kubectl get ds -n ${agentK8sNamespace} ${agentK8sLinuxDaemonsetName} -o json | jq '.status.currentNumberScheduled') + desiredNumberScheduled=$(kubectl get ds -n ${agentK8sNamespace} ${agentK8sLinuxDaemonsetName} -o json | jq '.status.desiredNumberScheduled') + log_message "number of linux deamonset pods currentNumberScheduled:${currentNumberScheduled} and currentNumberScheduled:${currentNumberScheduled}" + if [[ "$currentNumberScheduled" != "$desiredNumberScheduled" ]]; then + log_message "-e error desiredNumberScheduled: ${desiredNumberScheduled} doesnt match with currentNumberScheduled: ${currentNumberScheduled}" + log_message "-e error please fix the pod scheduling issues of omsagent daemonset pods in namespace: ${agentK8sNamespace}" + exit 1 + fi + + numberAvailable=$(kubectl get ds -n ${agentK8sNamespace} ${agentK8sLinuxDaemonsetName} -o json | jq '.status.numberAvailable') + log_message "number of linux deamonset pods numberAvailable:${numberAvailable}" + if [[ "$numberAvailable" != "$currentNumberScheduled" ]]; then + log_message "-e error numberAvailable: ${numberAvailable} doesnt match with currentNumberScheduled: ${currentNumberScheduled}" + log_message "-e error please fix the pod scheduling issues of omsagent daemonset pods in namespace: ${agentK8sNamespace}" + exit 1 + fi + numberReady=$(kubectl get ds -n ${agentK8sNamespace} ${agentK8sLinuxDaemonsetName} -o json | jq '.status.numberReady') + log_message "number of linux deamonset pods numberReady:${numberReady}" + if [[ "$numberAvailable" != "$numberReady" ]]; then + log_message 
"-e error numberAvailable: ${numberAvailable} doesnt match with numberReady: ${numberReady}" + log_message "-e error please fix the pod scheduling issues of omsagent daemonset pods in namespace: ${agentK8sNamespace}" + exit 1 + fi + log_message "END:validate_ci_agent_pods:SUCCESS" +} + +validate_ci_agent_identity_status() { + log_message "START:validate_ci_agent_identity_status" + log_message "Info of ${agentArcK8sIdentityCRDName} in namespace ${azureArcK8sNamespace}" + kubectl get azureclusteridentityrequests -n ${azureArcK8sNamespace} ${agentArcK8sIdentityCRDName} -o json >> $logFile + status=$(kubectl get azureclusteridentityrequests -n ${azureArcK8sNamespace} ${agentArcK8sIdentityCRDName} -o json | jq -r '.status') + if [ -z "$status" ]; then + log_message "-e error status field empty for the CRD ${agentArcK8sIdentityCRDName} in namespace ${azureArcK8sNamespace}" + log_message $timesyncHelpMessage + exit 1 + fi + expirationTime=$(kubectl get azureclusteridentityrequests -n ${azureArcK8sNamespace} ${agentArcK8sIdentityCRDName} -o json | jq -r '.status.expirationTime') + if [ -z "$expirationTime" ]; then + log_message "-e error expirationTime field empty for the CRD ${agentArcK8sIdentityCRDName} in namespace ${azureArcK8sNamespace}" + log_message $timesyncHelpMessage + exit 1 + fi + tokenReference=$(kubectl get azureclusteridentityrequests -n ${azureArcK8sNamespace} ${agentArcK8sIdentityCRDName} -o json | jq -r '.status.tokenReference') + if [ -z "$tokenReference" ]; then + log_message "-e error tokenReference field empty for the CRD ${agentArcK8sIdentityCRDName} in namespace ${azureArcK8sNamespace}" + log_message $timesyncHelpMessage + exit 1 + fi + dataName=$(kubectl get azureclusteridentityrequests -n ${azureArcK8sNamespace} ${agentArcK8sIdentityCRDName} -o json | jq -r '.status.tokenReference.dataName') + if [ -z "$dataName" ]; then + log_message "-e error dataName field of tokenReference empty for the CRD ${agentArcK8sIdentityCRDName} in namespace 
${azureArcK8sNamespace}" + log_message $timesyncHelpMessage + exit 1 + fi + secretName=$(kubectl get azureclusteridentityrequests -n ${azureArcK8sNamespace} ${agentArcK8sIdentityCRDName} -o json | jq -r '.status.tokenReference.secretName') + if [ -z "$secretName" ]; then + log_message "-e error secretName field of tokenReference empty for the CRD ${agentArcK8sIdentityCRDName} in namespace ${azureArcK8sNamespace}" + log_message $timesyncHelpMessage + exit 1 + fi + log_message "END:validate_ci_agent_identity_status:SUCCESS" +} + +get_nodes_pods_crds_info() { + log_message "START:get_nodes_pods_crds_info" + log_message "nodes" + kubectl get nodes >> $logFile + + log_message "kube-system pods" + kubectl get pods -n ${agentK8sNamespace} >> $logFile + + log_message "azurearck8spods" + kubectl get pods -n ${azureArcK8sNamespace} >> $logFile + + log_message "crds" + kubectl get crds -A >> $logFile + + log_message "azureclusteridentityrequests crds" + kubectl get crds azureclusteridentityrequests.clusterconfig.azure.com >> $logFile + kubectl get azureclusteridentityrequests -n ${azureArcK8sNamespace} >> $logFile + + log_message "container-insights-clusteridentityrequest crd" + kubectl describe azureclusteridentityrequests -n ${azureArcK8sNamespace} container-insights-clusteridentityrequest >> $logFile + log_message "END:get_nodes_pods_crds_info:SUCCESS" +} + +datetime=$(date -u) +log_message "*** Script Execution start @ ${datetime} ***" + +# verify azure cli installed or not +validate_az_cli_installed_or_not + +# parse and validate args +parse_args $@ + +# parse cluster resource id +clusterSubscriptionId="$(echo $clusterResourceId | cut -d'/' -f3 | tr "[:upper:]" "[:lower:]")" +clusterResourceGroup="$(echo $clusterResourceId | cut -d'/' -f5)" +providerName="$(echo $clusterResourceId | cut -d'/' -f7)" +clusterName="$(echo $clusterResourceId | cut -d'/' -f9)" + +# get the current active azure cloud of the az cli +azureCloudName=$(az cloud show --query name -o tsv | tr 
"[:upper:]" "[:lower:]" | tr -d "[:space:]") +log_message "azure cloud name: ${azureCloudName}" + +# login to azure interactively +login_to_azure + +# set the cluster subscription id as active sub for azure cli +set_azure_subscription $clusterSubscriptionId + +# validate ci extension +validate_ci_extension $azureCloudName $clusterSubscriptionId $clusterResourceGroup $clusterName + +# validate ci agent pods +if command_exists kubectl; then + if command_exists jq; then + validate_ci_agent_pods + else + log_message "-e error jq doesnt exist as installed" + log_message $jqInstallLinkMessage + exit 1 + fi +else + log_message "-e error kubectl doesnt exist as installed" + log_message ${kubectlInstallLinkMessage} + exit 1 +fi + +# validate ci cluster identity token +validate_ci_agent_identity_status + +# get nodes and pods status +get_nodes_pods_crds_info + +log_message "Everything looks good according to this script." +log_message $contactUSMessage diff --git a/source/plugins/go/src/oms.go b/source/plugins/go/src/oms.go index 91a5b4b40..407ab3611 100644 --- a/source/plugins/go/src/oms.go +++ b/source/plugins/go/src/oms.go @@ -21,9 +21,10 @@ import ( "github.com/google/uuid" "github.com/tinylib/msgp/msgp" - lumberjack "gopkg.in/natefinch/lumberjack.v2" "Docker-Provider/source/plugins/go/src/extension" + lumberjack "gopkg.in/natefinch/lumberjack.v2" + "github.com/Azure/azure-kusto-go/kusto/ingest" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" @@ -85,7 +86,6 @@ const WindowsContainerLogPluginConfFilePath = "/etc/omsagentwindows/out_oms.conf // IPName const IPName = "ContainerInsights" - const defaultContainerInventoryRefreshInterval = 60 const kubeMonAgentConfigEventFlushInterval = 60 @@ -102,9 +102,6 @@ const ContainerLogsV2Route = "v2" const ContainerLogsADXRoute = "adx" -//fallback option v1 route i.e. 
ODS direct if required in any case -const ContainerLogsV1Route = "v1" - //container logs schema (v2=ContainerLogsV2 table in LA, anything else ContainerLogs table in LA. This is applicable only if Container logs route is NOT ADX) const ContainerLogV2SchemaVersion = "v2" @@ -117,6 +114,9 @@ const MdsdOutputStreamIdTagPrefix = "dcr-" //env variable to container type const ContainerTypeEnv = "CONTAINER_TYPE" +//Default ADX destination database name, can be overriden through configuration +const DefaultAdxDatabaseName = "containerinsights" + var ( // PluginConfiguration the plugins configuration PluginConfiguration map[string]string @@ -166,6 +166,8 @@ var ( AdxTenantID string //ADX client secret AdxClientSecret string + //ADX destination database name, default is DefaultAdxDatabaseName, can be overridden in configuration + AdxDatabaseName string // container log or container log v2 tag name for oneagent route MdsdContainerLogTagName string // kubemonagent events tag name for oneagent route @@ -247,29 +249,29 @@ type DataItemLAv1 struct { // DataItemLAv2 == ContainerLogV2 table in LA // Please keep the names same as destination column names, to avoid transforming one to another in the pipeline type DataItemLAv2 struct { - TimeGenerated string `json:"TimeGenerated"` - Computer string `json:"Computer"` - ContainerId string `json:"ContainerId"` - ContainerName string `json:"ContainerName"` - PodName string `json:"PodName"` - PodNamespace string `json:"PodNamespace"` - LogMessage string `json:"LogMessage"` - LogSource string `json:"LogSource"` + TimeGenerated string `json:"TimeGenerated"` + Computer string `json:"Computer"` + ContainerId string `json:"ContainerId"` + ContainerName string `json:"ContainerName"` + PodName string `json:"PodName"` + PodNamespace string `json:"PodNamespace"` + LogMessage string `json:"LogMessage"` + LogSource string `json:"LogSource"` //PodLabels string `json:"PodLabels"` } // DataItemADX == ContainerLogV2 table in ADX type DataItemADX struct { 
- TimeGenerated string `json:"TimeGenerated"` - Computer string `json:"Computer"` - ContainerId string `json:"ContainerId"` - ContainerName string `json:"ContainerName"` - PodName string `json:"PodName"` - PodNamespace string `json:"PodNamespace"` - LogMessage string `json:"LogMessage"` - LogSource string `json:"LogSource"` + TimeGenerated string `json:"TimeGenerated"` + Computer string `json:"Computer"` + ContainerId string `json:"ContainerId"` + ContainerName string `json:"ContainerName"` + PodName string `json:"PodName"` + PodNamespace string `json:"PodNamespace"` + LogMessage string `json:"LogMessage"` + LogSource string `json:"LogSource"` //PodLabels string `json:"PodLabels"` - AzureResourceId string `json:"AzureResourceId"` + AzureResourceId string `json:"AzureResourceId"` } // telegraf metric DataItem represents the object corresponding to the json that is sent by fluentbit tail plugin @@ -294,15 +296,15 @@ type InsightsMetricsBlob struct { // ContainerLogBlob represents the object corresponding to the payload that is sent to the ODS end point type ContainerLogBlobLAv1 struct { - DataType string `json:"DataType"` - IPName string `json:"IPName"` + DataType string `json:"DataType"` + IPName string `json:"IPName"` DataItems []DataItemLAv1 `json:"DataItems"` } // ContainerLogBlob represents the object corresponding to the payload that is sent to the ODS end point type ContainerLogBlobLAv2 struct { - DataType string `json:"DataType"` - IPName string `json:"IPName"` + DataType string `json:"DataType"` + IPName string `json:"IPName"` DataItems []DataItemLAv2 `json:"DataItems"` } @@ -356,6 +358,7 @@ const ( // DataType to be used as enum per data type socket client creation type DataType int + const ( // DataType to be used as enum per data type socket client creation ContainerLogV2 DataType = iota @@ -623,12 +626,12 @@ func flushKubeMonAgentEventRecords() { Log(message) SendException(message) } else { - msgPackEntry := MsgPackEntry{ + msgPackEntry := MsgPackEntry{ 
Record: stringMap, } - msgPackEntries = append(msgPackEntries, msgPackEntry) - } - } + msgPackEntries = append(msgPackEntries, msgPackEntry) + } + } } } @@ -665,8 +668,8 @@ func flushKubeMonAgentEventRecords() { msgPackEntry := MsgPackEntry{ Record: stringMap, } - msgPackEntries = append(msgPackEntries, msgPackEntry) - } + msgPackEntries = append(msgPackEntries, msgPackEntry) + } } } } @@ -708,18 +711,18 @@ func flushKubeMonAgentEventRecords() { } else { if err := json.Unmarshal(jsonBytes, &stringMap); err != nil { message := fmt.Sprintf("Error while UnMarshalling json bytes to stringmap: %s", err.Error()) - Log(message) - SendException(message) + Log(message) + SendException(message) } else { msgPackEntry := MsgPackEntry{ Record: stringMap, - } - msgPackEntries = append(msgPackEntries, msgPackEntry) + } + msgPackEntries = append(msgPackEntries, msgPackEntry) } } } } - if (IsWindows == false && len(msgPackEntries) > 0) { //for linux, mdsd route + if IsWindows == false && len(msgPackEntries) > 0 { //for linux, mdsd route if IsAADMSIAuthMode == true && strings.HasPrefix(MdsdKubeMonAgentEventsTagName, MdsdOutputStreamIdTagPrefix) == false { Log("Info::mdsd::obtaining output stream id for data type: %s", KubeMonAgentEventDataType) MdsdKubeMonAgentEventsTagName = extension.GetInstance(FLBLogger, ContainerType).GetOutputStreamId(KubeMonAgentEventDataType) @@ -752,7 +755,7 @@ func flushKubeMonAgentEventRecords() { } else { numRecords := len(msgPackEntries) Log("FlushKubeMonAgentEventRecords::Info::Successfully flushed %d records that was %d bytes in %s", numRecords, bts, elapsed) - // Send telemetry to AppInsights resource + // Send telemetry to AppInsights resource SendEvent(KubeMonAgentEventsFlushedEvent, telemetryDimensions) } } else { @@ -783,8 +786,8 @@ func flushKubeMonAgentEventRecords() { if IsAADMSIAuthMode == true { IngestionAuthTokenUpdateMutex.Lock() - ingestionAuthToken := ODSIngestionAuthToken - IngestionAuthTokenUpdateMutex.Unlock() + ingestionAuthToken := 
ODSIngestionAuthToken + IngestionAuthTokenUpdateMutex.Unlock() if ingestionAuthToken == "" { Log("Error::ODS Ingestion Auth Token is empty. Please check error log.") } @@ -905,86 +908,90 @@ func PostTelegrafMetricsToLA(telegrafRecords []map[interface{}]interface{}) int var msgPackEntries []MsgPackEntry var i int start := time.Now() - var elapsed time.Duration + var elapsed time.Duration for i = 0; i < len(laMetrics); i++ { - var interfaceMap map[string]interface{} - stringMap := make(map[string]string) - jsonBytes, err := json.Marshal(*laMetrics[i]) - if err != nil { - message := fmt.Sprintf("PostTelegrafMetricsToLA::Error:when marshalling json %q", err) + var interfaceMap map[string]interface{} + stringMap := make(map[string]string) + jsonBytes, err := json.Marshal(*laMetrics[i]) + if err != nil { + message := fmt.Sprintf("PostTelegrafMetricsToLA::Error:when marshalling json %q", err) + Log(message) + SendException(message) + return output.FLB_OK + } else { + if err := json.Unmarshal(jsonBytes, &interfaceMap); err != nil { + message := fmt.Sprintf("Error while UnMarshalling json bytes to interfaceMap: %s", err.Error()) Log(message) SendException(message) return output.FLB_OK } else { - if err := json.Unmarshal(jsonBytes, &interfaceMap); err != nil { - message := fmt.Sprintf("Error while UnMarshalling json bytes to interfaceMap: %s", err.Error()) - Log(message) - SendException(message) - return output.FLB_OK - } else { - for key, value := range interfaceMap { - strKey := fmt.Sprintf("%v", key) - strValue := fmt.Sprintf("%v", value) - stringMap[strKey] = strValue - } - msgPackEntry := MsgPackEntry{ - Record: stringMap, - } - msgPackEntries = append(msgPackEntries, msgPackEntry) + for key, value := range interfaceMap { + strKey := fmt.Sprintf("%v", key) + strValue := fmt.Sprintf("%v", value) + stringMap[strKey] = strValue } + msgPackEntry := MsgPackEntry{ + Record: stringMap, + } + msgPackEntries = append(msgPackEntries, msgPackEntry) } + } } - if 
(len(msgPackEntries) > 0) { - if IsAADMSIAuthMode == true && (strings.HasPrefix(MdsdInsightsMetricsTagName, MdsdOutputStreamIdTagPrefix) == false) { - Log("Info::mdsd::obtaining output stream id for InsightsMetricsDataType since Log Analytics AAD MSI Auth Enabled") - MdsdInsightsMetricsTagName = extension.GetInstance(FLBLogger, ContainerType).GetOutputStreamId(InsightsMetricsDataType) - } - msgpBytes := convertMsgPackEntriesToMsgpBytes(MdsdInsightsMetricsTagName, msgPackEntries) + if len(msgPackEntries) > 0 { + if IsAADMSIAuthMode == true && (strings.HasPrefix(MdsdInsightsMetricsTagName, MdsdOutputStreamIdTagPrefix) == false) { + Log("Info::mdsd::obtaining output stream id for InsightsMetricsDataType since Log Analytics AAD MSI Auth Enabled") + MdsdInsightsMetricsTagName = extension.GetInstance(FLBLogger, ContainerType).GetOutputStreamId(InsightsMetricsDataType) + } + msgpBytes := convertMsgPackEntriesToMsgpBytes(MdsdInsightsMetricsTagName, msgPackEntries) + if MdsdInsightsMetricsMsgpUnixSocketClient == nil { + Log("Error::mdsd::mdsd connection does not exist. re-connecting ...") + CreateMDSDClient(InsightsMetrics, ContainerType) if MdsdInsightsMetricsMsgpUnixSocketClient == nil { - Log("Error::mdsd::mdsd connection does not exist. re-connecting ...") - CreateMDSDClient(InsightsMetrics, ContainerType) - if MdsdInsightsMetricsMsgpUnixSocketClient == nil { - Log("Error::mdsd::Unable to create mdsd client for insights metrics. Please check error log.") - ContainerLogTelemetryMutex.Lock() - defer ContainerLogTelemetryMutex.Unlock() - InsightsMetricsMDSDClientCreateErrors += 1 - return output.FLB_RETRY - } - } - - deadline := 10 * time.Second - MdsdInsightsMetricsMsgpUnixSocketClient.SetWriteDeadline(time.Now().Add(deadline)) //this is based of clock time, so cannot reuse - bts, er := MdsdInsightsMetricsMsgpUnixSocketClient.Write(msgpBytes) - - elapsed = time.Since(start) - - if er != nil { - Log("Error::mdsd::Failed to write to mdsd %d records after %s. Will retry ... 
error : %s", len(msgPackEntries), elapsed, er.Error()) - UpdateNumTelegrafMetricsSentTelemetry(0, 1, 0) - if MdsdInsightsMetricsMsgpUnixSocketClient != nil { - MdsdInsightsMetricsMsgpUnixSocketClient.Close() - MdsdInsightsMetricsMsgpUnixSocketClient = nil - } - + Log("Error::mdsd::Unable to create mdsd client for insights metrics. Please check error log.") ContainerLogTelemetryMutex.Lock() defer ContainerLogTelemetryMutex.Unlock() InsightsMetricsMDSDClientCreateErrors += 1 return output.FLB_RETRY - } else { - numTelegrafMetricsRecords := len(msgPackEntries) - UpdateNumTelegrafMetricsSentTelemetry(numTelegrafMetricsRecords, 0, 0) - Log("Success::mdsd::Successfully flushed %d telegraf metrics records that was %d bytes to mdsd in %s ", numTelegrafMetricsRecords, bts, elapsed) } + } + + deadline := 10 * time.Second + MdsdInsightsMetricsMsgpUnixSocketClient.SetWriteDeadline(time.Now().Add(deadline)) //this is based of clock time, so cannot reuse + bts, er := MdsdInsightsMetricsMsgpUnixSocketClient.Write(msgpBytes) + + elapsed = time.Since(start) + + if er != nil { + Log("Error::mdsd::Failed to write to mdsd %d records after %s. Will retry ... 
error : %s", len(msgPackEntries), elapsed, er.Error()) + UpdateNumTelegrafMetricsSentTelemetry(0, 1, 0, 0) + if MdsdInsightsMetricsMsgpUnixSocketClient != nil { + MdsdInsightsMetricsMsgpUnixSocketClient.Close() + MdsdInsightsMetricsMsgpUnixSocketClient = nil + } + + ContainerLogTelemetryMutex.Lock() + defer ContainerLogTelemetryMutex.Unlock() + InsightsMetricsMDSDClientCreateErrors += 1 + return output.FLB_RETRY + } else { + numTelegrafMetricsRecords := len(msgPackEntries) + UpdateNumTelegrafMetricsSentTelemetry(numTelegrafMetricsRecords, 0, 0, 0) + Log("Success::mdsd::Successfully flushed %d telegraf metrics records that was %d bytes to mdsd in %s ", numTelegrafMetricsRecords, bts, elapsed) + } } } else { // for windows, ODS direct var metrics []laTelegrafMetric var i int + numWinMetricsWithTagsSize64KBorMore := 0 for i = 0; i < len(laMetrics); i++ { metrics = append(metrics, *laMetrics[i]) + if len(*&laMetrics[i].Tags) >= (64 * 1024) { + numWinMetricsWithTagsSize64KBorMore += 1 + } } laTelegrafMetrics := InsightsMetricsBlob{ @@ -1036,7 +1043,7 @@ func PostTelegrafMetricsToLA(telegrafRecords []map[interface{}]interface{}) int if err != nil { message := fmt.Sprintf("PostTelegrafMetricsToLA::Error:(retriable) when sending %v metrics. 
duration:%v err:%q \n", len(laMetrics), elapsed, err.Error()) Log(message) - UpdateNumTelegrafMetricsSentTelemetry(0, 1, 0) + UpdateNumTelegrafMetricsSentTelemetry(0, 1, 0, 0) return output.FLB_RETRY } @@ -1045,7 +1052,7 @@ func PostTelegrafMetricsToLA(telegrafRecords []map[interface{}]interface{}) int Log("PostTelegrafMetricsToLA::Error:(retriable) RequestID %s Response Status %v Status Code %v", reqID, resp.Status, resp.StatusCode) } if resp != nil && resp.StatusCode == 429 { - UpdateNumTelegrafMetricsSentTelemetry(0, 1, 1) + UpdateNumTelegrafMetricsSentTelemetry(0, 1, 1, 0) } return output.FLB_RETRY } @@ -1053,18 +1060,19 @@ func PostTelegrafMetricsToLA(telegrafRecords []map[interface{}]interface{}) int defer resp.Body.Close() numMetrics := len(laMetrics) - UpdateNumTelegrafMetricsSentTelemetry(numMetrics, 0, 0) + UpdateNumTelegrafMetricsSentTelemetry(numMetrics, 0, 0, numWinMetricsWithTagsSize64KBorMore) Log("PostTelegrafMetricsToLA::Info:Successfully flushed %v records in %v", numMetrics, elapsed) } return output.FLB_OK } -func UpdateNumTelegrafMetricsSentTelemetry(numMetricsSent int, numSendErrors int, numSend429Errors int) { +func UpdateNumTelegrafMetricsSentTelemetry(numMetricsSent int, numSendErrors int, numSend429Errors int, numWinMetricswith64KBorMoreSize int) { ContainerLogTelemetryMutex.Lock() TelegrafMetricsSentCount += float64(numMetricsSent) TelegrafMetricsSendErrorCount += float64(numSendErrors) TelegrafMetricsSend429ErrorCount += float64(numSend429Errors) + WinTelegrafMetricsCountWithTagsSize64KBorMore += float64(numWinMetricswith64KBorMoreSize) ContainerLogTelemetryMutex.Unlock() } @@ -1112,12 +1120,12 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { stringMap = make(map[string]string) //below id & name are used by latency telemetry in both v1 & v2 LA schemas id := "" - name := "" + name := "" logEntry := ToString(record["log"]) logEntryTimeStamp := ToString(record["time"]) //ADX Schema & LAv2 schema are almost the 
same (except resourceId) - if (ContainerLogSchemaV2 == true || ContainerLogsRouteADX == true) { + if ContainerLogSchemaV2 == true || ContainerLogsRouteADX == true { stringMap["Computer"] = Computer stringMap["ContainerId"] = containerID stringMap["ContainerName"] = containerName @@ -1166,29 +1174,29 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { stringMap["AzureResourceId"] = "" } dataItemADX = DataItemADX{ - TimeGenerated: stringMap["TimeGenerated"], - Computer: stringMap["Computer"], - ContainerId: stringMap["ContainerId"], - ContainerName: stringMap["ContainerName"], - PodName: stringMap["PodName"], - PodNamespace: stringMap["PodNamespace"], - LogMessage: stringMap["LogMessage"], - LogSource: stringMap["LogSource"], - AzureResourceId: stringMap["AzureResourceId"], + TimeGenerated: stringMap["TimeGenerated"], + Computer: stringMap["Computer"], + ContainerId: stringMap["ContainerId"], + ContainerName: stringMap["ContainerName"], + PodName: stringMap["PodName"], + PodNamespace: stringMap["PodNamespace"], + LogMessage: stringMap["LogMessage"], + LogSource: stringMap["LogSource"], + AzureResourceId: stringMap["AzureResourceId"], } //ADX dataItemsADX = append(dataItemsADX, dataItemADX) } else { - if (ContainerLogSchemaV2 == true) { + if ContainerLogSchemaV2 == true { dataItemLAv2 = DataItemLAv2{ - TimeGenerated: stringMap["TimeGenerated"], - Computer: stringMap["Computer"], - ContainerId: stringMap["ContainerId"], - ContainerName: stringMap["ContainerName"], - PodName: stringMap["PodName"], - PodNamespace: stringMap["PodNamespace"], - LogMessage: stringMap["LogMessage"], - LogSource: stringMap["LogSource"], + TimeGenerated: stringMap["TimeGenerated"], + Computer: stringMap["Computer"], + ContainerId: stringMap["ContainerId"], + ContainerName: stringMap["ContainerName"], + PodName: stringMap["PodName"], + PodNamespace: stringMap["PodNamespace"], + LogMessage: stringMap["LogMessage"], + LogSource: stringMap["LogSource"], } //ODS-v2 schema 
dataItemsLAv2 = append(dataItemsLAv2, dataItemLAv2) @@ -1206,10 +1214,10 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { Image: stringMap["Image"], Name: stringMap["Name"], } - //ODS-v1 schema - dataItemsLAv1 = append(dataItemsLAv1, dataItemLAv1) - name = stringMap["Name"] - id = stringMap["Id"] + //ODS-v1 schema + dataItemsLAv1 = append(dataItemsLAv1, dataItemLAv1) + name = stringMap["Name"] + id = stringMap["Id"] } } @@ -1226,6 +1234,10 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { maxLatencyContainer = name + "=" + id } } + } else { + ContainerLogTelemetryMutex.Lock() + ContainerLogRecordCountWithEmptyTimeStamp += 1 + ContainerLogTelemetryMutex.Unlock() } } @@ -1359,18 +1371,18 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { numContainerLogRecords = len(dataItemsADX) Log("Success::ADX::Successfully wrote %d container log records to ADX in %s", numContainerLogRecords, elapsed) - } else if ((ContainerLogSchemaV2 == true && len(dataItemsLAv2) > 0) || len(dataItemsLAv1) > 0) { //ODS + } else if (ContainerLogSchemaV2 == true && len(dataItemsLAv2) > 0) || len(dataItemsLAv1) > 0 { //ODS var logEntry interface{} recordType := "" loglinesCount := 0 //schema v2 - if (len(dataItemsLAv2) > 0 && ContainerLogSchemaV2 == true) { + if len(dataItemsLAv2) > 0 && ContainerLogSchemaV2 == true { logEntry = ContainerLogBlobLAv2{ DataType: ContainerLogV2DataType, IPName: IPName, DataItems: dataItemsLAv2} - loglinesCount = len(dataItemsLAv2) - recordType = "ContainerLogV2" + loglinesCount = len(dataItemsLAv2) + recordType = "ContainerLogV2" } else { //schema v1 if len(dataItemsLAv1) > 0 { @@ -1378,8 +1390,8 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { DataType: ContainerLogDataType, IPName: IPName, DataItems: dataItemsLAv1} - loglinesCount = len(dataItemsLAv1) - recordType = "ContainerLog" + loglinesCount = len(dataItemsLAv1) + recordType = "ContainerLog" } } @@ 
-1411,7 +1423,7 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { return output.FLB_RETRY } // add authorization header to the req - req.Header.Set("Authorization", "Bearer "+ingestionAuthToken) + req.Header.Set("Authorization", "Bearer "+ingestionAuthToken) } resp, err := HTTPClient.Do(req) @@ -1439,7 +1451,7 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { numContainerLogRecords = loglinesCount Log("PostDataHelper::Info::Successfully flushed %d %s records to ODS in %s", numContainerLogRecords, recordType, elapsed) - } + } ContainerLogTelemetryMutex.Lock() defer ContainerLogTelemetryMutex.Unlock() @@ -1553,7 +1565,7 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { Log("Container Type %s", ContainerType) osType := os.Getenv("OS_TYPE") - IsWindows = false + IsWindows = false // Linux if strings.Compare(strings.ToLower(osType), "windows") != 0 { Log("Reading configuration for Linux from %s", pluginConfPath) @@ -1698,6 +1710,17 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { ContainerLogsRouteADX = false if strings.Compare(ContainerLogsRoute, ContainerLogsADXRoute) == 0 { + // Try to read the ADX database name from environment variables. Default to DefaultAdxDatabaseName if not set. + // This SHOULD be set by tomlparser.rb so it's a highly unexpected event if it isn't. + // It should be set by the logic in tomlparser.rb EVEN if ADX logging isn't enabled + AdxDatabaseName = strings.TrimSpace(os.Getenv("AZMON_ADX_DATABASE_NAME")) + + // Check the len of the provided name for database and use default if 0, just to be sure + if len(AdxDatabaseName) == 0 { + Log("Adx database name unexpectedly empty (check config AND implementation, should have been set by tomlparser.rb?)
- will default to '%s'", DefaultAdxDatabaseName) + AdxDatabaseName = DefaultAdxDatabaseName + } + //check if adx clusteruri, clientid & secret are set var err error AdxClusterUri, err = ReadFileContents(PluginConfiguration["adx_cluster_uri_path"]) @@ -1708,6 +1731,7 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { Log("Invalid AdxClusterUri %s", AdxClusterUri) AdxClusterUri = "" } + AdxClientID, err = ReadFileContents(PluginConfiguration["adx_client_id_path"]) if err != nil { Log("Error when reading AdxClientID %s", err) @@ -1723,16 +1747,14 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { Log("Error when reading AdxClientSecret %s", err) } - if len(AdxClusterUri) > 0 && len(AdxClientID) > 0 && len(AdxClientSecret) > 0 && len(AdxTenantID) > 0 { + // AdxDatabaseName should never get in a state where its length is 0, but it doesn't hurt to add the check + if len(AdxClusterUri) > 0 && len(AdxClientID) > 0 && len(AdxClientSecret) > 0 && len(AdxTenantID) > 0 && len(AdxDatabaseName) > 0 { ContainerLogsRouteADX = true Log("Routing container logs thru %s route...", ContainerLogsADXRoute) fmt.Fprintf(os.Stdout, "Routing container logs thru %s route...\n", ContainerLogsADXRoute) } } else if strings.Compare(strings.ToLower(osType), "windows") != 0 { //for linux, oneagent will be default route - ContainerLogsRouteV2 = true //default is mdsd route - if strings.Compare(ContainerLogsRoute, ContainerLogsV1Route) == 0 { - ContainerLogsRouteV2 = false //fallback option when hiddensetting set - } + ContainerLogsRouteV2 = true //default is mdsd route Log("Routing container logs thru %s route...", ContainerLogsRoute) fmt.Fprintf(os.Stdout, "Routing container logs thru %s route... 
\n", ContainerLogsRoute) } @@ -1750,14 +1772,14 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { Log("Creating MDSD clients for KubeMonAgentEvents & InsightsMetrics") CreateMDSDClient(KubeMonAgentEvents, ContainerType) CreateMDSDClient(InsightsMetrics, ContainerType) - } + } ContainerLogSchemaVersion := strings.TrimSpace(strings.ToLower(os.Getenv("AZMON_CONTAINER_LOG_SCHEMA_VERSION"))) Log("AZMON_CONTAINER_LOG_SCHEMA_VERSION:%s", ContainerLogSchemaVersion) - ContainerLogSchemaV2 = false //default is v1 schema + ContainerLogSchemaV2 = false //default is v1 schema - if strings.Compare(ContainerLogSchemaVersion, ContainerLogV2SchemaVersion) == 0 && ContainerLogsRouteADX != true { + if strings.Compare(ContainerLogSchemaVersion, ContainerLogV2SchemaVersion) == 0 && ContainerLogsRouteADX != true { ContainerLogSchemaV2 = true Log("Container logs schema=%s", ContainerLogV2SchemaVersion) fmt.Fprintf(os.Stdout, "Container logs schema=%s... \n", ContainerLogV2SchemaVersion) @@ -1783,15 +1805,15 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { if ContainerLogSchemaV2 == true { MdsdContainerLogTagName = MdsdContainerLogV2SourceName } else { - MdsdContainerLogTagName = MdsdContainerLogSourceName - } + MdsdContainerLogTagName = MdsdContainerLogSourceName + } MdsdInsightsMetricsTagName = MdsdInsightsMetricsSourceName - MdsdKubeMonAgentEventsTagName = MdsdKubeMonAgentEventsSourceName + MdsdKubeMonAgentEventsTagName = MdsdKubeMonAgentEventsSourceName Log("ContainerLogsRouteADX: %v, IsWindows: %v, IsAADMSIAuthMode = %v \n", ContainerLogsRouteADX, IsWindows, IsAADMSIAuthMode) if !ContainerLogsRouteADX && IsWindows && IsAADMSIAuthMode { Log("defaultIngestionAuthTokenRefreshIntervalSeconds = %d \n", defaultIngestionAuthTokenRefreshIntervalSeconds) - IngestionAuthTokenRefreshTicker = time.NewTicker(time.Second * time.Duration(defaultIngestionAuthTokenRefreshIntervalSeconds)) + IngestionAuthTokenRefreshTicker = time.NewTicker(time.Second * 
time.Duration(defaultIngestionAuthTokenRefreshIntervalSeconds)) go refreshIngestionAuthToken() } } diff --git a/source/plugins/go/src/telemetry.go b/source/plugins/go/src/telemetry.go index 31818dbb3..b4f8ab89d 100644 --- a/source/plugins/go/src/telemetry.go +++ b/source/plugins/go/src/telemetry.go @@ -32,6 +32,8 @@ var ( TelemetryClient appinsights.TelemetryClient // ContainerLogTelemetryTicker sends telemetry periodically ContainerLogTelemetryTicker *time.Ticker + //Tracks the number of windows telegraf metrics count with Tags size 64KB or more between telemetry ticker periods (uses ContainerLogTelemetryTicker) + WinTelegrafMetricsCountWithTagsSize64KBorMore float64 //Tracks the number of telegraf metrics sent successfully between telemetry ticker periods (uses ContainerLogTelemetryTicker) TelegrafMetricsSentCount float64 //Tracks the number of send errors between telemetry ticker periods (uses ContainerLogTelemetryTicker) @@ -50,6 +52,8 @@ var ( ContainerLogsSendErrorsToADXFromFluent float64 //Tracks the number of ADX client create errors for containerlogs (uses ContainerLogTelemetryTicker) ContainerLogsADXClientCreateErrors float64 + //Tracks the number of container log records with empty Timestamp (uses ContainerLogTelemetryTicker) + ContainerLogRecordCountWithEmptyTimeStamp float64 //Tracks the number of OSM namespaces and sent only from prometheus sidecar (uses ContainerLogTelemetryTicker) OSMNamespaceCount int //Tracks whether monitor kubernetes pods is set to true and sent only from prometheus sidecar (uses ContainerLogTelemetryTicker) @@ -76,12 +80,14 @@ const ( metricNameNumberofTelegrafMetricsSentSuccessfully = "TelegrafMetricsSentCount" metricNameNumberofSendErrorsTelegrafMetrics = "TelegrafMetricsSendErrorCount" metricNameNumberofSend429ErrorsTelegrafMetrics = "TelegrafMetricsSend429ErrorCount" + metricNameNumberofWinTelegrafMetricsWithTagsSize64KBorMore = "WinTelegrafMetricsCountWithTagsSize64KBorMore" 
metricNameErrorCountContainerLogsSendErrorsToMDSDFromFluent = "ContainerLogs2MdsdSendErrorCount" metricNameErrorCountContainerLogsMDSDClientCreateError = "ContainerLogsMdsdClientCreateErrorCount" metricNameErrorCountInsightsMetricsMDSDClientCreateError = "InsightsMetricsMDSDClientCreateErrorsCount" metricNameErrorCountKubeMonEventsMDSDClientCreateError = "KubeMonEventsMDSDClientCreateErrorsCount" metricNameErrorCountContainerLogsSendErrorsToADXFromFluent = "ContainerLogs2ADXSendErrorCount" metricNameErrorCountContainerLogsADXClientCreateError = "ContainerLogsADXClientCreateErrorCount" + metricNameContainerLogRecordCountWithEmptyTimeStamp = "ContainerLogRecordCountWithEmptyTimeStamp" defaultTelemetryPushIntervalSeconds = 300 @@ -114,6 +120,7 @@ func SendContainerLogPluginMetrics(telemetryPushIntervalProperty string) { telegrafMetricsSentCount := TelegrafMetricsSentCount telegrafMetricsSendErrorCount := TelegrafMetricsSendErrorCount telegrafMetricsSend429ErrorCount := TelegrafMetricsSend429ErrorCount + winTelegrafMetricsCountWithTagsSize64KBorMore := WinTelegrafMetricsCountWithTagsSize64KBorMore containerLogsSendErrorsToMDSDFromFluent := ContainerLogsSendErrorsToMDSDFromFluent containerLogsMDSDClientCreateErrors := ContainerLogsMDSDClientCreateErrors containerLogsSendErrorsToADXFromFluent := ContainerLogsSendErrorsToADXFromFluent @@ -125,10 +132,12 @@ func SendContainerLogPluginMetrics(telemetryPushIntervalProperty string) { promMonitorPodsNamespaceLength := PromMonitorPodsNamespaceLength promMonitorPodsLabelSelectorLength := PromMonitorPodsLabelSelectorLength promMonitorPodsFieldSelectorLength := PromMonitorPodsFieldSelectorLength + containerLogRecordCountWithEmptyTimeStamp := ContainerLogRecordCountWithEmptyTimeStamp TelegrafMetricsSentCount = 0.0 TelegrafMetricsSendErrorCount = 0.0 TelegrafMetricsSend429ErrorCount = 0.0 + WinTelegrafMetricsCountWithTagsSize64KBorMore = 0.0 FlushedRecordsCount = 0.0 FlushedRecordsSize = 0.0 FlushedRecordsTimeTaken = 0.0 @@ -142,6 
+151,7 @@ func SendContainerLogPluginMetrics(telemetryPushIntervalProperty string) { ContainerLogsADXClientCreateErrors = 0.0 InsightsMetricsMDSDClientCreateErrors = 0.0 KubeMonEventsMDSDClientCreateErrors = 0.0 + ContainerLogRecordCountWithEmptyTimeStamp = 0.0 ContainerLogTelemetryMutex.Unlock() if strings.Compare(strings.ToLower(os.Getenv("CONTROLLER_TYPE")), "daemonset") == 0 { @@ -222,6 +232,12 @@ func SendContainerLogPluginMetrics(telemetryPushIntervalProperty string) { if kubeMonEventsMDSDClientCreateErrors > 0.0 { TelemetryClient.Track(appinsights.NewMetricTelemetry(metricNameErrorCountKubeMonEventsMDSDClientCreateError, kubeMonEventsMDSDClientCreateErrors)) } + if winTelegrafMetricsCountWithTagsSize64KBorMore > 0.0 { + TelemetryClient.Track(appinsights.NewMetricTelemetry(metricNameNumberofWinTelegrafMetricsWithTagsSize64KBorMore, winTelegrafMetricsCountWithTagsSize64KBorMore)) + } + if containerLogRecordCountWithEmptyTimeStamp > 0.0 { + TelemetryClient.Track(appinsights.NewMetricTelemetry(metricNameContainerLogRecordCountWithEmptyTimeStamp, containerLogRecordCountWithEmptyTimeStamp)) + } start = time.Now() } diff --git a/source/plugins/go/src/utils.go b/source/plugins/go/src/utils.go index 73c8cf6d3..9ca8980db 100644 --- a/source/plugins/go/src/utils.go +++ b/source/plugins/go/src/utils.go @@ -192,7 +192,7 @@ func CreateADXClient() { //log.Fatalf("Unable to create ADX connection %s", err.Error()) } else { Log("Successfully created ADX Client.
Creating Ingestor...") - ingestor, ingestorErr := ingest.New(client, "containerinsights", "ContainerLogV2") + ingestor, ingestorErr := ingest.New(client, AdxDatabaseName, "ContainerLogV2") if ingestorErr != nil { Log("Error::mdsd::Unable to create ADX ingestor %s", ingestorErr.Error()) } else { diff --git a/source/plugins/ruby/ApplicationInsightsUtility.rb b/source/plugins/ruby/ApplicationInsightsUtility.rb index 7691304a6..eb143c4ba 100644 --- a/source/plugins/ruby/ApplicationInsightsUtility.rb +++ b/source/plugins/ruby/ApplicationInsightsUtility.rb @@ -98,6 +98,13 @@ def initializeUtility() elsif !encodedAppInsightsKey.nil? decodedAppInsightsKey = Base64.decode64(encodedAppInsightsKey) + if @@isWindows + logPath = "/etc/omsagentwindows/appinsights_error.log" + else + logPath = "/var/opt/microsoft/docker-cimprov/log/appinsights_error.log" + end + aiLogger = Logger.new(logPath, 1, 2 * 1024 * 1024) + #override ai endpoint if its available otherwise use default. if appInsightsEndpoint && !appInsightsEndpoint.nil? && !appInsightsEndpoint.empty? 
$log.info("AppInsightsUtility: Telemetry client uses overrided endpoint url : #{appInsightsEndpoint}") @@ -105,20 +112,20 @@ def initializeUtility() #telemetrySynchronousQueue = ApplicationInsights::Channel::SynchronousQueue.new(telemetrySynchronousSender) #telemetryChannel = ApplicationInsights::Channel::TelemetryChannel.new nil, telemetrySynchronousQueue if !isProxyConfigured - sender = ApplicationInsights::Channel::AsynchronousSender.new appInsightsEndpoint + sender = ApplicationInsights::Channel::AsynchronousSender.new appInsightsEndpoint, aiLogger else $log.info("AppInsightsUtility: Telemetry client uses provided proxy configuration since proxy configured") - sender = ApplicationInsights::Channel::AsynchronousSender.new appInsightsEndpoint, @@proxy + sender = ApplicationInsights::Channel::AsynchronousSender.new appInsightsEndpoint, aiLogger, @@proxy end queue = ApplicationInsights::Channel::AsynchronousQueue.new sender channel = ApplicationInsights::Channel::TelemetryChannel.new nil, queue @@Tc = ApplicationInsights::TelemetryClient.new decodedAppInsightsKey, channel else if !isProxyConfigured - sender = ApplicationInsights::Channel::AsynchronousSender.new + sender = ApplicationInsights::Channel::AsynchronousSender.new nil, aiLogger else $log.info("AppInsightsUtility: Telemetry client uses provided proxy configuration since proxy configured") - sender = ApplicationInsights::Channel::AsynchronousSender.new nil, @@proxy + sender = ApplicationInsights::Channel::AsynchronousSender.new nil, aiLogger, @@proxy end queue = ApplicationInsights::Channel::AsynchronousQueue.new sender channel = ApplicationInsights::Channel::TelemetryChannel.new nil, queue diff --git a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb index 63f43eaf1..f3b1ccf57 100644 --- a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb +++ b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb @@ -235,7 +235,6 @@ def getContainerCpuMetricItems(metricJSON, 
hostName, cpuMetricNameToCollect, met metricItem["ObjectName"] = Constants::OBJECT_NAME_K8S_CONTAINER metricItem["InstanceName"] = clusterId + "/" + podUid + "/" + containerName - metricCollection = {} metricCollection["CounterName"] = metricNametoReturn metricCollection["Value"] = metricValue @@ -715,7 +714,6 @@ def getNodeMetricItem(metricJSON, hostName, metricCategory, metricNameToCollect, metricItem["ObjectName"] = Constants::OBJECT_NAME_K8S_NODE metricItem["InstanceName"] = clusterId + "/" + nodeName - metricCollection = {} metricCollection["CounterName"] = metricNametoReturn metricCollection["Value"] = metricValue @@ -861,13 +859,11 @@ def getNodeLastRebootTimeMetric(metricJSON, hostName, metricNametoReturn, metric metricValue = node["startTime"] metricTime = metricPollTime #Time.now.utc.iso8601 #2018-01-30T19:36:14Z - metricItem["Timestamp"] = metricTime metricItem["Host"] = hostName metricItem["ObjectName"] = Constants::OBJECT_NAME_K8S_NODE metricItem["InstanceName"] = clusterId + "/" + nodeName - metricCollection = {} metricCollection["CounterName"] = metricNametoReturn #Read it from /proc/uptime diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index 4afb3d961..787744cee 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -88,7 +88,7 @@ def getTokenStr end end - def getClusterRegion(env=ENV) + def getClusterRegion(env = ENV) if env["AKS_REGION"] return env["AKS_REGION"] else @@ -97,7 +97,7 @@ def getClusterRegion(env=ENV) end end - def getResourceUri(resource, api_group, env=ENV) + def getResourceUri(resource, api_group, env = ENV) begin if env["KUBERNETES_SERVICE_HOST"] && env["KUBERNETES_PORT_443_TCP_PORT"] if api_group.nil? @@ -114,7 +114,7 @@ def getResourceUri(resource, api_group, env=ENV) end end - def getClusterName(env=ENV) + def getClusterName(env = ENV) return @@ClusterName if !@@ClusterName.nil? 
@@ClusterName = "None" begin @@ -148,7 +148,7 @@ def getClusterName(env=ENV) return @@ClusterName end - def getClusterId(env=ENV) + def getClusterId(env = ENV) return @@ClusterId if !@@ClusterId.nil? #By default initialize ClusterId to ClusterName. # In ACS/On-prem, we need to figure out how we can generate ClusterId @@ -778,7 +778,7 @@ def getResourcesAndContinuationToken(uri, api_group: nil) return continuationToken, resourceInventory end #getResourcesAndContinuationToken - def getKubeAPIServerUrl(env=ENV) + def getKubeAPIServerUrl(env = ENV) apiServerUrl = nil begin if env["KUBERNETES_SERVICE_HOST"] && env["KUBERNETES_PORT_443_TCP_PORT"] diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index 7c3e858dd..b9516c2ce 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -119,7 +119,7 @@ class Constants KUBE_MON_AGENT_EVENTS_DATA_TYPE = "KUBE_MON_AGENT_EVENTS_BLOB" KUBE_HEALTH_DATA_TYPE = "KUBE_HEALTH_BLOB" CONTAINERLOGV2_DATA_TYPE = "CONTAINERINSIGHTS_CONTAINERLOGV2" - CONTAINERLOG_DATA_TYPE = "CONTAINER_LOG_BLOB" + CONTAINERLOG_DATA_TYPE = "CONTAINER_LOG_BLOB" #ContainerInsights Extension (AMCS) CI_EXTENSION_NAME = "ContainerInsights" @@ -132,4 +132,8 @@ class Constants LINUX_LOG_PATH = $in_unit_test.nil? ? "/var/opt/microsoft/docker-cimprov/log/" : "./" WINDOWS_LOG_PATH = $in_unit_test.nil? ? 
"/etc/omsagentwindows/" : "./" + + #This is for telemetry to track if any of the windows customer has any of the field size >= 64KB + #To evaluate switching to Windows AMA 64KB impacts any existing customers + MAX_RECORD_OR_FIELD_SIZE_FOR_TELEMETRY = 65536 end diff --git a/source/plugins/ruby/filter_cadvisor2mdm.rb b/source/plugins/ruby/filter_cadvisor2mdm.rb index 4ed0d5bde..32b3a3884 100644 --- a/source/plugins/ruby/filter_cadvisor2mdm.rb +++ b/source/plugins/ruby/filter_cadvisor2mdm.rb @@ -292,7 +292,7 @@ def filter(tag, time, record) end end - def filterPVInsightsMetrics(record) + def filterPVInsightsMetrics(record) begin mdmMetrics = [] if record["Name"] == Constants::PV_USED_BYTES && @metrics_to_collect_hash.key?(record["Name"].downcase) @@ -357,7 +357,7 @@ def ensure_cpu_memory_capacity_and_allocatable_set if !nodeInventory.nil? cpu_capacity_json = KubernetesApiClient.parseNodeLimits(nodeInventory, "capacity", "cpu", "cpuCapacityNanoCores") if !cpu_capacity_json.nil? - metricVal = JSON.parse(cpu_capacity_json[0]["json_Collections"])[0]["Value"] + metricVal = JSON.parse(cpu_capacity_json[0]["json_Collections"])[0]["Value"] if !metricVal.to_s.nil? 
@cpu_capacity = metricVal @log.info "CPU Limit #{@cpu_capacity}" diff --git a/source/plugins/ruby/filter_health_model_builder.rb b/source/plugins/ruby/filter_health_model_builder.rb index 4c6bcb1c1..3262935aa 100644 --- a/source/plugins/ruby/filter_health_model_builder.rb +++ b/source/plugins/ruby/filter_health_model_builder.rb @@ -2,282 +2,276 @@ # frozen_string_literal: true -require 'fluent/plugin/filter' +require "fluent/plugin/filter" module Fluent::Plugin - require_relative 'extension_utils' - require 'logger' - require 'yajl/json_gem' - Dir[File.join(__dir__, './health', '*.rb')].each { |file| require file } - - - class FilterHealthModelBuilder < Filter - include HealthModel - Fluent::Plugin.register_filter('health_model_builder', self) - - config_param :enable_log, :integer, :default => 0 - config_param :log_path, :string, :default => '/var/opt/microsoft/docker-cimprov/log/filter_health_model_builder.log' - config_param :model_definition_path, :default => '/etc/opt/microsoft/docker-cimprov/health/health_model_definition.json' - config_param :health_monitor_config_path, :default => '/etc/opt/microsoft/docker-cimprov/health/healthmonitorconfig.json' - config_param :health_state_serialized_path, :default => '/mnt/azure/health_model_state.json' - attr_reader :buffer, :model_builder, :health_model_definition, :monitor_factory, :state_finalizers, :monitor_set, :model_builder, :hierarchy_builder, :resources, :kube_api_down_handler, :provider, :reducer, :state, :generator, :telemetry - - - @@cluster_id = KubernetesApiClient.getClusterId - @@token_file_path = "/var/run/secrets/kubernetes.io/serviceaccount/token" - @@cert_file_path = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" - @@cluster_health_model_enabled = HealthMonitorUtils.is_cluster_health_model_enabled - - def initialize - begin - super - @rewrite_tag = 'oneagent.containerInsights.KUBE_HEALTH_BLOB' - @buffer = HealthModel::HealthModelBuffer.new - @cluster_health_state = 
ClusterHealthState.new(@@token_file_path, @@cert_file_path) - @health_model_definition = HealthModel::ParentMonitorProvider.new(HealthModel::HealthModelDefinitionParser.new(@model_definition_path).parse_file) - @monitor_factory = HealthModel::MonitorFactory.new - @hierarchy_builder = HealthHierarchyBuilder.new(@health_model_definition, @monitor_factory) - # TODO: Figure out if we need to add NodeMonitorHierarchyReducer to the list of finalizers. For now, dont compress/optimize, since it becomes impossible to construct the model on the UX side - @state_finalizers = [HealthModel::AggregateMonitorStateFinalizer.new] - @monitor_set = HealthModel::MonitorSet.new - @model_builder = HealthModel::HealthModelBuilder.new(@hierarchy_builder, @state_finalizers, @monitor_set) - @kube_api_down_handler = HealthKubeApiDownHandler.new - @resources = HealthKubernetesResources.instance - @reducer = HealthSignalReducer.new - @generator = HealthMissingSignalGenerator.new - @provider = HealthMonitorProvider.new(@@cluster_id, HealthMonitorUtils.get_cluster_labels, @resources, @health_monitor_config_path) - @cluster_old_state = 'none' - @cluster_new_state = 'none' - @container_cpu_memory_records = [] - @telemetry = HealthMonitorTelemetry.new - @state = HealthMonitorState.new - # move network calls to the end. 
This will ensure all the instance variables get initialized - if @@cluster_health_model_enabled - deserialized_state_info = @cluster_health_state.get_state - @state.initialize_state(deserialized_state_info) - end - rescue => e - ApplicationInsightsUtility.sendExceptionTelemetry(e, {"FeatureArea" => "Health"}) - end + require_relative "extension_utils" + require "logger" + require "yajl/json_gem" + Dir[File.join(__dir__, "./health", "*.rb")].each { |file| require file } + + class FilterHealthModelBuilder < Filter + include HealthModel + Fluent::Plugin.register_filter("health_model_builder", self) + + config_param :enable_log, :integer, :default => 0 + config_param :log_path, :string, :default => "/var/opt/microsoft/docker-cimprov/log/filter_health_model_builder.log" + config_param :model_definition_path, :default => "/etc/opt/microsoft/docker-cimprov/health/health_model_definition.json" + config_param :health_monitor_config_path, :default => "/etc/opt/microsoft/docker-cimprov/health/healthmonitorconfig.json" + config_param :health_state_serialized_path, :default => "/mnt/azure/health_model_state.json" + attr_reader :buffer, :model_builder, :health_model_definition, :monitor_factory, :state_finalizers, :monitor_set, :model_builder, :hierarchy_builder, :resources, :kube_api_down_handler, :provider, :reducer, :state, :generator, :telemetry + + @@cluster_id = KubernetesApiClient.getClusterId + @@token_file_path = "/var/run/secrets/kubernetes.io/serviceaccount/token" + @@cert_file_path = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + @@cluster_health_model_enabled = HealthMonitorUtils.is_cluster_health_model_enabled + + def initialize + begin + super + @rewrite_tag = "oneagent.containerInsights.KUBE_HEALTH_BLOB" + @buffer = HealthModel::HealthModelBuffer.new + @cluster_health_state = ClusterHealthState.new(@@token_file_path, @@cert_file_path) + @health_model_definition = 
HealthModel::ParentMonitorProvider.new(HealthModel::HealthModelDefinitionParser.new(@model_definition_path).parse_file) + @monitor_factory = HealthModel::MonitorFactory.new + @hierarchy_builder = HealthHierarchyBuilder.new(@health_model_definition, @monitor_factory) + # TODO: Figure out if we need to add NodeMonitorHierarchyReducer to the list of finalizers. For now, dont compress/optimize, since it becomes impossible to construct the model on the UX side + @state_finalizers = [HealthModel::AggregateMonitorStateFinalizer.new] + @monitor_set = HealthModel::MonitorSet.new + @model_builder = HealthModel::HealthModelBuilder.new(@hierarchy_builder, @state_finalizers, @monitor_set) + @kube_api_down_handler = HealthKubeApiDownHandler.new + @resources = HealthKubernetesResources.instance + @reducer = HealthSignalReducer.new + @generator = HealthMissingSignalGenerator.new + @provider = HealthMonitorProvider.new(@@cluster_id, HealthMonitorUtils.get_cluster_labels, @resources, @health_monitor_config_path) + @cluster_old_state = "none" + @cluster_new_state = "none" + @container_cpu_memory_records = [] + @telemetry = HealthMonitorTelemetry.new + @state = HealthMonitorState.new + # move network calls to the end. 
This will ensure all the instance variables get initialized + if @@cluster_health_model_enabled + deserialized_state_info = @cluster_health_state.get_state + @state.initialize_state(deserialized_state_info) end + rescue => e + ApplicationInsightsUtility.sendExceptionTelemetry(e, { "FeatureArea" => "Health" }) + end + end - def configure(conf) - begin - super - @log = nil - if @enable_log - @log = Logger.new(@log_path, 'weekly') - @log.info 'Starting filter_health_model_builder plugin' - end - rescue => e - ApplicationInsightsUtility.sendExceptionTelemetry(e, {"FeatureArea" => "Health"}) - end + def configure(conf) + begin + super + @log = nil + if @enable_log + @log = Logger.new(@log_path, "weekly") + @log.info "Starting filter_health_model_builder plugin" end + rescue => e + ApplicationInsightsUtility.sendExceptionTelemetry(e, { "FeatureArea" => "Health" }) + end + end - def start - super - end + def start + super + end - def shutdown - super + def shutdown + super + end + + def filter_stream(tag, es) + if !@@cluster_health_model_enabled + @log.info "Cluster Health Model disabled in filter_health_model_builder" + return Fluent::MultiEventStream.new + end + begin + new_es = Fluent::MultiEventStream.new + time = Time.now + if ExtensionUtils.isAADMSIAuthMode() + $log.info("filter_health_model_builder::enumerate: AAD AUTH MSI MODE") + if @rewrite_tag.nil? || !@rewrite_tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @rewrite_tag = ExtensionUtils.getOutputStreamId(Constants::KUBE_HEALTH_DATA_TYPE) + end + $log.info("filter_health_model_builder::filter_stream: using tag -#{@rewrite_tag} @ #{Time.now.utc.iso8601}") end - def filter_stream(tag, es) - if !@@cluster_health_model_enabled - @log.info "Cluster Health Model disabled in filter_health_model_builder" - return Fluent::MultiEventStream.new + if tag.start_with?("kubehealth.DaemonSet.Node") + node_records = [] + if !es.nil? 
+ es.each { |time, record| + node_records.push(record) + } + @buffer.add_to_buffer(node_records) + end + return Fluent::MultiEventStream.new + elsif tag.start_with?("kubehealth.DaemonSet.Container") + container_records = [] + if !es.nil? + es.each { |time, record| + container_records.push(record) + } + end + container_records_aggregator = HealthContainerCpuMemoryAggregator.new(@resources, @provider) + if @container_cpu_memory_records.nil? + @log.info "@container_cpu_memory_records was not initialized" + @container_cpu_memory_records = [] #in some clusters, this is null, so initialize it again. + end + @container_cpu_memory_records.push(*container_records) # push the records for aggregation later + return Fluent::MultiEventStream.new + elsif tag.start_with?("kubehealth.ReplicaSet") + records = [] + es.each { |time, record| + records.push(record) + } + @buffer.add_to_buffer(records) # in_kube_health records + + aggregated_container_records = [] + if !@container_cpu_memory_records.nil? && !@container_cpu_memory_records.empty? 
+ container_records_aggregator = HealthContainerCpuMemoryAggregator.new(@resources, @provider) + deduped_records = container_records_aggregator.dedupe_records(@container_cpu_memory_records) + container_records_aggregator.aggregate(deduped_records) + container_records_aggregator.compute_state + aggregated_container_records = container_records_aggregator.get_records + end + @buffer.add_to_buffer(aggregated_container_records) #container cpu/memory records + records_to_process = @buffer.get_buffer + @buffer.reset_buffer + @container_cpu_memory_records = [] + + health_monitor_records = [] + records_to_process.each do |record| + monitor_instance_id = record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] + monitor_id = record[HealthMonitorRecordFields::MONITOR_ID] + #HealthMonitorRecord + health_monitor_record = HealthMonitorRecord.new( + record[HealthMonitorRecordFields::MONITOR_ID], + record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID], + record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED], + record[HealthMonitorRecordFields::DETAILS]["state"], + @provider.get_labels(record), + @provider.get_config(monitor_id), + record[HealthMonitorRecordFields::DETAILS] + ) + health_monitor_records.push(health_monitor_record) + #puts "#{monitor_instance_id} #{instance_state.new_state} #{instance_state.old_state} #{instance_state.should_send}" + end + + @log.info "health_monitor_records.size #{health_monitor_records.size}" + # Dedupe daemonset signals + # Remove unit monitor signals for “gone” objects + # update state for the reduced set of signals + reduced_records = @reducer.reduce_signals(health_monitor_records, @resources) + reduced_records.each { |record| + @state.update_state(record, + @provider.get_config(record.monitor_id), + false, + @telemetry) + # get the health state based on the monitor's operational state + # update state calls updates the state of the monitor based on configuration and history of the the monitor records + record.state = 
@state.get_state(record.monitor_instance_id).new_state + } + @log.info "after deduping and removing gone objects reduced_records.size #{reduced_records.size}" + + reduced_records = @kube_api_down_handler.handle_kube_api_down(reduced_records) + @log.info "after kube api down handler health_monitor_records.size #{health_monitor_records.size}" + + #get the list of 'none' and 'unknown' signals + missing_signals = @generator.get_missing_signals(@@cluster_id, reduced_records, @resources, @provider) + + @log.info "after getting missing signals missing_signals.size #{missing_signals.size}" + #update state for missing signals + missing_signals.each { |signal| + @state.update_state(signal, @provider.get_config(signal.monitor_id), false, @telemetry) + @log.info "After Updating #{@state.get_state(signal.monitor_instance_id)} #{@state.get_state(signal.monitor_instance_id).new_state}" + # for unknown/none records, update the "monitor state" to be the latest state (new_state) of the monitor instance from the state + signal.state = @state.get_state(signal.monitor_instance_id).new_state + } + + @generator.update_last_received_records(reduced_records) + all_records = reduced_records.clone + all_records.push(*missing_signals) + + @log.info "after Adding missing signals all_records.size #{all_records.size}" + + HealthMonitorHelpers.add_agentpool_node_label_if_not_present(all_records) + + # build the health model + @model_builder.process_records(all_records) + all_monitors = @model_builder.finalize_model + + @log.info "after building health_model #{all_monitors.size}" + + # update the state for aggregate monitors (unit monitors are updated above) + all_monitors.each { |monitor_instance_id, monitor| + if monitor.is_aggregate_monitor + @state.update_state(monitor, + @provider.get_config(monitor.monitor_id), + true, + @telemetry) end - begin - new_es = Fluent::MultiEventStream.new - time = Time.now - if ExtensionUtils.isAADMSIAuthMode() - $log.info("filter_health_model_builder::enumerate: 
AAD AUTH MSI MODE") - if @rewrite_tag.nil? || !@rewrite_tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) - @rewrite_tag = ExtensionUtils.getOutputStreamId(Constants::KUBE_HEALTH_DATA_TYPE) - end - $log.info("filter_health_model_builder::filter_stream: using tag -#{@rewrite_tag} @ #{Time.now.utc.iso8601}") - end - - if tag.start_with?("kubehealth.DaemonSet.Node") - node_records = [] - if !es.nil? - es.each{|time, record| - node_records.push(record) - } - @buffer.add_to_buffer(node_records) - end - return Fluent::MultiEventStream.new - elsif tag.start_with?("kubehealth.DaemonSet.Container") - container_records = [] - if !es.nil? - es.each{|time, record| - container_records.push(record) - } - end - container_records_aggregator = HealthContainerCpuMemoryAggregator.new(@resources, @provider) - if @container_cpu_memory_records.nil? - @log.info "@container_cpu_memory_records was not initialized" - @container_cpu_memory_records = [] #in some clusters, this is null, so initialize it again. - end - @container_cpu_memory_records.push(*container_records) # push the records for aggregation later - return Fluent::MultiEventStream.new - elsif tag.start_with?("kubehealth.ReplicaSet") - records = [] - es.each{|time, record| - records.push(record) - } - @buffer.add_to_buffer(records) # in_kube_health records - - aggregated_container_records = [] - if !@container_cpu_memory_records.nil? && !@container_cpu_memory_records.empty? 
- container_records_aggregator = HealthContainerCpuMemoryAggregator.new(@resources, @provider) - deduped_records = container_records_aggregator.dedupe_records(@container_cpu_memory_records) - container_records_aggregator.aggregate(deduped_records) - container_records_aggregator.compute_state - aggregated_container_records = container_records_aggregator.get_records - end - @buffer.add_to_buffer(aggregated_container_records) #container cpu/memory records - records_to_process = @buffer.get_buffer - @buffer.reset_buffer - @container_cpu_memory_records = [] - health_monitor_records = [] - records_to_process.each do |record| - monitor_instance_id = record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID] - monitor_id = record[HealthMonitorRecordFields::MONITOR_ID] - #HealthMonitorRecord - health_monitor_record = HealthMonitorRecord.new( - record[HealthMonitorRecordFields::MONITOR_ID], - record[HealthMonitorRecordFields::MONITOR_INSTANCE_ID], - record[HealthMonitorRecordFields::TIME_FIRST_OBSERVED], - record[HealthMonitorRecordFields::DETAILS]["state"], - @provider.get_labels(record), - @provider.get_config(monitor_id), - record[HealthMonitorRecordFields::DETAILS] - ) - health_monitor_records.push(health_monitor_record) - #puts "#{monitor_instance_id} #{instance_state.new_state} #{instance_state.old_state} #{instance_state.should_send}" - end + instance_state = @state.get_state(monitor_instance_id) + #puts "#{monitor_instance_id} #{instance_state.new_state} #{instance_state.old_state} #{instance_state.should_send}" + should_send = instance_state.should_send - @log.info "health_monitor_records.size #{health_monitor_records.size}" - # Dedupe daemonset signals - # Remove unit monitor signals for “gone” objects - # update state for the reduced set of signals - reduced_records = @reducer.reduce_signals(health_monitor_records, @resources) - reduced_records.each{|record| - @state.update_state(record, - @provider.get_config(record.monitor_id), - false, - @telemetry - ) - # get the 
health state based on the monitor's operational state - # update state calls updates the state of the monitor based on configuration and history of the the monitor records - record.state = @state.get_state(record.monitor_instance_id).new_state - } - @log.info "after deduping and removing gone objects reduced_records.size #{reduced_records.size}" - - reduced_records = @kube_api_down_handler.handle_kube_api_down(reduced_records) - @log.info "after kube api down handler health_monitor_records.size #{health_monitor_records.size}" - - #get the list of 'none' and 'unknown' signals - missing_signals = @generator.get_missing_signals(@@cluster_id, reduced_records, @resources, @provider) - - @log.info "after getting missing signals missing_signals.size #{missing_signals.size}" - #update state for missing signals - missing_signals.each{|signal| - - @state.update_state(signal, @provider.get_config(signal.monitor_id), false, @telemetry) - @log.info "After Updating #{@state.get_state(signal.monitor_instance_id)} #{@state.get_state(signal.monitor_instance_id).new_state}" - # for unknown/none records, update the "monitor state" to be the latest state (new_state) of the monitor instance from the state - signal.state = @state.get_state(signal.monitor_instance_id).new_state - } - - @generator.update_last_received_records(reduced_records) - all_records = reduced_records.clone - all_records.push(*missing_signals) - - @log.info "after Adding missing signals all_records.size #{all_records.size}" - - HealthMonitorHelpers.add_agentpool_node_label_if_not_present(all_records) - - # build the health model - @model_builder.process_records(all_records) - all_monitors = @model_builder.finalize_model - - @log.info "after building health_model #{all_monitors.size}" - - # update the state for aggregate monitors (unit monitors are updated above) - all_monitors.each{|monitor_instance_id, monitor| - if monitor.is_aggregate_monitor - @state.update_state(monitor, - 
@provider.get_config(monitor.monitor_id), - true, - @telemetry - ) - end - - instance_state = @state.get_state(monitor_instance_id) - #puts "#{monitor_instance_id} #{instance_state.new_state} #{instance_state.old_state} #{instance_state.should_send}" - should_send = instance_state.should_send - - # always send cluster monitor as a heartbeat - if !should_send && monitor_instance_id != MonitorId::CLUSTER - all_monitors.delete(monitor_instance_id) - end - } - - @log.info "after optimizing health signals all_monitors.size #{all_monitors.size}" - - # for each key in monitor.keys, - # get the state from health_monitor_state - # generate the record to send - emit_time = Fluent::Engine.now - all_monitors.keys.each{|key| - record = @provider.get_record(all_monitors[key], state) - if record[HealthMonitorRecordFields::MONITOR_ID] == MonitorId::CLUSTER - if !record[HealthMonitorRecordFields::DETAILS].nil? - details = JSON.parse(record[HealthMonitorRecordFields::DETAILS]) - details[HealthMonitorRecordFields::HEALTH_MODEL_DEFINITION_VERSION] = "#{ENV['HEALTH_MODEL_DEFINITION_VERSION']}" - record[HealthMonitorRecordFields::DETAILS] = details.to_json - end - if all_monitors.size > 1 - old_state = record[HealthMonitorRecordFields::OLD_STATE] - new_state = record[HealthMonitorRecordFields::NEW_STATE] - if old_state != new_state && @cluster_old_state != old_state && @cluster_new_state != new_state - ApplicationInsightsUtility.sendCustomEvent("HealthModel_ClusterStateChanged",{"old_state" => old_state , "new_state" => new_state, "monitor_count" => all_monitors.size}) - @log.info "sent telemetry for cluster state change from #{record['OldState']} to #{record['NewState']}" - @cluster_old_state = old_state - @cluster_new_state = new_state - end - end - end - new_es.add(emit_time, record) - } - - #emit the stream - router.emit_stream(@rewrite_tag, new_es) - - #initialize monitor_set and model_builder - @monitor_set = HealthModel::MonitorSet.new - @model_builder = 
HealthModel::HealthModelBuilder.new(@hierarchy_builder, @state_finalizers, @monitor_set) - - #update cluster state custom resource - @cluster_health_state.update_state(@state.to_h) - @telemetry.send - # return an empty event stream, else the match will throw a NoMethodError - return Fluent::MultiEventStream.new - elsif tag.start_with?(@rewrite_tag) - # this filter also acts as a pass through as we are rewriting the tag and emitting to the fluent stream - es - else - raise "Invalid tag #{tag} received" + # always send cluster monitor as a heartbeat + if !should_send && monitor_instance_id != MonitorId::CLUSTER + all_monitors.delete(monitor_instance_id) + end + } + + @log.info "after optimizing health signals all_monitors.size #{all_monitors.size}" + + # for each key in monitor.keys, + # get the state from health_monitor_state + # generate the record to send + emit_time = Fluent::Engine.now + all_monitors.keys.each { |key| + record = @provider.get_record(all_monitors[key], state) + if record[HealthMonitorRecordFields::MONITOR_ID] == MonitorId::CLUSTER + if !record[HealthMonitorRecordFields::DETAILS].nil? 
+ details = JSON.parse(record[HealthMonitorRecordFields::DETAILS]) + details[HealthMonitorRecordFields::HEALTH_MODEL_DEFINITION_VERSION] = "#{ENV["HEALTH_MODEL_DEFINITION_VERSION"]}" + record[HealthMonitorRecordFields::DETAILS] = details.to_json + end + if all_monitors.size > 1 + old_state = record[HealthMonitorRecordFields::OLD_STATE] + new_state = record[HealthMonitorRecordFields::NEW_STATE] + if old_state != new_state && @cluster_old_state != old_state && @cluster_new_state != new_state + ApplicationInsightsUtility.sendCustomEvent("HealthModel_ClusterStateChanged", { "old_state" => old_state, "new_state" => new_state, "monitor_count" => all_monitors.size }) + @log.info "sent telemetry for cluster state change from #{record["OldState"]} to #{record["NewState"]}" + @cluster_old_state = old_state + @cluster_new_state = new_state end - - rescue => e - ApplicationInsightsUtility.sendExceptionTelemetry(e, {"FeatureArea" => "Health"}) - @log.warn "Message: #{e.message} Backtrace: #{e.backtrace}" - return nil + end end + new_es.add(emit_time, record) + } + + #emit the stream + router.emit_stream(@rewrite_tag, new_es) + + #initialize monitor_set and model_builder + @monitor_set = HealthModel::MonitorSet.new + @model_builder = HealthModel::HealthModelBuilder.new(@hierarchy_builder, @state_finalizers, @monitor_set) + + #update cluster state custom resource + @cluster_health_state.update_state(@state.to_h) + @telemetry.send + # return an empty event stream, else the match will throw a NoMethodError + return Fluent::MultiEventStream.new + elsif tag.start_with?(@rewrite_tag) + # this filter also acts as a pass through as we are rewriting the tag and emitting to the fluent stream + es + else + raise "Invalid tag #{tag} received" end + rescue => e + ApplicationInsightsUtility.sendExceptionTelemetry(e, { "FeatureArea" => "Health" }) + @log.warn "Message: #{e.message} Backtrace: #{e.backtrace}" + return nil + end end + end end diff --git a/source/plugins/ruby/in_cadvisor_perf.rb 
b/source/plugins/ruby/in_cadvisor_perf.rb index 862e88e44..aba24ecc2 100644 --- a/source/plugins/ruby/in_cadvisor_perf.rb +++ b/source/plugins/ruby/in_cadvisor_perf.rb @@ -64,12 +64,12 @@ def enumerate() begin eventStream = Fluent::MultiEventStream.new insightsMetricsEventStream = Fluent::MultiEventStream.new - metricData = CAdvisorMetricsAPIClient.getMetrics(winNode: nil, metricTime: batchTime ) + metricData = CAdvisorMetricsAPIClient.getMetrics(winNode: nil, metricTime: batchTime) metricData.each do |record| eventStream.add(time, record) if record end - if ExtensionUtils.isAADMSIAuthMode() + if ExtensionUtils.isAADMSIAuthMode() && !@@isWindows.nil? && @@isWindows == false $log.info("in_cadvisor_perf::enumerate: AAD AUTH MSI MODE") if @tag.nil? || !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) @tag = ExtensionUtils.getOutputStreamId(Constants::PERF_DATA_TYPE) @@ -77,7 +77,7 @@ def enumerate() if @insightsmetricstag.nil? || !@insightsmetricstag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) @insightsmetricstag = ExtensionUtils.getOutputStreamId(Constants::INSIGHTS_METRICS_DATA_TYPE) end - $log.info("in_cadvisor_perf::enumerate: using perf tag -#{@tag} @ #{Time.now.utc.iso8601}") + $log.info("in_cadvisor_perf::enumerate: using perf tag -#{@tag} @ #{Time.now.utc.iso8601}") $log.info("in_cadvisor_perf::enumerate: using insightsmetrics tag -#{@insightsmetricstag} @ #{Time.now.utc.iso8601}") end router.emit_stream(@tag, eventStream) if eventStream @@ -95,9 +95,9 @@ def enumerate() containerGPUusageInsightsMetricsDataItems = [] containerGPUusageInsightsMetricsDataItems.concat(CAdvisorMetricsAPIClient.getInsightsMetrics(winNode: nil, metricTime: batchTime)) - containerGPUusageInsightsMetricsDataItems.each do |insightsMetricsRecord| - insightsMetricsEventStream.add(time, insightsMetricsRecord) if insightsMetricsRecord - end + containerGPUusageInsightsMetricsDataItems.each do |insightsMetricsRecord| + insightsMetricsEventStream.add(time, 
insightsMetricsRecord) if insightsMetricsRecord + end router.emit_stream(@insightsmetricstag, insightsMetricsEventStream) if insightsMetricsEventStream router.emit_stream(@mdmtag, insightsMetricsEventStream) if insightsMetricsEventStream diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index cb52243a0..fa6477856 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -1,17 +1,17 @@ #!/usr/local/bin/ruby # frozen_string_literal: true -require 'fluent/plugin/input' +require "fluent/plugin/input" module Fluent::Plugin class Kube_nodeInventory_Input < Input Fluent::Plugin.register_input("kube_nodes", self) - def initialize (kubernetesApiClient=nil, - applicationInsightsUtility=nil, - extensionUtils=nil, - env=nil, - telemetry_flush_interval=nil) + def initialize(kubernetesApiClient = nil, + applicationInsightsUtility = nil, + extensionUtils = nil, + env = nil, + telemetry_flush_interval = nil) super() require "yaml" @@ -37,7 +37,6 @@ def initialize (kubernetesApiClient=nil, @@osmConfigMountPath = "/etc/config/osm-settings/osm-metric-collection-configuration" @@AzStackCloudFileName = "/etc/kubernetes/host/azurestackcloud.json" - @@rsPromInterval = @env["TELEMETRY_RS_PROM_INTERVAL"] @@rsPromFieldPassCount = @env["TELEMETRY_RS_PROM_FIELDPASS_LENGTH"] @@rsPromFieldDropCount = @env["TELEMETRY_RS_PROM_FIELDDROP_LENGTH"] @@ -119,6 +118,32 @@ def enumerate nodeInventory = nil currentTime = Time.now batchTime = currentTime.utc.iso8601 + nodeCount = 0 + + @nodesAPIE2ELatencyMs = 0 + @nodeInventoryE2EProcessingLatencyMs = 0 + nodeInventoryStartTime = (Time.now.to_f * 1000).to_i + + if @extensionUtils.isAADMSIAuthMode() + $log.info("in_kube_nodes::enumerate: AAD AUTH MSI MODE") + if @kubeperfTag.nil? || !@kubeperfTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @kubeperfTag = @extensionUtils.getOutputStreamId(Constants::PERF_DATA_TYPE) + end + if @insightsMetricsTag.nil? 
|| !@insightsMetricsTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @insightsMetricsTag = @extensionUtils.getOutputStreamId(Constants::INSIGHTS_METRICS_DATA_TYPE) + end + if @ContainerNodeInventoryTag.nil? || !@ContainerNodeInventoryTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @ContainerNodeInventoryTag = @extensionUtils.getOutputStreamId(Constants::CONTAINER_NODE_INVENTORY_DATA_TYPE) + end + if @tag.nil? || !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @tag = @extensionUtils.getOutputStreamId(Constants::KUBE_NODE_INVENTORY_DATA_TYPE) + end + $log.info("in_kube_nodes::enumerate: using perf tag -#{@kubeperfTag} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_nodes::enumerate: using insightsmetrics tag -#{@insightsMetricsTag} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_nodes::enumerate: using containernodeinventory tag -#{@ContainerNodeInventoryTag} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_nodes::enumerate: using kubenodeinventory tag -#{@tag} @ #{Time.now.utc.iso8601}") + end + nodesAPIChunkStartTime = (Time.now.to_f * 1000).to_i @nodesAPIE2ELatencyMs = 0 @nodeInventoryE2EProcessingLatencyMs = 0 @@ -138,7 +163,7 @@ def enumerate if @tag.nil? 
|| !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) @tag = @extensionUtils.getOutputStreamId(Constants::KUBE_NODE_INVENTORY_DATA_TYPE) end - $log.info("in_kube_nodes::enumerate: using perf tag -#{@kubeperfTag} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_nodes::enumerate: using perf tag -#{@kubeperfTag} @ #{Time.now.utc.iso8601}") $log.info("in_kube_nodes::enumerate: using insightsmetrics tag -#{@insightsMetricsTag} @ #{Time.now.utc.iso8601}") $log.info("in_kube_nodes::enumerate: using containernodeinventory tag -#{@ContainerNodeInventoryTag} @ #{Time.now.utc.iso8601}") $log.info("in_kube_nodes::enumerate: using kubenodeinventory tag -#{@tag} @ #{Time.now.utc.iso8601}") @@ -155,6 +180,7 @@ def enumerate nodesAPIChunkEndTime = (Time.now.to_f * 1000).to_i @nodesAPIE2ELatencyMs = (nodesAPIChunkEndTime - nodesAPIChunkStartTime) if (!nodeInventory.nil? && !nodeInventory.empty? && nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) + nodeCount += nodeInventory["items"].length $log.info("in_kube_nodes::enumerate : number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") parse_and_emit_records(nodeInventory, batchTime) else @@ -168,6 +194,7 @@ def enumerate nodesAPIChunkEndTime = (Time.now.to_f * 1000).to_i @nodesAPIE2ELatencyMs = @nodesAPIE2ELatencyMs + (nodesAPIChunkEndTime - nodesAPIChunkStartTime) if (!nodeInventory.nil? && !nodeInventory.empty? && nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) 
+ nodeCount += nodeInventory["items"].length $log.info("in_kube_nodes::enumerate : number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") parse_and_emit_records(nodeInventory, batchTime) else @@ -181,6 +208,7 @@ def enumerate if (timeDifferenceInMinutes >= @TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) @applicationInsightsUtility.sendMetricTelemetry("NodeInventoryE2EProcessingLatencyMs", @nodeInventoryE2EProcessingLatencyMs, {}) @applicationInsightsUtility.sendMetricTelemetry("NodesAPIE2ELatencyMs", @nodesAPIE2ELatencyMs, {}) + @applicationInsightsUtility.sendMetricTelemetry("NodeCount", nodeCount, {}) @@nodeInventoryLatencyTelemetryTimeTracker = DateTime.now.to_time.to_i end # Setting this to nil so that we dont hold memory until GC kicks in @@ -208,9 +236,9 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) nodeInventoryRecord = getNodeInventoryRecord(item, batchTime) eventStream.add(emitTime, nodeInventoryRecord) if nodeInventoryRecord if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && eventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE - $log.info("in_kube_node::parse_and_emit_records: number of node inventory records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_node::parse_and_emit_records: number of node inventory records emitted #{eventStream.count} @ #{Time.now.utc.iso8601}") router.emit_stream(@tag, eventStream) if eventStream - $log.info("in_kube_node::parse_and_emit_records: number of mdm node inventory records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_node::parse_and_emit_records: number of mdm node inventory records emitted #{eventStream.count} @ #{Time.now.utc.iso8601}") router.emit_stream(@MDMKubeNodeInventoryTag, eventStream) if eventStream if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) $log.info("kubeNodeInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") @@ -223,7 +251,7 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) containerNodeInventoryEventStream.add(emitTime, containerNodeInventoryRecord) if containerNodeInventoryRecord if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && containerNodeInventoryEventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE - $log.info("in_kube_node::parse_and_emit_records: number of container node inventory records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_node::parse_and_emit_records: number of container node inventory records emitted #{containerNodeInventoryEventStream.count} @ #{Time.now.utc.iso8601}") router.emit_stream(@ContainerNodeInventoryTag, containerNodeInventoryEventStream) if containerNodeInventoryEventStream containerNodeInventoryEventStream = Fluent::MultiEventStream.new if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) @@ -272,7 +300,7 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) kubePerfEventStream.add(emitTime, metricRecord) if metricRecord end if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && kubePerfEventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE - $log.info("in_kube_nodes::parse_and_emit_records: number of node perf metric records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_nodes::parse_and_emit_records: number of node perf metric records emitted #{kubePerfEventStream.count} @ #{Time.now.utc.iso8601}") router.emit_stream(@kubeperfTag, kubePerfEventStream) if kubePerfEventStream kubePerfEventStream = Fluent::MultiEventStream.new if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) @@ -302,7 +330,7 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) insightsMetricsEventStream.add(emitTime, insightsMetricsRecord) if insightsMetricsRecord end if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && insightsMetricsEventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE - $log.info("in_kube_nodes::parse_and_emit_records: number of GPU node perf metric records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_nodes::parse_and_emit_records: number of GPU node perf metric records emitted #{insightsMetricsEventStream.count} @ #{Time.now.utc.iso8601}") router.emit_stream(@insightsMetricsTag, insightsMetricsEventStream) if insightsMetricsEventStream insightsMetricsEventStream = Fluent::MultiEventStream.new if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) @@ -572,12 +600,12 @@ def getNodeTelemetryProps(item) return properties end end # Kube_Node_Input + class NodeStatsCache # inner class for caching implementation (CPU and memory caching is handled the exact same way, so logic to do so is moved to a private inner class) # (to reduce code duplication) class NodeCache - - @@RECORD_TIME_TO_LIVE = 60*20 # units are seconds, so clear the cache every 20 minutes. + @@RECORD_TIME_TO_LIVE = 60 * 20 # units are seconds, so clear the cache every 20 minutes. 
def initialize @cacheHash = {} @@ -622,7 +650,7 @@ def clean_cache() end end - nodes_to_remove.each {|node_name| + nodes_to_remove.each { |node_name| @cacheHash.delete(node_name) @timeAdded.delete(node_name) } @@ -630,7 +658,6 @@ def clean_cache() end end # NodeCache - @@cpuCache = NodeCache.new @@memCache = NodeCache.new diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 5a33ef790..f979ef7c5 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -1,7 +1,7 @@ #!/usr/local/bin/ruby # frozen_string_literal: true -require 'fluent/plugin/input' +require "fluent/plugin/input" module Fluent::Plugin require_relative "podinventory_to_mdm" @@ -34,9 +34,16 @@ def initialize @PODS_EMIT_STREAM_BATCH_SIZE = 0 @podCount = 0 + @containerCount = 0 @serviceCount = 0 @controllerSet = Set.new [] @winContainerCount = 0 + @windowsNodeCount = 0 + @winContainerInventoryTotalSizeBytes = 0 + @winContainerCountWithInventoryRecordSize64KBOrMore = 0 + @winContainerCountWithEnvVarSize64KBOrMore = 0 + @winContainerCountWithPortsSize64KBOrMore = 0 + @winContainerCountWithCommandSize64KBOrMore = 0 @controllerData = {} @podInventoryE2EProcessingLatencyMs = 0 @podsAPIE2ELatencyMs = 0 @@ -99,9 +106,16 @@ def enumerate(podList = nil) podInventory = podList telemetryFlush = false @podCount = 0 + @containerCount = 0 @serviceCount = 0 @controllerSet = Set.new [] @winContainerCount = 0 + @winContainerInventoryTotalSizeBytes = 0 + @winContainerCountWithInventoryRecordSize64KBOrMore = 0 + @winContainerCountWithEnvVarSize64KBOrMore = 0 + @winContainerCountWithPortsSize64KBOrMore = 0 + @winContainerCountWithCommandSize64KBOrMore = 0 + @windowsNodeCount = 0 @controllerData = {} currentTime = Time.now batchTime = currentTime.utc.iso8601 @@ -109,27 +123,27 @@ def enumerate(podList = nil) @podInventoryE2EProcessingLatencyMs = 0 podInventoryStartTime = (Time.now.to_f * 1000).to_i if 
ExtensionUtils.isAADMSIAuthMode() - $log.info("in_kube_podinventory::enumerate: AAD AUTH MSI MODE") - if @kubeperfTag.nil? || !@kubeperfTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) - @kubeperfTag = ExtensionUtils.getOutputStreamId(Constants::PERF_DATA_TYPE) - end - if @kubeservicesTag.nil? || !@kubeservicesTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) - @kubeservicesTag = ExtensionUtils.getOutputStreamId(Constants::KUBE_SERVICES_DATA_TYPE) - end - if @containerInventoryTag.nil? || !@containerInventoryTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) - @containerInventoryTag = ExtensionUtils.getOutputStreamId(Constants::CONTAINER_INVENTORY_DATA_TYPE) - end - if @insightsMetricsTag.nil? || !@insightsMetricsTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) - @insightsMetricsTag = ExtensionUtils.getOutputStreamId(Constants::INSIGHTS_METRICS_DATA_TYPE) - end - if @tag.nil? || !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) - @tag = ExtensionUtils.getOutputStreamId(Constants::KUBE_POD_INVENTORY_DATA_TYPE) - end - $log.info("in_kube_podinventory::enumerate: using perf tag -#{@kubeperfTag} @ #{Time.now.utc.iso8601}") - $log.info("in_kube_podinventory::enumerate: using kubeservices tag -#{@kubeservicesTag} @ #{Time.now.utc.iso8601}") - $log.info("in_kube_podinventory::enumerate: using containerinventory tag -#{@containerInventoryTag} @ #{Time.now.utc.iso8601}") - $log.info("in_kube_podinventory::enumerate: using insightsmetrics tag -#{@insightsMetricsTag} @ #{Time.now.utc.iso8601}") - $log.info("in_kube_podinventory::enumerate: using kubepodinventory tag -#{@tag} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::enumerate: AAD AUTH MSI MODE") + if @kubeperfTag.nil? || !@kubeperfTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @kubeperfTag = ExtensionUtils.getOutputStreamId(Constants::PERF_DATA_TYPE) + end + if @kubeservicesTag.nil? 
|| !@kubeservicesTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @kubeservicesTag = ExtensionUtils.getOutputStreamId(Constants::KUBE_SERVICES_DATA_TYPE) + end + if @containerInventoryTag.nil? || !@containerInventoryTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @containerInventoryTag = ExtensionUtils.getOutputStreamId(Constants::CONTAINER_INVENTORY_DATA_TYPE) + end + if @insightsMetricsTag.nil? || !@insightsMetricsTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @insightsMetricsTag = ExtensionUtils.getOutputStreamId(Constants::INSIGHTS_METRICS_DATA_TYPE) + end + if @tag.nil? || !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @tag = ExtensionUtils.getOutputStreamId(Constants::KUBE_POD_INVENTORY_DATA_TYPE) + end + $log.info("in_kube_podinventory::enumerate: using perf tag -#{@kubeperfTag} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::enumerate: using kubeservices tag -#{@kubeservicesTag} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::enumerate: using containerinventory tag -#{@containerInventoryTag} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::enumerate: using insightsmetrics tag -#{@insightsMetricsTag} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::enumerate: using kubepodinventory tag -#{@tag} @ #{Time.now.utc.iso8601}") end # Get services first so that we dont need to make a call for very chunk @@ -201,11 +215,24 @@ def enumerate(podList = nil) telemetryProperties["PODS_EMIT_STREAM_BATCH_SIZE"] = @PODS_EMIT_STREAM_BATCH_SIZE ApplicationInsightsUtility.sendCustomEvent("KubePodInventoryHeartBeatEvent", telemetryProperties) ApplicationInsightsUtility.sendMetricTelemetry("PodCount", @podCount, {}) + ApplicationInsightsUtility.sendMetricTelemetry("ContainerCount", @containerCount, {}) ApplicationInsightsUtility.sendMetricTelemetry("ServiceCount", @serviceCount, {}) telemetryProperties["ControllerData"] = 
@controllerData.to_json ApplicationInsightsUtility.sendMetricTelemetry("ControllerCount", @controllerSet.length, telemetryProperties) if @winContainerCount > 0 telemetryProperties["ClusterWideWindowsContainersCount"] = @winContainerCount + telemetryProperties["WindowsNodeCount"] = @windowsNodeCount + telemetryProperties["ClusterWideWindowsContainerInventoryTotalSizeKB"] = @winContainerInventoryTotalSizeBytes / 1024 + telemetryProperties["WindowsContainerCountWithInventoryRecordSize64KBorMore"] = @winContainerCountWithInventoryRecordSize64KBOrMore + if @winContainerCountWithEnvVarSize64KBOrMore > 0 + telemetryProperties["WinContainerCountWithEnvVarSize64KBOrMore"] = @winContainerCountWithEnvVarSize64KBOrMore + end + if @winContainerCountWithPortsSize64KBOrMore > 0 + telemetryProperties["WinContainerCountWithPortsSize64KBOrMore"] = @winContainerCountWithPortsSize64KBOrMore + end + if @winContainerCountWithCommandSize64KBOrMore > 0 + telemetryProperties["WinContainerCountWithCommandSize64KBOrMore"] = @winContainerCountWithCommandSize64KBOrMore + end ApplicationInsightsUtility.sendCustomEvent("WindowsContainerInventoryEvent", telemetryProperties) end ApplicationInsightsUtility.sendMetricTelemetry("PodInventoryE2EProcessingLatencyMs", @podInventoryE2EProcessingLatencyMs, telemetryProperties) @@ -235,6 +262,7 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc podInventory["items"].each do |item| #podInventory block start # pod inventory records podInventoryRecords = getPodInventoryRecords(item, serviceRecords, batchTime) + @containerCount += podInventoryRecords.length podInventoryRecords.each do |record| if !record.nil? eventStream.add(emitTime, record) if record @@ -248,6 +276,7 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc if !item["spec"]["nodeName"].nil? nodeName = item["spec"]["nodeName"] end + @windowsNodeCount = winNodes.length if (!nodeName.empty? && (winNodes.include? 
nodeName)) clusterCollectEnvironmentVar = ENV["AZMON_CLUSTER_COLLECT_ENV_VAR"] #Generate ContainerInventory records for windows nodes so that we can get image and image tag in property panel @@ -257,13 +286,27 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc containerInventoryRecords.each do |cirecord| if !cirecord.nil? containerInventoryStream.add(emitTime, cirecord) if cirecord + ciRecordSize = cirecord.to_s.length + @winContainerInventoryTotalSizeBytes += ciRecordSize + if ciRecordSize >= Constants::MAX_RECORD_OR_FIELD_SIZE_FOR_TELEMETRY + @winContainerCountWithInventoryRecordSize64KBOrMore += 1 + end + if !cirecord["EnvironmentVar"].nil? && !cirecord["EnvironmentVar"].empty? && cirecord["EnvironmentVar"].length >= Constants::MAX_RECORD_OR_FIELD_SIZE_FOR_TELEMETRY + @winContainerCountWithEnvVarSize64KBOrMore += 1 + end + if !cirecord["Ports"].nil? && !cirecord["Ports"].empty? && cirecord["Ports"].length >= Constants::MAX_RECORD_OR_FIELD_SIZE_FOR_TELEMETRY + @winContainerCountWithPortsSize64KBOrMore += 1 + end + if !cirecord["Command"].nil? && !cirecord["Command"].empty? && cirecord["Command"].length >= Constants::MAX_RECORD_OR_FIELD_SIZE_FOR_TELEMETRY + @winContainerCountWithCommandSize64KBOrMore += 1 + end end end end end if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && eventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE - $log.info("in_kube_podinventory::parse_and_emit_records: number of pod inventory records emitted #{@PODS_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::parse_and_emit_records: number of pod inventory records emitted #{eventStream.count} @ #{Time.now.utc.iso8601}") if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) $log.info("kubePodInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") end @@ -283,7 +326,7 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc end if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && kubePerfEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE - $log.info("in_kube_podinventory::parse_and_emit_records: number of container perf records emitted #{@PODS_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::parse_and_emit_records: number of container perf records emitted #{kubePerfEventStream.count} @ #{Time.now.utc.iso8601}") router.emit_stream(@kubeperfTag, kubePerfEventStream) if kubePerfEventStream if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) $log.info("kubeContainerPerfEventEmitStreamSuccess @ #{Time.now.utc.iso8601}") @@ -302,7 +345,7 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc end if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && insightsMetricsEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE - $log.info("in_kube_podinventory::parse_and_emit_records: number of GPU insights metrics records emitted #{@PODS_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::parse_and_emit_records: number of GPU insights metrics records emitted #{insightsMetricsEventStream.count} @ #{Time.now.utc.iso8601}") if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) $log.info("kubePodInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") end @@ -367,7 +410,7 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc kubeServiceRecord["ClusterName"] = KubernetesApiClient.getClusterName kubeServicesEventStream.add(emitTime, kubeServiceRecord) if kubeServiceRecord if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && kubeServicesEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE - $log.info("in_kube_podinventory::parse_and_emit_records: number of service records emitted #{@PODS_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::parse_and_emit_records: number of service records emitted #{kubeServicesEventStream.count} @ #{Time.now.utc.iso8601}") router.emit_stream(@kubeservicesTag, kubeServicesEventStream) if kubeServicesEventStream kubeServicesEventStream = Fluent::MultiEventStream.new if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) diff --git a/source/plugins/ruby/in_kube_pvinventory.rb b/source/plugins/ruby/in_kube_pvinventory.rb index 6af3c280f..b548bd5c2 100644 --- a/source/plugins/ruby/in_kube_pvinventory.rb +++ b/source/plugins/ruby/in_kube_pvinventory.rb @@ -1,7 +1,7 @@ #!/usr/local/bin/ruby # frozen_string_literal: true -require 'fluent/plugin/input' +require "fluent/plugin/input" module Fluent::Plugin class Kube_PVInventory_Input < Input @@ -107,7 +107,6 @@ def enumerate ApplicationInsightsUtility.sendCustomEvent(Constants::PV_INVENTORY_HEART_BEAT_EVENT, telemetryProperties) @@pvTelemetryTimeTracker = DateTime.now.to_time.to_i end - rescue => errorStr $log.warn "in_kube_pvinventory::enumerate:Failed in enumerate: #{errorStr}" $log.debug_backtrace(errorStr.backtrace) @@ -131,7 +130,7 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) record["ClusterName"] = KubernetesApiClient.getClusterName record["PVName"] = item["metadata"]["name"] record["PVStatus"] = item["status"]["phase"] - 
record["PVAccessModes"] = item["spec"]["accessModes"].join(', ') + record["PVAccessModes"] = item["spec"]["accessModes"].join(", ") record["PVStorageClassName"] = item["spec"]["storageClassName"] record["PVCapacityBytes"] = KubernetesApiClient.getMetricNumericValue("memory", item["spec"]["capacity"]["storage"]) record["PVCreationTimeStamp"] = item["metadata"]["creationTimestamp"] @@ -167,7 +166,6 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) $log.info("kubePVInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") end - rescue => errorStr $log.warn "Failed in parse_and_emit_record for in_kube_pvinventory: #{errorStr}" $log.debug_backtrace(errorStr.backtrace) @@ -212,7 +210,6 @@ def getTypeInfo(item) # Can only have one type: return right away when found return pvType, typeInfo - end end end @@ -226,7 +223,6 @@ def getTypeInfo(item) return nil, {} end - def run_periodic @mutex.lock done = @finished diff --git a/source/plugins/ruby/kubelet_utils.rb b/source/plugins/ruby/kubelet_utils.rb index e31407b54..c9114d9e8 100644 --- a/source/plugins/ruby/kubelet_utils.rb +++ b/source/plugins/ruby/kubelet_utils.rb @@ -47,99 +47,106 @@ def get_node_allocatable(cpu_capacity, memory_capacity) @log.error "kubelet_utils.rb::get_node_allocatble - cpu_capacity or memory_capacity values not set. Hence we cannot calculate allocatable values" end + cpu_capacity = BigDecimal(cpu_capacity, 2).to_f + memory_capacity = BigDecimal(memory_capacity, 2).to_f + cpu_allocatable = 1.0 memory_allocatable = 1.0 - + allocatable_response = CAdvisorMetricsAPIClient.getCongifzCAdvisor(winNode: nil) parsed_response = JSON.parse(allocatable_response.body) begin kubereserved_cpu = parsed_response["kubeletconfig"]["kubeReserved"]["cpu"] if kubereserved_cpu.nil? 
|| kubereserved_cpu == "" - kubereserved_cpu = "0" + kubereserved_cpu = "0.0" end @log.info "get_node_allocatable::kubereserved_cpu #{kubereserved_cpu}" rescue => errorStr @log.error "Error in get_node_allocatable::kubereserved_cpu: #{errorStr}" - kubereserved_cpu = "0" + kubereserved_cpu = "0.0" ApplicationInsightsUtility.sendExceptionTelemetry("Error in get_node_allocatable::kubereserved_cpu: #{errorStr}") - end + end begin kubereserved_memory = parsed_response["kubeletconfig"]["kubeReserved"]["memory"] if kubereserved_memory.nil? || kubereserved_memory == "" - kubereserved_memory = "0" + kubereserved_memory = "0.0" end @log.info "get_node_allocatable::kubereserved_memory #{kubereserved_memory}" rescue => errorStr @log.error "Error in get_node_allocatable::kubereserved_memory: #{errorStr}" - kubereserved_memory = "0" - ApplicationInsightsUtility.sendExceptionTelemetry("Error in get_node_allocatable::kubereserved_cpu: #{errorStr}") - end + kubereserved_memory = "0.0" + ApplicationInsightsUtility.sendExceptionTelemetry("Error in get_node_allocatable::kubereserved_memory: #{errorStr}") + end begin systemReserved_cpu = parsed_response["kubeletconfig"]["systemReserved"]["cpu"] if systemReserved_cpu.nil? || systemReserved_cpu == "" - systemReserved_cpu = "0" + systemReserved_cpu = "0.0" end @log.info "get_node_allocatable::systemReserved_cpu #{systemReserved_cpu}" rescue => errorStr # this will likely always reach this condition for AKS ~ only applicable for hyrid + MDM combination @log.error "Error in get_node_allocatable::systemReserved_cpu: #{errorStr}" - systemReserved_cpu = "0" - ApplicationInsightsUtility.sendExceptionTelemetry("Error in get_node_allocatable::kubereserved_cpu: #{errorStr}") - end + systemReserved_cpu = "0.0" + ApplicationInsightsUtility.sendExceptionTelemetry("Error in get_node_allocatable::systemReserved_cpu: #{errorStr}") + end begin explicitlyReserved_cpu = parsed_response["kubeletconfig"]["reservedCPUs"] if explicitlyReserved_cpu.nil? 
|| explicitlyReserved_cpu == "" - explicitlyReserved_cpu = "0" + explicitlyReserved_cpu = "0.0" end @log.info "get_node_allocatable::explicitlyReserved_cpu #{explicitlyReserved_cpu}" rescue => errorStr # this will likely always reach this condition for AKS ~ only applicable for hyrid + MDM combination @log.error "Error in get_node_allocatable::explicitlyReserved_cpu: #{errorStr}" - explicitlyReserved_cpu = "0" + explicitlyReserved_cpu = "0.0" ApplicationInsightsUtility.sendExceptionTelemetry("Error in get_node_allocatable::explicitlyReserved_cpu: #{errorStr}") - end + end begin - systemReserved_memory = parsed_response["kubeletconfig"]["systemReserved"]["memory"] - if systemReserved_memory.nil? || systemReserved_memory == "" - systemReserved_memory = "0" - end - @log.info "get_node_allocatable::systemReserved_memory #{systemReserved_memory}" + systemReserved_memory = parsed_response["kubeletconfig"]["systemReserved"]["memory"] + if systemReserved_memory.nil? || systemReserved_memory == "" + systemReserved_memory = "0.0" + end + @log.info "get_node_allocatable::systemReserved_memory #{systemReserved_memory}" rescue => errorStr - @log.error "Error in get_node_allocatable::systemReserved_memory: #{errorStr}" - systemReserved_memory = "0" - ApplicationInsightsUtility.sendExceptionTelemetry("Error in get_node_allocatable::kubereserved_cpu: #{errorStr}") - end + @log.error "Error in get_node_allocatable::systemReserved_memory: #{errorStr}" + systemReserved_memory = "0.0" + ApplicationInsightsUtility.sendExceptionTelemetry("Error in get_node_allocatable::systemReserved_memory: #{errorStr}") + end begin evictionHard_memory = parsed_response["kubeletconfig"]["evictionHard"]["memory.available"] if evictionHard_memory.nil? 
|| evictionHard_memory == "" - evictionHard_memory = "0" + evictionHard_memory = "0.0" end @log.info "get_node_allocatable::evictionHard_memory #{evictionHard_memory}" rescue => errorStr @log.error "Error in get_node_allocatable::evictionHard_memory: #{errorStr}" - evictionHard_memory = "0" - ApplicationInsightsUtility.sendExceptionTelemetry("Error in get_node_allocatable::kubereserved_cpu: #{errorStr}") - end + evictionHard_memory = "0.0" + ApplicationInsightsUtility.sendExceptionTelemetry("Error in get_node_allocatable::evictionHard_memory: #{errorStr}") + end # do calculation in nanocore since that's what KubernetesApiClient.getMetricNumericValue expects cpu_capacity_number = cpu_capacity.to_i * 1000.0 ** 2 # subtract to get allocatable. Formula : Allocatable = Capacity - ( kube reserved + system reserved + eviction threshold ) # https://kubernetes.io/docs/tasks/administer-cluster/reserve-compute-resources/#node-allocatable if KubernetesApiClient.getMetricNumericValue("cpu", explicitlyReserved_cpu) > 0 - cpu_allocatable = cpu_capacity_number - KubernetesApiClient.getMetricNumericValue("cpu", explicitlyReserved_cpu) + cpu_allocatable = cpu_capacity_number - KubernetesApiClient.getMetricNumericValue("cpu", explicitlyReserved_cpu) else - cpu_allocatable = cpu_capacity_number - (KubernetesApiClient.getMetricNumericValue("cpu", kubereserved_cpu) + KubernetesApiClient.getMetricNumericValue("cpu", systemReserved_cpu)) + cpu_allocatable = cpu_capacity_number - (KubernetesApiClient.getMetricNumericValue("cpu", kubereserved_cpu) + KubernetesApiClient.getMetricNumericValue("cpu", systemReserved_cpu)) end # convert back to units similar to what we get for capacity cpu_allocatable = cpu_allocatable / (1000.0 ** 2) - @log.info "CPU Allocatable #{cpu_allocatable}" memory_allocatable = memory_capacity - (KubernetesApiClient.getMetricNumericValue("memory", kubereserved_memory) + KubernetesApiClient.getMetricNumericValue("memory", systemReserved_memory) + 
KubernetesApiClient.getMetricNumericValue("memory", evictionHard_memory)) + + cpu_allocatable = BigDecimal(cpu_allocatable, 2).to_f + memory_allocatable = BigDecimal(memory_allocatable, 2).to_f + + @log.info "CPU Allocatable #{cpu_allocatable}" @log.info "Memory Allocatable #{memory_allocatable}" return [cpu_allocatable, memory_allocatable] diff --git a/source/plugins/ruby/lib/application_insights/channel/asynchronous_sender.rb b/source/plugins/ruby/lib/application_insights/channel/asynchronous_sender.rb index 4786aa1d9..df2138b3a 100644 --- a/source/plugins/ruby/lib/application_insights/channel/asynchronous_sender.rb +++ b/source/plugins/ruby/lib/application_insights/channel/asynchronous_sender.rb @@ -1,5 +1,5 @@ -require_relative 'sender_base' -require 'thread' +require_relative "sender_base" +require "thread" module ApplicationInsights module Channel @@ -17,12 +17,13 @@ module Channel # If no queue items are found for {#send_time} seconds, the worker thread # will shut down (and {#start} will need to be called again). class AsynchronousSender < SenderBase - SERVICE_ENDPOINT_URI = 'https://dc.services.visualstudio.com/v2/track' + SERVICE_ENDPOINT_URI = "https://dc.services.visualstudio.com/v2/track" # Initializes a new instance of the class. # @param [String] service_endpoint_uri the address of the service to send + # @param [Logger] instance of the logger to write the logs (optional) # @param [Hash] proxy server configuration to send (optional) # telemetry data to. - def initialize(service_endpoint_uri = SERVICE_ENDPOINT_URI, proxy = {}) + def initialize(service_endpoint_uri = SERVICE_ENDPOINT_URI, logger = nil, proxy = {}) # callers which requires proxy dont require to maintain service endpoint uri which potentially can change if service_endpoint_uri.nil? || service_endpoint_uri.empty? 
service_endpoint_uri = SERVICE_ENDPOINT_URI @@ -33,7 +34,7 @@ def initialize(service_endpoint_uri = SERVICE_ENDPOINT_URI, proxy = {}) @lock_work_thread = Mutex.new @work_thread = nil @start_notification_processed = true - super service_endpoint_uri, proxy + super service_endpoint_uri, logger, proxy end # The time span in seconds at which the the worker thread will check the @@ -130,7 +131,7 @@ def run rescue Exception => e # Make sure work_thread sets to nil when it terminates abnormally @work_thread = nil - @logger.error('application_insights') { "Asynchronous sender work thread terminated abnormally: #{e.to_s}" } + @logger.error("application_insights") { "Asynchronous sender work thread terminated abnormally: #{e.to_s}" } end end end diff --git a/source/plugins/ruby/lib/application_insights/channel/sender_base.rb b/source/plugins/ruby/lib/application_insights/channel/sender_base.rb index bedbae4ee..e5a4dea62 100644 --- a/source/plugins/ruby/lib/application_insights/channel/sender_base.rb +++ b/source/plugins/ruby/lib/application_insights/channel/sender_base.rb @@ -1,9 +1,9 @@ -require 'yajl/json_gem' -require 'net/http' -require 'openssl' -require 'stringio' -require 'zlib' -require 'logger' +require "yajl/json_gem" +require "net/http" +require "openssl" +require "stringio" +require "zlib" +require "logger" module ApplicationInsights module Channel @@ -16,13 +16,18 @@ module Channel class SenderBase # Initializes a new instance of the class. # @param [String] service_endpoint_uri the address of the service to send + # @param [Logger] instance of the logger to write the logs # @param [Hash] proxy server configuration to send (optional) # telemetry data to. - def initialize(service_endpoint_uri, proxy = {}) + def initialize(service_endpoint_uri, logger, proxy = {}) @service_endpoint_uri = service_endpoint_uri @queue = nil @send_buffer_size = 100 - @logger = Logger.new(STDOUT) + if !logger.nil? 
+ @logger = logger + else + @logger = Logger.new(STDOUT) + end @proxy = proxy end @@ -53,9 +58,9 @@ def initialize(service_endpoint_uri, proxy = {}) def send(data_to_send) uri = URI(@service_endpoint_uri) headers = { - 'Accept' => 'application/json', - 'Content-Type' => 'application/json; charset=utf-8', - 'Content-Encoding' => 'gzip' + "Accept" => "application/json", + "Content-Type" => "application/json; charset=utf-8", + "Content-Encoding" => "gzip", } request = Net::HTTP::Post.new(uri.path, headers) @@ -69,7 +74,7 @@ def send(data_to_send) else http = Net::HTTP.new(uri.hostname, uri.port, @proxy[:addr], @proxy[:port], @proxy[:user], @proxy[:pass]) end - if uri.scheme.downcase == 'https' + if uri.scheme.downcase == "https" http.use_ssl = true http.verify_mode = OpenSSL::SSL::VERIFY_PEER end @@ -78,7 +83,7 @@ def send(data_to_send) http.finish if http.started? if !response.kind_of? Net::HTTPSuccess - @logger.warn('application_insights') { "Failed to send data: #{response.message}" } + @logger.warn("application_insights") { "Failed to send data: #{response.message}" } end end diff --git a/source/plugins/ruby/lib/application_insights/channel/synchronous_sender.rb b/source/plugins/ruby/lib/application_insights/channel/synchronous_sender.rb index 597e97b9e..2bb212026 100644 --- a/source/plugins/ruby/lib/application_insights/channel/synchronous_sender.rb +++ b/source/plugins/ruby/lib/application_insights/channel/synchronous_sender.rb @@ -8,14 +8,15 @@ class SynchronousSender < SenderBase SERVICE_ENDPOINT_URI = "https://dc.services.visualstudio.com/v2/track" # Initializes a new instance of the class. # @param [String] service_endpoint_uri the address of the service to send + # @param [Logger] instance of the logger to write the logs (optional) # @param [Hash] proxy server configuration to send (optional) # telemetry data to. 
- def initialize(service_endpoint_uri = SERVICE_ENDPOINT_URI, proxy = {}) + def initialize(service_endpoint_uri = SERVICE_ENDPOINT_URI, logger = nil, proxy = {}) # callers which requires proxy dont require to maintain service endpoint uri which potentially can change if service_endpoint_uri.nil? || service_endpoint_uri.empty? service_endpoint_uri = SERVICE_ENDPOINT_URI end - super service_endpoint_uri, proxy + super service_endpoint_uri, logger, proxy end end end diff --git a/test/e2e/conformance.yaml b/test/e2e/conformance.yaml index ff790e690..71e40a6a2 100644 --- a/test/e2e/conformance.yaml +++ b/test/e2e/conformance.yaml @@ -3,7 +3,7 @@ sonobuoy-config: plugin-name: azure-arc-ci-conformance result-format: junit spec: - image: mcr.microsoft.com/azuremonitor/containerinsights/cidev:ciconftest08142021 + image: mcr.microsoft.com/azuremonitor/containerinsights/cidev:ciconftest10202021 imagePullPolicy: Always name: plugin resources: {} diff --git a/test/e2e/src/common/constants.py b/test/e2e/src/common/constants.py index 392b10554..c557a1c91 100644 --- a/test/e2e/src/common/constants.py +++ b/test/e2e/src/common/constants.py @@ -40,6 +40,9 @@ TIMEOUT = 300 +# omsagent main container name +OMSAGENT_MAIN_CONTAINER_NAME = 'omsagent' + # WAIT TIME BEFORE READING THE AGENT LOGS AGENT_WAIT_TIME_SECS = "180" # Azure Monitor for Container Extension related diff --git a/test/e2e/src/common/kubernetes_pod_utility.py b/test/e2e/src/common/kubernetes_pod_utility.py index 27345fae7..d70f443f0 100644 --- a/test/e2e/src/common/kubernetes_pod_utility.py +++ b/test/e2e/src/common/kubernetes_pod_utility.py @@ -20,12 +20,12 @@ def get_pod_list(api_instance, namespace, label_selector=""): pytest.fail("Error occurred when retrieving pod information: " + str(e)) # get the content of the log file in the container via exec -def get_log_file_content(api_instance, namespace, podName, logfilePath): +def get_log_file_content(api_instance, namespace, podName, containerName, logfilePath): try: 
exec_command = ['tar','cf', '-', logfilePath] - return stream(api_instance.connect_get_namespaced_pod_exec, podName, namespace, command=exec_command, stderr=True, stdin=False, stdout=True, tty=False) + return stream(api_instance.connect_get_namespaced_pod_exec, podName, namespace, command=exec_command, container=containerName, stderr=True, stdin=False, stdout=True, tty=False) except Exception as e: - pytest.fail("Error occurred when retrieving log file content: " + str(e)) + pytest.fail("Error occurred when retrieving log file content: " + str(e)) # Function that watches events corresponding to pods in the given namespace and passes the events to a callback function def watch_pod_status(api_instance, namespace, timeout, callback=None): diff --git a/test/e2e/src/core/Dockerfile b/test/e2e/src/core/Dockerfile index cd85aee40..52bcd7cf8 100644 --- a/test/e2e/src/core/Dockerfile +++ b/test/e2e/src/core/Dockerfile @@ -6,7 +6,7 @@ RUN curl https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 | && helm version RUN apt-get update && apt-get -y upgrade && \ - apt-get -f -y install curl apt-transport-https lsb-release gnupg python3-pip python-pip && \ + apt-get -f -y install curl apt-transport-https lsb-release gnupg python3-pip && \ curl -sL https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor > /etc/apt/trusted.gpg.d/microsoft.asc.gpg && \ CLI_REPO=$(lsb_release -cs) && \ echo "deb [arch=amd64] https://packages.microsoft.com/repos/azure-cli/ ${CLI_REPO} main" \ diff --git a/test/e2e/src/tests/test_ds_workflows.py b/test/e2e/src/tests/test_ds_workflows.py index 731957788..e6d651e49 100755 --- a/test/e2e/src/tests/test_ds_workflows.py +++ b/test/e2e/src/tests/test_ds_workflows.py @@ -51,7 +51,7 @@ def test_ds_workflows(env_dict): for podItem in pod_list.items: podName = podItem.metadata.name logcontent = get_log_file_content( - api_instance, constants.AGENT_RESOURCES_NAMESPACE, podName, agentLogPath) + api_instance, 
constants.AGENT_RESOURCES_NAMESPACE, podName, constants.OMSAGENT_MAIN_CONTAINER_NAME, agentLogPath) if not logcontent: pytest.fail("logcontent should not be null or empty for pod: " + podName) loglines = logcontent.split("\n") diff --git a/test/e2e/src/tests/test_resource_status.py b/test/e2e/src/tests/test_resource_status.py index e09c1ea5e..c240cbcf2 100755 --- a/test/e2e/src/tests/test_resource_status.py +++ b/test/e2e/src/tests/test_resource_status.py @@ -1,5 +1,6 @@ import pytest import constants +import time from kubernetes import client, config from results_utility import append_result_output @@ -20,6 +21,10 @@ def test_resource_status(env_dict): #config.load_kube_config() except Exception as e: pytest.fail("Error loading the in-cluster config: " + str(e)) + + waitTimeSeconds = env_dict['AGENT_WAIT_TIME_SECS'] + time.sleep(int(waitTimeSeconds)) + # checking the deployment status check_kubernetes_deployment_status( constants.AGENT_RESOURCES_NAMESPACE, constants.AGENT_DEPLOYMENT_NAME, env_dict['TEST_AGENT_LOG_FILE']) diff --git a/test/e2e/src/tests/test_rs_workflows.py b/test/e2e/src/tests/test_rs_workflows.py index 0a74dc6de..1265f1b47 100755 --- a/test/e2e/src/tests/test_rs_workflows.py +++ b/test/e2e/src/tests/test_rs_workflows.py @@ -39,9 +39,7 @@ def test_rs_workflows(env_dict): waitTimeSeconds = env_dict['AGENT_WAIT_TIME_SECS'] - print("start: waiting for seconds: {} for agent workflows to get emitted".format(waitTimeSeconds)) time.sleep(int(waitTimeSeconds)) - print("complete: waiting for seconds: {} for agent workflows to get emitted".format(waitTimeSeconds)) isOMSBaseAgent = env_dict.get('USING_OMSAGENT_BASE_AGENT') agentLogPath = constants.AGENT_FLUENTD_LOG_PATH @@ -49,7 +47,7 @@ def test_rs_workflows(env_dict): agentLogPath = constants.AGENT_OMSAGENT_LOG_PATH logcontent = get_log_file_content( - api_instance, constants.AGENT_RESOURCES_NAMESPACE, rspodName, agentLogPath) + api_instance, constants.AGENT_RESOURCES_NAMESPACE, rspodName, 
constants.OMSAGENT_MAIN_CONTAINER_NAME, agentLogPath) if not logcontent: pytest.fail("logcontent should not be null or empty for rs pod: {}".format(rspodName)) loglines = logcontent.split("\n")