diff --git a/Documentation/AgentSettings/ReadMe.md b/Documentation/AgentSettings/ReadMe.md new file mode 100644 index 000000000..3e55d7d44 --- /dev/null +++ b/Documentation/AgentSettings/ReadMe.md @@ -0,0 +1,26 @@ +## Configurable agent settings for high scale prometheus metric scraping using pod annotations with prometheus sidecar. + +The Container Insights agent runs the native prometheus telegraf plugin to scrape prometheus metrics using pod annotations. +The metrics scraped by the telegraf plugin are sent to the fluent bit tcp listener. +In order to support higher volumes of prometheus metrics scraping, some of the tcp listener settings can be tuned. All sizes are specified in megabytes. +[Fluent Bit TCP listener](https://docs.fluentbit.io/manual/pipeline/inputs/tcp) + +* Chunk Size - This can be increased to process bigger chunks of data. + +* Buffer Size - This should be greater than or equal to the chunk size. + +* Mem Buf Limit - This can be increased to allow more data to be buffered in memory, but the memory limit on the sidecar also needs to be increased accordingly. +Note that this can only be achieved using the helm chart today. + + +**Note** - The LA ingestion team also states that higher chunk sizes might not necessarily mean higher throughput since there are pipeline limitations. + +``` + agent-settings: |- + # prometheus scrape fluent bit settings for high scale + # buffer size should be greater than or equal to chunk size else we set it to chunk size. 
+ [agent_settings.prometheus_fbit_settings] + tcp_listener_chunk_size = 10 + tcp_listener_buffer_size = 10 + tcp_listener_mem_buf_limit = 200 +```
diff --git a/build/common/installer/scripts/tomlparser-prom-agent-config.rb b/build/common/installer/scripts/tomlparser-prom-agent-config.rb
new file mode 100644
index 000000000..be9d08e59
--- /dev/null
+++ b/build/common/installer/scripts/tomlparser-prom-agent-config.rb
@@ -0,0 +1,119 @@
+#!/usr/local/bin/ruby
+
+#this should be require relative in Linux and require in windows, since it is a gem install on windows
+@os_type = ENV["OS_TYPE"]
+if !@os_type.nil? && !@os_type.empty? && @os_type.strip.casecmp("windows") == 0
+  require "tomlrb"
+else
+  require_relative "tomlrb"
+end
+
+require_relative "ConfigParseErrorLogger"
+
+# Location of the mounted agent-settings configmap file
+@configMapMountPath = "/etc/config/settings/agent-settings"
+@configSchemaVersion = ""
+
+# Defaults (in megabytes) used when the configmap does not override them
+@promFbitChunkSize = 10
+@promFbitBufferSize = 10
+@promFbitMemBufLimit = 200
+
+# Returns true when value can be converted by Kernel#Integer, false otherwise
+def is_number?(value)
+  Integer(value)
+  true
+rescue ArgumentError, TypeError, RangeError
+  false
+end
+
+# Returns settings[key] as a positive integer, or nil when the key is missing,
+# not numeric, or not greater than zero (the caller then keeps its default)
+def positiveIntegerSetting(settings, key)
+  value = settings[key]
+  return nil if value.nil? || !is_number?(value)
+  number = value.to_i
+  number > 0 ? number : nil
+end
+
+# Use parser to parse the configmap toml file to a ruby structure
+# Returns the parsed hash (symbolized keys), or nil when the configmap is not mounted or cannot be parsed
+def parseConfigMap
+  begin
+    # Check to see if config map is created
+    if (File.file?(@configMapMountPath))
+      puts "config::configmap container-azm-ms-agentconfig for sidecar agent settings mounted, parsing values"
+      parsedConfig = Tomlrb.load_file(@configMapMountPath, symbolize_keys: true)
+      puts "config::Successfully parsed mounted config map"
+      return parsedConfig
+    else
+      puts "config::configmap container-azm-ms-agentconfig for sidecar agent settings not mounted, using defaults"
+      return nil
+    end
+  rescue => errorStr
+    ConfigParseErrorLogger.logError("Exception while parsing config map for sidecar agent settings : #{errorStr}, using defaults, please check config map for errors")
+    return nil
+  end
+end
+
+# Use the ruby structure created after config parsing to set the right values to be used as environment variables
+# Invalid or missing values leave the corresponding default untouched
+def populateSettingValuesFromConfigMap(parsedConfig)
+  begin
+    if !parsedConfig.nil? && !parsedConfig[:agent_settings].nil?
+      # fbit config settings
+      prom_fbit_config = parsedConfig[:agent_settings][:prometheus_fbit_settings]
+      if !prom_fbit_config.nil?
+        chunk_size = positiveIntegerSetting(prom_fbit_config, :tcp_listener_chunk_size)
+        if !chunk_size.nil?
+          @promFbitChunkSize = chunk_size
+          puts "Using config map value: AZMON_SIDECAR_FBIT_CHUNK_SIZE = #{@promFbitChunkSize}m"
+        end
+        buffer_size = positiveIntegerSetting(prom_fbit_config, :tcp_listener_buffer_size)
+        if !buffer_size.nil?
+          @promFbitBufferSize = buffer_size
+          puts "Using config map value: AZMON_SIDECAR_FBIT_BUFFER_SIZE = #{@promFbitBufferSize}m"
+        end
+        # Enforce buffer >= chunk even when only the chunk size was overridden,
+        # otherwise a larger chunk size would be paired with the smaller default buffer
+        if @promFbitBufferSize < @promFbitChunkSize
+          @promFbitBufferSize = @promFbitChunkSize
+          puts "Setting Fbit buffer size equal to chunk size since it is set to less than chunk size - AZMON_SIDECAR_FBIT_BUFFER_SIZE = #{@promFbitBufferSize}m"
+        end
+        mem_buf_limit = positiveIntegerSetting(prom_fbit_config, :tcp_listener_mem_buf_limit)
+        if !mem_buf_limit.nil?
+          @promFbitMemBufLimit = mem_buf_limit
+          puts "Using config map value: AZMON_SIDECAR_FBIT_MEM_BUF_LIMIT = #{@promFbitMemBufLimit}m"
+        end
+      end
+    end
+  rescue => errorStr
+    puts "config::error:Exception while reading config settings for sidecar agent configuration setting - #{errorStr}, using defaults"
+  end
+end
+
+@configSchemaVersion = ENV["AZMON_AGENT_CFG_SCHEMA_VERSION"]
+puts "****************Start Sidecar Agent Config Processing********************"
+if !@configSchemaVersion.nil? && !@configSchemaVersion.empty? && @configSchemaVersion.strip.casecmp("v1") == 0 #note v1 is the only supported schema version , so hardcoding it
+  configMapSettings = parseConfigMap
+  if !configMapSettings.nil?
+    populateSettingValuesFromConfigMap(configMapSettings)
+  end
+else
+  if (File.file?(@configMapMountPath))
+    ConfigParseErrorLogger.logError("config::unsupported/missing config schema version - '#{@configSchemaVersion}' , using defaults, please use supported schema version")
+  end
+end
+
+# Write the settings to file, so that they can be set as environment variables
+begin
+  # Block form closes the file even if a write fails
+  File.open("side_car_fbit_config_env_var", "w") do |file|
+    file.write("export AZMON_SIDECAR_FBIT_CHUNK_SIZE=#{@promFbitChunkSize}m\n")
+    file.write("export AZMON_SIDECAR_FBIT_BUFFER_SIZE=#{@promFbitBufferSize}m\n")
+    file.write("export AZMON_SIDECAR_FBIT_MEM_BUF_LIMIT=#{@promFbitMemBufLimit}m\n")
+  end
+rescue => errorStr
+  puts "Exception while opening file for writing config environment variables: #{errorStr}"
+end
+puts "****************End Sidecar Agent Config Processing********************"
diff --git a/build/linux/installer/conf/td-agent-bit-prom-side-car.conf b/build/linux/installer/conf/td-agent-bit-prom-side-car.conf index 8a69f7995..2c85a4200 100644 --- a/build/linux/installer/conf/td-agent-bit-prom-side-car.conf +++ b/build/linux/installer/conf/td-agent-bit-prom-side-car.conf @@ -29,9 +29,9 @@ Tag oms.container.perf.telegraf.* Listen 0.0.0.0 Port 25229 - Chunk_Size 10m - Buffer_Size 10m - Mem_Buf_Limit 200m + Chunk_Size ${AZMON_SIDECAR_FBIT_CHUNK_SIZE} + Buffer_Size ${AZMON_SIDECAR_FBIT_BUFFER_SIZE} + Mem_Buf_Limit ${AZMON_SIDECAR_FBIT_MEM_BUF_LIMIT} [OUTPUT] Name oms diff --git a/build/linux/installer/datafiles/base_container.data b/build/linux/installer/datafiles/base_container.data index de8ccbba0..88c790be3 100644 --- a/build/linux/installer/datafiles/base_container.data +++ b/build/linux/installer/datafiles/base_container.data @@ -42,7 +42,8 @@ MAINTAINER: 'Microsoft Corporation' /opt/microsoft/docker-cimprov/bin/TelegrafTCPErrorTelemetry.sh; 
build/linux/installer/scripts/TelegrafTCPErrorTelemetry.sh; 755; root; root /opt/livenessprobe.sh; build/linux/installer/scripts/livenessprobe.sh; 755; root; root /opt/tomlparser-prom-customconfig.rb; build/common/installer/scripts/tomlparser-prom-customconfig.rb; 755; root; root -/opt/tomlparser-mdm-metrics-config.rb; build/common/installer/scripts/tomlparser-mdm-metrics-config.rb; 755; root; root +/opt/tomlparser-prom-agent-config.rb; build/common/installer/scripts/tomlparser-prom-agent-config.rb; 755; root; root +/opt/tomlparser-mdm-metrics-config.rb; build/common/installer/scripts/tomlparser-mdm-metrics-config.rb; 755; root; root /opt/tomlparser-metric-collection-config.rb; build/linux/installer/scripts/tomlparser-metric-collection-config.rb; 755; root; root diff --git a/kubernetes/container-azm-ms-agentconfig.yaml b/kubernetes/container-azm-ms-agentconfig.yaml index 543f270c1..21b31f76f 100644 --- a/kubernetes/container-azm-ms-agentconfig.yaml +++ b/kubernetes/container-azm-ms-agentconfig.yaml @@ -135,6 +135,16 @@ data: [integrations.azure_network_policy_manager] collect_basic_metrics = false collect_advanced_metrics = false + +# Doc - https://github.com/microsoft/Docker-Provider/blob/ci_prod/Documentation/AgentSettings/ReadMe.md + agent-settings: |- + # prometheus scrape fluent bit settings for high scale + # buffer size should be greater than or equal to chunk size else we set it to chunk size. 
+ [agent_settings.prometheus_fbit_settings] + tcp_listener_chunk_size = 10 + tcp_listener_buffer_size = 10 + tcp_listener_mem_buf_limit = 200 + metadata: name: container-azm-ms-agentconfig namespace: kube-system diff --git a/kubernetes/linux/defaultpromenvvariables-sidecar b/kubernetes/linux/defaultpromenvvariables-sidecar index 3301488d8..68388f88e 100644 --- a/kubernetes/linux/defaultpromenvvariables-sidecar +++ b/kubernetes/linux/defaultpromenvvariables-sidecar @@ -7,3 +7,6 @@ export AZMON_TELEGRAF_CUSTOM_PROM_PLUGINS_WITH_NAMESPACE_FILTER="" export AZMON_TELEGRAF_OSM_PROM_PLUGINS="" export AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_LABEL_SELECTOR="kubernetes_label_selector = ''" export AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_FIELD_SELECTOR="kubernetes_field_selector = ''" +export AZMON_SIDECAR_FBIT_CHUNK_SIZE="10m" +export AZMON_SIDECAR_FBIT_BUFFER_SIZE="10m" +export AZMON_SIDECAR_FBIT_MEM_BUF_LIMIT="200m" diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index b21ed6b96..ff8572ca8 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -306,6 +306,21 @@ if [ -e "telemetry_prom_config_env_var" ]; then source telemetry_prom_config_env_var fi +#Parse sidecar agent settings for custom configuration +if [ ! -e "/etc/config/kube.conf" ]; then + if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then + #Parse the agent configmap to create a file with new custom settings. + /usr/bin/ruby2.6 tomlparser-prom-agent-config.rb + #Sourcing config environment variable file if it exists + if [ -e "side_car_fbit_config_env_var" ]; then + cat side_car_fbit_config_env_var | while read line; do + echo $line >> ~/.bashrc + done + source side_car_fbit_config_env_var + fi + fi +fi + #Parse the configmap to set the right environment variables for MDM metrics configuration for Alerting. 
if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then diff --git a/source/plugins/go/src/telemetry.go b/source/plugins/go/src/telemetry.go index 4750b4624..debe003e4 100644 --- a/source/plugins/go/src/telemetry.go +++ b/source/plugins/go/src/telemetry.go @@ -161,6 +161,10 @@ func SendContainerLogPluginMetrics(telemetryPushIntervalProperty string) { telemetryDimensions["OsmNamespaceCount"] = strconv.Itoa(osmNamespaceCount) } + telemetryDimensions["PromFbitChunkSize"] = os.Getenv("AZMON_SIDECAR_FBIT_CHUNK_SIZE") + telemetryDimensions["PromFbitBufferSize"] = os.Getenv("AZMON_SIDECAR_FBIT_BUFFER_SIZE") + telemetryDimensions["PromFbitMemBufLimit"] = os.Getenv("AZMON_SIDECAR_FBIT_MEM_BUF_LIMIT") + SendEvent(eventNameCustomPrometheusSidecarHeartbeat, telemetryDimensions) } else {