From 939e323b2f6979e87e04985677818159c7528e97 Mon Sep 17 00:00:00 2001 From: Vishwanath Narasimhan Date: Mon, 14 Jan 2019 14:06:51 -0800 Subject: [PATCH 01/38] add configuration for telegraf --- installer/conf/telegraf.conf | 439 +++++++++++++++++++++++++++++++++++ 1 file changed, 439 insertions(+) create mode 100644 installer/conf/telegraf.conf diff --git a/installer/conf/telegraf.conf b/installer/conf/telegraf.conf new file mode 100644 index 000000000..31619c45a --- /dev/null +++ b/installer/conf/telegraf.conf @@ -0,0 +1,439 @@ +# Telegraf Configuration +# +# Telegraf is entirely plugin driven. All metrics are gathered from the +# declared inputs, and sent to the declared outputs. +# +# Plugins must be declared in here to be active. +# To deactivate a plugin, comment out the name and any variables. +# +# Use 'telegraf -config telegraf.conf -test' to see what metrics a config +# file would generate. +# +# Environment variables can be used anywhere in this config file, simply prepend +# them with $. For strings the variable must be within quotes (ie, "$STR_VAR"), +# for numbers and booleans they should be plain (ie, $INT_VAR, $BOOL_VAR) + + +# Global tags can be specified here in key="value" format. +[global_tags] + # dc = "us-east-1" # will tag all metrics with dc=us-east-1 + # rack = "1a" + ## Environment variables can be used as tags, and throughout the config file + # user = "$USER" + # cluster = "$ACS_RESOURCE_NAME" + #node = $NODE_IP + + +# Configuration for telegraf agent +[agent] + ## Default data collection interval for all inputs + interval = "60s" + ## Rounds collection interval to 'interval' + ## ie, if interval="10s" then always collect on :00, :10, :20, etc. + round_interval = true + + ## Telegraf will send metrics to outputs in batches of at most + ## metric_batch_size metrics. + ## This controls the size of writes that Telegraf sends to output plugins. 
+ metric_batch_size = 1000 + + ## For failed writes, telegraf will cache metric_buffer_limit metrics for each + ## output, and will flush this buffer on a successful write. Oldest metrics + ## are dropped first when this buffer fills. + ## This buffer only fills when writes fail to output plugin(s). + metric_buffer_limit = 10000 + + ## Collection jitter is used to jitter the collection by a random amount. + ## Each plugin will sleep for a random time within jitter before collecting. + ## This can be used to avoid many plugins querying things like sysfs at the + ## same time, which can have a measurable effect on the system. + collection_jitter = "0s" + + ## Default flushing interval for all outputs. You shouldn't set this below + ## interval. Maximum flush_interval will be flush_interval + flush_jitter + flush_interval = "10s" + ## Jitter the flush interval by a random amount. This is primarily to avoid + ## large write spikes for users running a large number of telegraf instances. + ## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s + flush_jitter = "0s" + + ## By default or when set to "0s", precision will be set to the same + ## timestamp order as the collection interval, with the maximum being 1s. + ## ie, when interval = "10s", precision will be "1s" + ## when interval = "250ms", precision will be "1ms" + ## Precision will NOT be used for service inputs. It is up to each individual + ## service input to set the timestamp at the appropriate precision. + ## Valid time units are "ns", "us" (or "µs"), "ms", "s". + precision = "" + + ## Logging configuration: + ## Run telegraf with debug log messages. + debug = false + ## Run telegraf in quiet mode (error log messages only). + quiet = true + ## Specify the log file name. The empty string means to log to stderr. 
+ logfile = "/var/opt/microsoft/docker-cimprov/log/telegraf.log" + + ## Override default hostname, if empty use os.Hostname() + hostname = "$nodename" + ## If set to true, do no set the "host" tag in the telegraf agent. + omit_hostname = false + + +############################################################################### +# OUTPUT PLUGINS # +############################################################################### + +# Send aggregate metrics to Azure Monitor +[[outputs.azure_monitor]] + ## Timeout for HTTP writes. + # timeout = "20s" + + ## Set the namespace prefix, defaults to "Telegraf/". + namespace_prefix = "ContainerInsights/" + + ## Azure Monitor doesn't have a string value type, so convert string + ## fields to dimensions (a.k.a. tags) if enabled. Azure Monitor allows + ## a maximum of 10 dimensions so Telegraf will only send the first 10 + ## alphanumeric dimensions. + strings_as_dimensions = false + + ## Both region and resource_id must be set or be available via the + ## Instance Metadata service on Azure Virtual Machines. + # + ## Azure Region to publish metrics against. + ## ex: region = "southcentralus" + #region = "westeurope" + # + ## The Azure Resource ID against which metric will be logged, e.g. + ## ex: resource_id = "/subscriptions//resourceGroups//providers/Microsoft.Compute/virtualMachines/" + # resource_id = "" + + + +############################################################################### +# PROCESSOR PLUGINS # +############################################################################### + +# # Convert values to another metric value type +# [[processors.converter]] +# ## Tags to convert +# ## +# ## The table key determines the target type, and the array of key-values +# ## select the keys to convert. The array may contain globs. +# ## = [...] 
+# [processors.converter.tags] +# string = ["device"] +# integer = [] +# unsigned = [] +# boolean = [] +# float = [] +# +# ## Fields to convert +# ## +# ## The table key determines the target type, and the array of key-values +# ## select the keys to convert. The array may contain globs. +# ## = [...] +# [processors.converter.fields] +# tag = ["host"] +# string = [] +# integer = [] +# unsigned = [] +# boolean = [] +# float = [] + + +# # Map enum values according to given table. +# [[processors.enum]] +# [[processors.enum.mapping]] +# ## Name of the field to map +# field = "status" +# +# ## Destination field to be used for the mapped value. By default the source +# ## field is used, overwriting the original value. +# # dest = "status_code" +# +# ## Default value to be used for all values not contained in the mapping +# ## table. When unset, the unmodified value for the field will be used if no +# ## match is found. +# # default = 0 +# +# ## Table of mappings +# [processors.enum.mapping.value_mappings] +# green = 1 +# yellow = 2 +# red = 3 + + +# # Apply metric modifications using override semantics. +# [[processors.override]] +# ## All modifications on inputs and aggregators can be overridden: +# # name_override = "new_name" +# # name_prefix = "new_name_prefix" +# # name_suffix = "new_name_suffix" +# +# ## Tags to be added (all values must be strings) +# # [processors.override.tags] +# # additional_tag = "tag_value" + + +# # Parse a value in a specified field/tag(s) and add the result in a new metric +# [[processors.parser]] +# ## The name of the fields whose value will be parsed. +# parse_fields = [] +# +# ## If true, incoming metrics are not emitted. +# drop_original = false +# +# ## If set to override, emitted metrics will be merged by overriding the +# ## original metric using the newly parsed metrics. 
+# merge = "override" +# +# ## The dataformat to be read from files +# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md +# data_format = "influx" + + +# # Print all metrics that pass through this filter. +# [[processors.printer]] + + +# # Transforms tag and field values with regex pattern +# [[processors.regex]] +# ## Tag and field conversions defined in a separate sub-tables +# # [[processors.regex.tags]] +# # ## Tag to change +# # key = "resp_code" +# # ## Regular expression to match on a tag value +# # pattern = "^(\\d)\\d\\d$" +# # ## Pattern for constructing a new value (${1} represents first subgroup) +# # replacement = "${1}xx" +# +# # [[processors.regex.fields]] +# # key = "request" +# # ## All the power of the Go regular expressions available here +# # ## For example, named subgroups +# # pattern = "^/api(?P/[\\w/]+)\\S*" +# # replacement = "${method}" +# # ## If result_key is present, a new field will be created +# # ## instead of changing existing field +# # result_key = "method" +# +# ## Multiple conversions may be applied for one field sequentially +# ## Let's extract one more value +# # [[processors.regex.fields]] +# # key = "request" +# # pattern = ".*category=(\\w+).*" +# # replacement = "${1}" +# # result_key = "search_category" + + +# # Rename measurements, tags, and fields that pass through this filter. 
+# [[processors.rename]] + + +# # Perform string processing on tags, fields, and measurements +# [[processors.strings]] +# ## Convert a tag value to uppercase +# # [[processors.strings.uppercase]] +# # tag = "method" +# +# ## Convert a field value to lowercase and store in a new field +# # [[processors.strings.lowercase]] +# # field = "uri_stem" +# # dest = "uri_stem_normalised" +# +# ## Trim leading and trailing whitespace using the default cutset +# # [[processors.strings.trim]] +# # field = "message" +# +# ## Trim leading characters in cutset +# # [[processors.strings.trim_left]] +# # field = "message" +# # cutset = "\t" +# +# ## Trim trailing characters in cutset +# # [[processors.strings.trim_right]] +# # field = "message" +# # cutset = "\r\n" +# +# ## Trim the given prefix from the field +# # [[processors.strings.trim_prefix]] +# # field = "my_value" +# # prefix = "my_" +# +# ## Trim the given suffix from the field +# # [[processors.strings.trim_suffix]] +# # field = "read_count" +# # suffix = "_count" + + +# # Print all metrics that pass through this filter. +# [[processors.topk]] +# ## How many seconds between aggregations +# # period = 10 +# +# ## How many top metrics to return +# # k = 10 +# +# ## Over which tags should the aggregation be done. Globs can be specified, in +# ## which case any tag matching the glob will aggregated over. If set to an +# ## empty list is no aggregation over tags is done +# # group_by = ['*'] +# +# ## Over which fields are the top k are calculated +# # fields = ["value"] +# +# ## What aggregation to use. Options: sum, mean, min, max +# # aggregation = "mean" +# +# ## Instead of the top k largest metrics, return the bottom k lowest metrics +# # bottomk = false +# +# ## The plugin assigns each metric a GroupBy tag generated from its name and +# ## tags. 
If this setting is different than "" the plugin will add a +# ## tag (which name will be the value of this setting) to each metric with +# ## the value of the calculated GroupBy tag. Useful for debugging +# # add_groupby_tag = "" +# +# ## These settings provide a way to know the position of each metric in +# ## the top k. The 'add_rank_field' setting allows to specify for which +# ## fields the position is required. If the list is non empty, then a field +# ## will be added to each and every metric for each string present in this +# ## setting. This field will contain the ranking of the group that +# ## the metric belonged to when aggregated over that field. +# ## The name of the field will be set to the name of the aggregation field, +# ## suffixed with the string '_topk_rank' +# # add_rank_fields = [] +# +# ## These settings provide a way to know what values the plugin is generating +# ## when aggregating metrics. The 'add_agregate_field' setting allows to +# ## specify for which fields the final aggregation value is required. If the +# ## list is non empty, then a field will be added to each every metric for +# ## each field present in this setting. This field will contain +# ## the computed aggregation for the group that the metric belonged to when +# ## aggregated over that field. +# ## The name of the field will be set to the name of the aggregation field, +# ## suffixed with the string '_topk_aggregate' +# # add_aggregate_fields = [] + + + +############################################################################### +# AGGREGATOR PLUGINS # +############################################################################### + +# # Keep the aggregate basicstats of each metric passing through. +# [[aggregators.basicstats]] +# ## General Aggregator Arguments: +# ## The period on which to flush & clear the aggregator. +# period = "30s" +# ## If true, the original metric will be dropped by the +# ## aggregator and will not get sent to the output plugins. 
+# drop_original = false + + +# # Create aggregate histograms. +# [[aggregators.histogram]] +# ## The period in which to flush the aggregator. +# period = "30s" +# +# ## If true, the original metric will be dropped by the +# ## aggregator and will not get sent to the output plugins. +# drop_original = false +# +# ## Example config that aggregates all fields of the metric. +# # [[aggregators.histogram.config]] +# # ## The set of buckets. +# # buckets = [0.0, 15.6, 34.5, 49.1, 71.5, 80.5, 94.5, 100.0] +# # ## The name of metric. +# # measurement_name = "cpu" +# +# ## Example config that aggregates only specific fields of the metric. +# # [[aggregators.histogram.config]] +# # ## The set of buckets. +# # buckets = [0.0, 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0] +# # ## The name of metric. +# # measurement_name = "diskio" +# # ## The concrete fields of metric +# # fields = ["io_time", "read_time", "write_time"] + + +# # Keep the aggregate min/max of each metric passing through. +# [[aggregators.minmax]] +# ## General Aggregator Arguments: +# ## The period on which to flush & clear the aggregator. +# period = "30s" +# ## If true, the original metric will be dropped by the +# ## aggregator and will not get sent to the output plugins. +# drop_original = false + + +# # Count the occurance of values in fields. +# [[aggregators.valuecounter]] +# ## General Aggregator Arguments: +# ## The period on which to flush & clear the aggregator. +# period = "30s" +# ## If true, the original metric will be dropped by the +# ## aggregator and will not get sent to the output plugins. 
+# drop_original = false +# ## The fields for which the values will be counted +# fields = [] + + + +############################################################################### +# INPUT PLUGINS # +############################################################################### + +# Read metrics about cpu usage +#[[inputs.cpu]] + ## Whether to report per-cpu stats or not +# percpu = false + ## Whether to report total system cpu stats or not +# totalcpu = true + ## If true, collect raw CPU time metrics. +# collect_cpu_time = false + ## If true, compute and report the sum of all non-idle CPU states. +# report_active = true +# fieldpass = ["usage_active","cluster","node","host","device"] +# taginclude = ["cluster","cpu","node"] + + + +# Read metrics about disk usage by mount point +[[inputs.disk]] + ## By default stats will be gathered for all mount points. + ## Set mount_points will restrict the stats to only the specified mount points. + # mount_points = ["/"] + + ## Ignore mount points by filesystem type. + ignore_fs = ["tmpfs", "devtmpfs", "devfs", "overlay", "aufs", "squashfs"] + fieldpass = ["used_percent", "used", "free", "total"] + taginclude = ["device","fstype","mode","path","host"] + + +# Read metrics about memory usage +#[[inputs.mem]] +# fieldpass = ["used_percent", "cluster", "node","host","device"] +# taginclude = ["cluster","node"] + + +# Read metrics about network interface usage +[[inputs.net]] + ## By default, telegraf gathers stats from any up interface (excluding loopback) + ## Setting interfaces will tell it to gather these explicit interfaces, + ## regardless of status. + ## + # interfaces = ["eth0"] + ## + ## On linux systems telegraf also collects protocol stats. + ## Setting ignore_protocol_stats to true will skip reporting of protocol metrics. 
+ ## + ignore_protocol_stats = true + ## + fieldpass = ["bytes_sent","bytes_recv","err_in", "err_out"] + taginclude = ["interface","host"] + From 93f70b9261475ad2bb91002dd1350e4dafe488e9 Mon Sep 17 00:00:00 2001 From: Vishwanath Narasimhan Date: Mon, 14 Jan 2019 14:07:17 -0800 Subject: [PATCH 02/38] fix for perms --- installer/datafiles/base_container.data | 2 ++ 1 file changed, 2 insertions(+) diff --git a/installer/datafiles/base_container.data b/installer/datafiles/base_container.data index 7181929e2..966fe44ee 100644 --- a/installer/datafiles/base_container.data +++ b/installer/datafiles/base_container.data @@ -91,6 +91,7 @@ MAINTAINER: 'Microsoft Corporation' /opt/td-agent-bit/bin/out_oms.so; intermediate/${{BUILD_CONFIGURATION}}/out_oms.so; 755; root; root /etc/opt/microsoft/docker-cimprov/td-agent-bit.conf; installer/conf/td-agent-bit.conf; 644; root; root /etc/opt/microsoft/docker-cimprov/out_oms.conf; installer/conf/out_oms.conf; 644; root; root +/etc/telegraf/telegraf.conf; installer/conf/telegraf.conf; 644; root; root %Links /opt/omi/lib/libcontainer.${{SHLIB_EXT}}; /opt/microsoft/docker-cimprov/lib/libcontainer.${{SHLIB_EXT}}; 644; root; root @@ -130,6 +131,7 @@ MAINTAINER: 'Microsoft Corporation' /opt/td-agent-bit; 755; root; root;sysdir /opt/td-agent-bit/bin; 755; root; root;sysdir +/etc/telegraf; 755; root; root;sysdir /opt/microsoft/omsagent/plugin/lib; 755; root; root; sysdir /opt/microsoft/omsagent/plugin/lib/application_insights; 755; root; root; sysdir From a5f32b82a798b83596774e6948116602eb0068ef Mon Sep 17 00:00:00 2001 From: Vishwanath Narasimhan Date: Mon, 14 Jan 2019 16:10:54 -0800 Subject: [PATCH 03/38] fix telegraf config. 
--- installer/conf/telegraf.conf | 42 +++++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/installer/conf/telegraf.conf b/installer/conf/telegraf.conf index 31619c45a..6891ae138 100644 --- a/installer/conf/telegraf.conf +++ b/installer/conf/telegraf.conf @@ -235,7 +235,47 @@ # # Perform string processing on tags, fields, and measurements -# [[processors.strings]] +[[processors.strings]] + [[processors.strings.replace]] + measurement = "disk" + old = "disk" + new = "node" + [[processors.strings.replace]] + field = "free" + old = "free" + new = "diskFreeBytes" + [[processors.strings.replace]] + field = "used" + old = "used" + new = "diskUsedBytes" + [[processors.strings.replace]] + field = "total" + old = "total" + new = "diskTotalBytes" + [[processors.strings.replace]] + field = "used_percent" + old = "used_percent" + new = "diskUsedPercentage" + [[processors.strings.replace]] + measurement = "net" + old = "net" + new = "node" + [[processors.strings.replace]] + field = "bytes_recv" + old = "bytes_recv" + new = "networkBytesReceived" + [[processors.strings.replace]] + field = "bytes_sent" + old = "bytes_sent" + new = "networkBytesReceived" + [[processors.strings.replace]] + field = "err_in" + old = "err_in" + new = "networkErrorIn" + [[processors.strings.replace]] + field = "err_out" + old = "err_out" + new = "networkErrorOut" # ## Convert a tag value to uppercase # # [[processors.strings.uppercase]] # # tag = "method" From a6c2d2b69b6163deee6d737fbd3e4d6255bee416 Mon Sep 17 00:00:00 2001 From: Vishwanath Narasimhan Date: Tue, 15 Jan 2019 10:10:05 -0800 Subject: [PATCH 04/38] fix file location & config --- installer/conf/telegraf.conf | 10 +++++----- installer/datafiles/base_container.data | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/installer/conf/telegraf.conf b/installer/conf/telegraf.conf index 6891ae138..c2ae42793 100644 --- a/installer/conf/telegraf.conf +++ b/installer/conf/telegraf.conf @@ 
-267,15 +267,15 @@ [[processors.strings.replace]] field = "bytes_sent" old = "bytes_sent" - new = "networkBytesReceived" + new = "networkBytesSent" [[processors.strings.replace]] field = "err_in" old = "err_in" - new = "networkErrorIn" + new = "networkErrorsIn" [[processors.strings.replace]] field = "err_out" old = "err_out" - new = "networkErrorOut" + new = "networkErrorsOut" # ## Convert a tag value to uppercase # # [[processors.strings.uppercase]] # # tag = "method" @@ -451,7 +451,7 @@ ## Ignore mount points by filesystem type. ignore_fs = ["tmpfs", "devtmpfs", "devfs", "overlay", "aufs", "squashfs"] - fieldpass = ["used_percent", "used", "free", "total"] + fieldpass = ["diskFreeBytes", "diskUsedBytes", "diskTotalBytes", "diskUsedPercentage"] taginclude = ["device","fstype","mode","path","host"] @@ -474,6 +474,6 @@ ## ignore_protocol_stats = true ## - fieldpass = ["bytes_sent","bytes_recv","err_in", "err_out"] + fieldpass = ["networkBytesReceived", "networkBytesSent", "networkErrorsIn", "networkErrorsOut"] taginclude = ["interface","host"] diff --git a/installer/datafiles/base_container.data b/installer/datafiles/base_container.data index 966fe44ee..255e6ebfd 100644 --- a/installer/datafiles/base_container.data +++ b/installer/datafiles/base_container.data @@ -91,7 +91,7 @@ MAINTAINER: 'Microsoft Corporation' /opt/td-agent-bit/bin/out_oms.so; intermediate/${{BUILD_CONFIGURATION}}/out_oms.so; 755; root; root /etc/opt/microsoft/docker-cimprov/td-agent-bit.conf; installer/conf/td-agent-bit.conf; 644; root; root /etc/opt/microsoft/docker-cimprov/out_oms.conf; installer/conf/out_oms.conf; 644; root; root -/etc/telegraf/telegraf.conf; installer/conf/telegraf.conf; 644; root; root +/etc/opt/microsoft/docker-cimprov/telegraf.conf; installer/conf/telegraf.conf; 644; root; root %Links /opt/omi/lib/libcontainer.${{SHLIB_EXT}}; /opt/microsoft/docker-cimprov/lib/libcontainer.${{SHLIB_EXT}}; 644; root; root From cdfafaaaa9be8aac0b3581c5e6a7b66a7a8f16bc Mon Sep 17 00:00:00 
2001 From: Vishwanath Narasimhan Date: Tue, 15 Jan 2019 12:58:29 -0800 Subject: [PATCH 05/38] update to config --- installer/conf/telegraf.conf | 58 +++++++++++++++--------------------- 1 file changed, 24 insertions(+), 34 deletions(-) diff --git a/installer/conf/telegraf.conf b/installer/conf/telegraf.conf index c2ae42793..f2e38e269 100644 --- a/installer/conf/telegraf.conf +++ b/installer/conf/telegraf.conf @@ -70,7 +70,7 @@ ## Run telegraf with debug log messages. debug = false ## Run telegraf in quiet mode (error log messages only). - quiet = true + quiet = false ## Specify the log file name. The empty string means to log to stderr. logfile = "/var/opt/microsoft/docker-cimprov/log/telegraf.log" @@ -235,47 +235,37 @@ # # Perform string processing on tags, fields, and measurements -[[processors.strings]] - [[processors.strings.replace]] +[[processors.rename]] + [[processors.rename.replace]] measurement = "disk" - old = "disk" - new = "node" - [[processors.strings.replace]] + dest = "node" + [[processors.rename.replace]] field = "free" - old = "free" - new = "diskFreeBytes" - [[processors.strings.replace]] + dest = "diskFreeBytes" + [[processors.rename.replace]] field = "used" - old = "used" - new = "diskUsedBytes" - [[processors.strings.replace]] + dest = "diskUsedBytes" + [[processors.rename.replace]] field = "total" - old = "total" - new = "diskTotalBytes" - [[processors.strings.replace]] + dest = "diskTotalBytes" + [[processors.rename.replace]] field = "used_percent" - old = "used_percent" - new = "diskUsedPercentage" - [[processors.strings.replace]] + dest = "diskUsedPercentage" + [[processors.rename.replace]] measurement = "net" - old = "net" - new = "node" - [[processors.strings.replace]] + dest = "node" + [[processors.rename.replace]] field = "bytes_recv" - old = "bytes_recv" - new = "networkBytesReceived" - [[processors.strings.replace]] + dest = "networkBytesReceived" + [[processors.rename.replace]] field = "bytes_sent" - old = "bytes_sent" - new = 
"networkBytesSent" - [[processors.strings.replace]] + dest = "networkBytesSent" + [[processors.rename.replace]] field = "err_in" - old = "err_in" - new = "networkErrorsIn" - [[processors.strings.replace]] + dest = "networkErrorsIn" + [[processors.rename.replace]] field = "err_out" - old = "err_out" - new = "networkErrorsOut" + dest = "networkErrorsOut" # ## Convert a tag value to uppercase # # [[processors.strings.uppercase]] # # tag = "method" @@ -451,7 +441,7 @@ ## Ignore mount points by filesystem type. ignore_fs = ["tmpfs", "devtmpfs", "devfs", "overlay", "aufs", "squashfs"] - fieldpass = ["diskFreeBytes", "diskUsedBytes", "diskTotalBytes", "diskUsedPercentage"] + fieldpass = ["free", "used", "total", "used_percent"] taginclude = ["device","fstype","mode","path","host"] @@ -474,6 +464,6 @@ ## ignore_protocol_stats = true ## - fieldpass = ["networkBytesReceived", "networkBytesSent", "networkErrorsIn", "networkErrorsOut"] + fieldpass = ["bytes_recv", "bytes_sent", "err_in", "err_out"] taginclude = ["interface","host"] From 5668ce72dfeedad8e8debec69c28290b8459dba9 Mon Sep 17 00:00:00 2001 From: Vishwanath Narasimhan Date: Tue, 15 Jan 2019 13:38:14 -0800 Subject: [PATCH 06/38] fix namespace --- installer/conf/telegraf.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/installer/conf/telegraf.conf b/installer/conf/telegraf.conf index f2e38e269..2bc4b3625 100644 --- a/installer/conf/telegraf.conf +++ b/installer/conf/telegraf.conf @@ -90,7 +90,7 @@ # timeout = "20s" ## Set the namespace prefix, defaults to "Telegraf/". - namespace_prefix = "ContainerInsights/" + namespace_prefix = "Container.Insights/" ## Azure Monitor doesn't have a string value type, so convert string ## fields to dimensions (a.k.a. tags) if enabled. 
Azure Monitor allows From 6b2472590f2df8af75eb0132f31ef4c6773ba790 Mon Sep 17 00:00:00 2001 From: Vishwanath Narasimhan Date: Tue, 15 Jan 2019 16:39:48 -0800 Subject: [PATCH 07/38] trying different namespace and also debug=true --- installer/conf/telegraf.conf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/installer/conf/telegraf.conf b/installer/conf/telegraf.conf index 2bc4b3625..9cbb287f6 100644 --- a/installer/conf/telegraf.conf +++ b/installer/conf/telegraf.conf @@ -68,7 +68,7 @@ ## Logging configuration: ## Run telegraf with debug log messages. - debug = false + debug = true ## Run telegraf in quiet mode (error log messages only). quiet = false ## Specify the log file name. The empty string means to log to stderr. @@ -90,7 +90,7 @@ # timeout = "20s" ## Set the namespace prefix, defaults to "Telegraf/". - namespace_prefix = "Container.Insights/" + namespace_prefix = "Container.Insights3/" ## Azure Monitor doesn't have a string value type, so convert string ## fields to dimensions (a.k.a. tags) if enabled. Azure Monitor allows From 2cc4cf976e1d9cc0da38a1806fc4f85ed7d76794 Mon Sep 17 00:00:00 2001 From: Vishwanath Narasimhan Date: Tue, 15 Jan 2019 19:40:17 -0800 Subject: [PATCH 08/38] add placeholder for nodename --- installer/conf/telegraf.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/installer/conf/telegraf.conf b/installer/conf/telegraf.conf index 9cbb287f6..1c38bbcfe 100644 --- a/installer/conf/telegraf.conf +++ b/installer/conf/telegraf.conf @@ -75,7 +75,7 @@ logfile = "/var/opt/microsoft/docker-cimprov/log/telegraf.log" ## Override default hostname, if empty use os.Hostname() - hostname = "$nodename" + hostname = "placeholder_hostname" ## If set to true, do no set the "host" tag in the telegraf agent. 
omit_hostname = false From 53b302c7f7e018c878163e1042fbab0461e6fad1 Mon Sep 17 00:00:00 2001 From: Vishwanath Narasimhan Date: Tue, 15 Jan 2019 19:48:50 -0800 Subject: [PATCH 09/38] change namespace --- installer/conf/telegraf.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/installer/conf/telegraf.conf b/installer/conf/telegraf.conf index 1c38bbcfe..21d23fd71 100644 --- a/installer/conf/telegraf.conf +++ b/installer/conf/telegraf.conf @@ -90,7 +90,7 @@ # timeout = "20s" ## Set the namespace prefix, defaults to "Telegraf/". - namespace_prefix = "Container.Insights3/" + namespace_prefix = "Insights.Container/" ## Azure Monitor doesn't have a string value type, so convert string ## fields to dimensions (a.k.a. tags) if enabled. Azure Monitor allows From dd7d618fe38cd9fdded687f34de056bd68ebda26 Mon Sep 17 00:00:00 2001 From: Vishwanath Narasimhan Date: Thu, 24 Jan 2019 14:44:47 -0800 Subject: [PATCH 10/38] updated config --- installer/conf/telegraf.conf | 87 +++++++++++++++++++++++++++++++++--- 1 file changed, 81 insertions(+), 6 deletions(-) diff --git a/installer/conf/telegraf.conf b/installer/conf/telegraf.conf index 21d23fd71..d94fcc4e4 100644 --- a/installer/conf/telegraf.conf +++ b/installer/conf/telegraf.conf @@ -90,7 +90,7 @@ # timeout = "20s" ## Set the namespace prefix, defaults to "Telegraf/". - namespace_prefix = "Insights.Container/" + namespace_prefix = "Insights.Container2/" ## Azure Monitor doesn't have a string value type, so convert string ## fields to dimensions (a.k.a. tags) if enabled. 
Azure Monitor allows @@ -256,16 +256,43 @@ dest = "node" [[processors.rename.replace]] field = "bytes_recv" - dest = "networkBytesReceived" + dest = "networkBytesReceivedTotal" [[processors.rename.replace]] field = "bytes_sent" - dest = "networkBytesSent" + dest = "networkBytesSentTotal" [[processors.rename.replace]] field = "err_in" - dest = "networkErrorsIn" + dest = "networkErrorsInTotal" [[processors.rename.replace]] field = "err_out" - dest = "networkErrorsOut" + dest = "networkErrorsOutTotal" + [[processors.rename.replace]] + measurement = "diskio" + dest = "node" + [[processors.rename.replace]] + field = "iops_in_progress" + dest = "diskIopsInProgress" + [[processors.rename.replace]] + measurement = "kubernetes_pod_volume" + dest = "pod" + [[processors.rename.replace]] + field = "used_bytes" + dest = "podVolumeUsedBytes" + [[processors.rename.replace]] + field = "capacity_bytes" + dest = "podVolumeCapacityBytes" + [[processors.rename.replace]] + field = "available_bytes" + dest = "podVolumeAvailableBytes" + [[processors.rename.replace]] + measurement = "kubernetes_pod_network" + dest = "pod" + [[processors.rename.replace]] + field = "tx_errors" + dest = "podNetworkTxErrorsTotal" + [[processors.rename.replace]] + field = "rx_errors" + dest = "podNetworkRxErrorsTotal" # ## Convert a tag value to uppercase # # [[processors.strings.uppercase]] # # tag = "method" @@ -464,6 +491,54 @@ ## ignore_protocol_stats = true ## - fieldpass = ["bytes_recv", "bytes_sent", "err_in", "err_out"] + #fieldpass = ["bytes_recv", "bytes_sent", "err_in", "err_out"] + fieldpass = ["err_in", "err_out"] taginclude = ["interface","host"] +# Read metrics about disk IO by device +[[inputs.diskio]] + ## By default, telegraf will gather stats for all devices including + ## disk partitions. + ## Setting devices will restrict the stats to the specified devices. + # devices = ["sda", "sdb", "vd*"] + ## Uncomment the following line if you need disk serial numbers. 
+ # skip_serial_number = false + # + ## On systems which support it, device metadata can be added in the form of + ## tags. + ## Currently only Linux is supported via udev properties. You can view + ## available properties for a device by running: + ## 'udevadm info -q property -n /dev/sda' + # device_tags = ["ID_FS_TYPE", "ID_FS_USAGE"] + # + ## Using the same metadata source as device_tags, you can also customize the + ## name of the device via templates. + ## The 'name_templates' parameter is a list of templates to try and apply to + ## the device. The template may contain variables in the form of '$PROPERTY' or + ## '${PROPERTY}'. The first template which does not contain any variables not + ## present for the device is used as the device name tag. + ## The typical use case is for LVM volumes, to get the VG/LV name instead of + ## the near-meaningless DM-0 name. + # name_templates = ["$ID_FS_LABEL","$DM_VG_NAME/$DM_LV_NAME"] + fieldpass = ["iops_in_progress"] + taginclude = ["name","host"] +# Read metrics from the kubernetes kubelet api +[[inputs.kubernetes]] + ## URL for the kubelet + #url = "http://1.1.1.1:10255" + url = "http://placeholder_nodeip:10255/stats/summary" + + ## Use bearer token for authorization + # bearer_token = /path/to/bearer/token + + ## Set response_timeout (default 5 seconds) + # response_timeout = "5s" + + ## Optional TLS Config + # tls_ca = /path/to/cafile + # tls_cert = /path/to/certfile + # tls_key = /path/to/keyfile + ## Use TLS but skip chain & host verification + # insecure_skip_verify = false + fieldpass = ["used_bytes", "capacity_bytes", "available_bytes", "tx_errors", "rx_errors" ] + taginclude = ["volume_name","host","namespace","pod_name",] From 702e5082396d56d0bc9174bfb24832aee7b89844 Mon Sep 17 00:00:00 2001 From: Vishwanath Narasimhan Date: Fri, 25 Jan 2019 12:58:52 -0800 Subject: [PATCH 11/38] fix uri --- installer/conf/telegraf.conf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/installer/conf/telegraf.conf b/installer/conf/telegraf.conf index d94fcc4e4..d7071db8b 100644 --- a/installer/conf/telegraf.conf +++ b/installer/conf/telegraf.conf @@ -90,7 +90,7 @@ # timeout = "20s" ## Set the namespace prefix, defaults to "Telegraf/". - namespace_prefix = "Insights.Container2/" + namespace_prefix = "Insights.Containers/" ## Azure Monitor doesn't have a string value type, so convert string ## fields to dimensions (a.k.a. tags) if enabled. Azure Monitor allows @@ -526,7 +526,7 @@ [[inputs.kubernetes]] ## URL for the kubelet #url = "http://1.1.1.1:10255" - url = "http://placeholder_nodeip:10255/stats/summary" + url = "http://placeholder_nodeip:10255" ## Use bearer token for authorization # bearer_token = /path/to/bearer/token From 34e374d2cd832b4c2dbe98d7d88a9bbc4dd40441 Mon Sep 17 00:00:00 2001 From: Vishwanath Narasimhan Date: Tue, 29 Jan 2019 09:55:26 -0800 Subject: [PATCH 12/38] fix azMon settings --- installer/conf/telegraf.conf | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/installer/conf/telegraf.conf b/installer/conf/telegraf.conf index d7071db8b..85f438ca3 100644 --- a/installer/conf/telegraf.conf +++ b/installer/conf/telegraf.conf @@ -96,18 +96,25 @@ ## fields to dimensions (a.k.a. tags) if enabled. Azure Monitor allows ## a maximum of 10 dimensions so Telegraf will only send the first 10 ## alphanumeric dimensions. - strings_as_dimensions = false + strings_as_dimensions = true ## Both region and resource_id must be set or be available via the ## Instance Metadata service on Azure Virtual Machines. # ## Azure Region to publish metrics against. ## ex: region = "southcentralus" - #region = "westeurope" + region = "placeholder_region" # ## The Azure Resource ID against which metric will be logged, e.g. 
- ## ex: resource_id = "/subscriptions//resourceGroups//providers/Microsoft.Compute/virtualMachines/" - # resource_id = "" + #resource_id = "/subscriptions//resourceGroups//providers/Microsoft.Compute/virtualMachines/" + resource_id = "placeholder_resource_id" + + azure_tenant_id = "placeholder_azure_tenant_id" + + azure_client_id = "placeholder_azure_client_id" + + azure_client_secret = "placeholder_azure_client_secret" + From 0d1b3c13e6e9412b73a44c8ff0572d1e9a7778cc Mon Sep 17 00:00:00 2001 From: Vishwanath Narasimhan Date: Tue, 29 Jan 2019 13:58:08 -0800 Subject: [PATCH 13/38] remove aad settings --- installer/conf/telegraf.conf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/installer/conf/telegraf.conf b/installer/conf/telegraf.conf index 85f438ca3..1eb0173e6 100644 --- a/installer/conf/telegraf.conf +++ b/installer/conf/telegraf.conf @@ -109,11 +109,11 @@ #resource_id = "/subscriptions//resourceGroups//providers/Microsoft.Compute/virtualMachines/" resource_id = "placeholder_resource_id" - azure_tenant_id = "placeholder_azure_tenant_id" + #azure_tenant_id = "placeholder_azure_tenant_id" - azure_client_id = "placeholder_azure_client_id" + #azure_client_id = "placeholder_azure_client_id" - azure_client_secret = "placeholder_azure_client_secret" + #azure_client_secret = "placeholder_azure_client_secret" From 36c46a29f687bb98967ab8748052fd89b9abdb56 Mon Sep 17 00:00:00 2001 From: Vishwanath Narasimhan Date: Wed, 30 Jan 2019 14:20:14 -0800 Subject: [PATCH 14/38] add custom metrics regions --- installer/conf/custom_metrics_regions.conf | 7 +++++++ installer/conf/telegraf.conf | 2 +- installer/datafiles/base_container.data | 1 + 3 files changed, 9 insertions(+), 1 deletion(-) create mode 100644 installer/conf/custom_metrics_regions.conf diff --git a/installer/conf/custom_metrics_regions.conf b/installer/conf/custom_metrics_regions.conf new file mode 100644 index 000000000..bf548abdd --- /dev/null +++ 
b/installer/conf/custom_metrics_regions.conf @@ -0,0 +1,7 @@ +eastus +southcentralus +westcentralus +westus2 +southeastasia +northeurope +westeurope \ No newline at end of file diff --git a/installer/conf/telegraf.conf b/installer/conf/telegraf.conf index 1eb0173e6..216a7a250 100644 --- a/installer/conf/telegraf.conf +++ b/installer/conf/telegraf.conf @@ -90,7 +90,7 @@ # timeout = "20s" ## Set the namespace prefix, defaults to "Telegraf/". - namespace_prefix = "Insights.Containers/" + namespace_prefix = "Insights.Container/" ## Azure Monitor doesn't have a string value type, so convert string ## fields to dimensions (a.k.a. tags) if enabled. Azure Monitor allows diff --git a/installer/datafiles/base_container.data b/installer/datafiles/base_container.data index 255e6ebfd..155f5ef8c 100644 --- a/installer/datafiles/base_container.data +++ b/installer/datafiles/base_container.data @@ -92,6 +92,7 @@ MAINTAINER: 'Microsoft Corporation' /etc/opt/microsoft/docker-cimprov/td-agent-bit.conf; installer/conf/td-agent-bit.conf; 644; root; root /etc/opt/microsoft/docker-cimprov/out_oms.conf; installer/conf/out_oms.conf; 644; root; root /etc/opt/microsoft/docker-cimprov/telegraf.conf; installer/conf/telegraf.conf; 644; root; root +/etc/opt/microsoft/docker-cimprov/custom_metrics_regions.conf; installer/conf/custom_metrics_regions.conf; 644; root; root %Links /opt/omi/lib/libcontainer.${{SHLIB_EXT}}; /opt/microsoft/docker-cimprov/lib/libcontainer.${{SHLIB_EXT}}; 644; root; root From d1fc7114f0f1d90bb3191b82bd2ee0fc63441315 Mon Sep 17 00:00:00 2001 From: Vishwanath Narasimhan Date: Wed, 30 Jan 2019 18:57:34 -0800 Subject: [PATCH 15/38] fix config --- installer/conf/telegraf.conf | 65 ++++++++++++++++++++++++++++++++---- 1 file changed, 59 insertions(+), 6 deletions(-) diff --git a/installer/conf/telegraf.conf b/installer/conf/telegraf.conf index 216a7a250..12a42d493 100644 --- a/installer/conf/telegraf.conf +++ b/installer/conf/telegraf.conf @@ -245,7 +245,7 @@ 
[[processors.rename]] [[processors.rename.replace]] measurement = "disk" - dest = "node" + dest = "nodes" [[processors.rename.replace]] field = "free" dest = "diskFreeBytes" @@ -260,7 +260,7 @@ dest = "diskUsedPercentage" [[processors.rename.replace]] measurement = "net" - dest = "node" + dest = "nodes" [[processors.rename.replace]] field = "bytes_recv" dest = "networkBytesReceivedTotal" @@ -275,13 +275,13 @@ dest = "networkErrorsOutTotal" [[processors.rename.replace]] measurement = "diskio" - dest = "node" + dest = "nodes" [[processors.rename.replace]] field = "iops_in_progress" dest = "diskIopsInProgress" [[processors.rename.replace]] measurement = "kubernetes_pod_volume" - dest = "pod" + dest = "pods" [[processors.rename.replace]] field = "used_bytes" dest = "podVolumeUsedBytes" @@ -293,13 +293,19 @@ dest = "podVolumeAvailableBytes" [[processors.rename.replace]] measurement = "kubernetes_pod_network" - dest = "pod" + dest = "pods" [[processors.rename.replace]] field = "tx_errors" dest = "podNetworkTxErrorsTotal" [[processors.rename.replace]] field = "rx_errors" dest = "podNetworkRxErrorsTotal" + [[processors.rename.replace]] + measurement = "docker" + dest = "containers" + [[processors.rename.replace]] + measurement = "docker_container_status" + dest = "containers" # ## Convert a tag value to uppercase # # [[processors.strings.uppercase]] # # tag = "method" @@ -548,4 +554,51 @@ ## Use TLS but skip chain & host verification # insecure_skip_verify = false fieldpass = ["used_bytes", "capacity_bytes", "available_bytes", "tx_errors", "rx_errors" ] - taginclude = ["volume_name","host","namespace","pod_name",] + taginclude = ["volume_name","host","namespace","pod_name"] +# Read metrics about docker containers +[[inputs.docker]] + ## Docker Endpoint + ## To use TCP, set endpoint = "tcp://[ip]:[port]" + ## To use environment variables (ie, docker-machine), set endpoint = "ENV" + endpoint = "unix:///var/run/host/docker.sock" + + ## Set to true to collect Swarm 
metrics(desired_replicas, running_replicas) + gather_services = false + + ## Only collect metrics for these containers, collect all if empty + container_names = [] + + ## Containers to include and exclude. Globs accepted. + ## Note that an empty array for both will include all containers + container_name_include = [] + container_name_exclude = [] + + ## Container states to include and exclude. Globs accepted. + ## When empty only containers in the "running" state will be captured. + container_state_include = ['*'] + # container_state_exclude = [] + + ## Timeout for docker list, info, and stats commands + timeout = "5s" + + ## Whether to report for each container per-device blkio (8:0, 8:1...) and + ## network (eth0, eth1, ...) stats or not + perdevice = true + ## Whether to report for each container total blkio and network stats or not + total = true + ## Which environment variables should we use as a tag + ##tag_env = ["JAVA_HOME", "HEAP_SIZE"] + + ## docker labels to include and exclude as tags. Globs accepted. 
+ ## Note that an empty array for both will include all labels as tags + docker_label_include = [] + docker_label_exclude = [] + + ## Optional TLS Config + # tls_ca = "/etc/telegraf/ca.pem" + # tls_cert = "/etc/telegraf/cert.pem" + # tls_key = "/etc/telegraf/key.pem" + ## Use TLS but skip chain & host verification + # insecure_skip_verify = false + fieldpass = ["n_containers", "n_containers_running", "n_containers_stopped", "n_containers_paused", "n_images" ,"oomkilled", "exitcode" ] + taginclude = ["host", "container_name", "container_status", "container_image"] \ No newline at end of file From e68bba5b07565c00e3521ffef916ee16cd8dc290 Mon Sep 17 00:00:00 2001 From: Vishwanath Narasimhan Date: Thu, 7 Feb 2019 20:13:31 -0800 Subject: [PATCH 16/38] add support for replica-set config --- installer/conf/telegraf-rs.conf | 528 ++++++++++++++++++++++++ installer/conf/telegraf.conf | 84 ++-- installer/datafiles/base_container.data | 1 + 3 files changed, 569 insertions(+), 44 deletions(-) create mode 100644 installer/conf/telegraf-rs.conf diff --git a/installer/conf/telegraf-rs.conf b/installer/conf/telegraf-rs.conf new file mode 100644 index 000000000..fc8abfe26 --- /dev/null +++ b/installer/conf/telegraf-rs.conf @@ -0,0 +1,528 @@ +# Telegraf Configuration +# +# Telegraf is entirely plugin driven. All metrics are gathered from the +# declared inputs, and sent to the declared outputs. +# +# Plugins must be declared in here to be active. +# To deactivate a plugin, comment out the name and any variables. +# +# Use 'telegraf -config telegraf.conf -test' to see what metrics a config +# file would generate. +# +# Environment variables can be used anywhere in this config file, simply prepend +# them with $. For strings the variable must be within quotes (ie, "$STR_VAR"), +# for numbers and booleans they should be plain (ie, $INT_VAR, $BOOL_VAR) + + +# Global tags can be specified here in key="value" format. 
+[global_tags] + # dc = "us-east-1" # will tag all metrics with dc=us-east-1 + # rack = "1a" + ## Environment variables can be used as tags, and throughout the config file + # user = "$USER" + # cluster = "$ACS_RESOURCE_NAME" + #node = $NODE_IP + AgentVersion = "$AGENT_VERSION" + AKS_RESOURCE_ID = "$AKS_RESOURCE_ID" + Region = "$AKS_REGION" + ClusterName = "$AKS_CLUSTER_NAME" + ClusterType = "AKS" + Computer = "placeholder_hostname" + ControllerType = "$CONTROLLER_TYPE" + +# Configuration for telegraf agent +[agent] + ## Default data collection interval for all inputs + interval = "60s" + ## Rounds collection interval to 'interval' + ## ie, if interval="10s" then always collect on :00, :10, :20, etc. + round_interval = true + + ## Telegraf will send metrics to outputs in batches of at most + ## metric_batch_size metrics. + ## This controls the size of writes that Telegraf sends to output plugins. + metric_batch_size = 1000 + + ## For failed writes, telegraf will cache metric_buffer_limit metrics for each + ## output, and will flush this buffer on a successful write. Oldest metrics + ## are dropped first when this buffer fills. + ## This buffer only fills when writes fail to output plugin(s). + metric_buffer_limit = 10000 + + ## Collection jitter is used to jitter the collection by a random amount. + ## Each plugin will sleep for a random time within jitter before collecting. + ## This can be used to avoid many plugins querying things like sysfs at the + ## same time, which can have a measurable effect on the system. + collection_jitter = "0s" + + ## Default flushing interval for all outputs. You shouldn't set this below + ## interval. Maximum flush_interval will be flush_interval + flush_jitter + flush_interval = "10s" + ## Jitter the flush interval by a random amount. This is primarily to avoid + ## large write spikes for users running a large number of telegraf instances. 
+ ## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s + flush_jitter = "0s" + + ## By default or when set to "0s", precision will be set to the same + ## timestamp order as the collection interval, with the maximum being 1s. + ## ie, when interval = "10s", precision will be "1s" + ## when interval = "250ms", precision will be "1ms" + ## Precision will NOT be used for service inputs. It is up to each individual + ## service input to set the timestamp at the appropriate precision. + ## Valid time units are "ns", "us" (or "µs"), "ms", "s". + precision = "" + + ## Logging configuration: + ## Run telegraf with debug log messages. + debug = true + ## Run telegraf in quiet mode (error log messages only). + quiet = false + ## Specify the log file name. The empty string means to log to stderr. + logfile = "/var/opt/microsoft/docker-cimprov/log/telegraf.log" + + ## Override default hostname, if empty use os.Hostname() + #hostname = "placeholder_hostname" + ## If set to true, do no set the "host" tag in the telegraf agent. + omit_hostname = true + + +############################################################################### +# OUTPUT PLUGINS # +############################################################################### + +# Send aggregate metrics to Azure Monitor +[[outputs.azure_monitor]] + ## Timeout for HTTP writes. + # timeout = "20s" + + ## Set the namespace prefix, defaults to "Telegraf/". + namespace_prefix = "Insights.Container/" + + ## Azure Monitor doesn't have a string value type, so convert string + ## fields to dimensions (a.k.a. tags) if enabled. Azure Monitor allows + ## a maximum of 10 dimensions so Telegraf will only send the first 10 + ## alphanumeric dimensions. + strings_as_dimensions = true + + ## Both region and resource_id must be set or be available via the + ## Instance Metadata service on Azure Virtual Machines. + # + ## Azure Region to publish metrics against. 
+ ## ex: region = "southcentralus" + region = "placeholder_region" + # + ## The Azure Resource ID against which metric will be logged, e.g. + #resource_id = "/subscriptions//resourceGroups//providers/Microsoft.Compute/virtualMachines/" + resource_id = "placeholder_resource_id" + + #azure_tenant_id = "placeholder_azure_tenant_id" + + #azure_client_id = "placeholder_azure_client_id" + + #azure_client_secret = "placeholder_azure_client_secret" + + #namepass = ["nodes", "pods", "containers","prometheus"] + namedrop = ["filestat"] + tagdrop = ["AgentVersion","AKS_RESOURCE_ID","Region","ClusterName","ClusterType", "Computer", "ControllerType"] + +[[outputs.application_insights]] + ## Instrumentation key of the Application Insights resource. + instrumentation_key = "$APPLICATIONINSIGHTS_KEY" + + ## Timeout for closing (default: 5s). + # timeout = "5s" + + ## Enable additional diagnostic logging. + # enable_diagnostic_logging = false + + ## Context Tag Sources add Application Insights context tags to a tag value. + ## + ## For list of allowed context tag keys see: + ## https://github.com/Microsoft/ApplicationInsights-Go/blob/master/appinsights/contracts/contexttagkeys.go + # [outputs.application_insights.context_tag_sources] + # "ai.cloud.role" = "kubernetes_container_name" + # "ai.cloud.roleInstance" = "kubernetes_pod_name" + namepass = ["filestat"] + +############################################################################### +# PROCESSOR PLUGINS # +############################################################################### + +# # Convert values to another metric value type +# [[processors.converter]] +# ## Tags to convert +# ## +# ## The table key determines the target type, and the array of key-values +# ## select the keys to convert. The array may contain globs. +# ## = [...] 
+# [processors.converter.tags] +# string = ["device"] +# integer = [] +# unsigned = [] +# boolean = [] +# float = [] +# +# ## Fields to convert +# ## +# ## The table key determines the target type, and the array of key-values +# ## select the keys to convert. The array may contain globs. +# ## = [...] +# [processors.converter.fields] +# tag = ["host"] +# string = [] +# integer = [] +# unsigned = [] +# boolean = [] +# float = [] + + +# # Map enum values according to given table. +# [[processors.enum]] +# [[processors.enum.mapping]] +# ## Name of the field to map +# field = "status" +# +# ## Destination field to be used for the mapped value. By default the source +# ## field is used, overwriting the original value. +# # dest = "status_code" +# +# ## Default value to be used for all values not contained in the mapping +# ## table. When unset, the unmodified value for the field will be used if no +# ## match is found. +# # default = 0 +# +# ## Table of mappings +# [processors.enum.mapping.value_mappings] +# green = 1 +# yellow = 2 +# red = 3 + + +# # Apply metric modifications using override semantics. +# [[processors.override]] +# ## All modifications on inputs and aggregators can be overridden: +# # name_override = "new_name" +# # name_prefix = "new_name_prefix" +# # name_suffix = "new_name_suffix" +# +# ## Tags to be added (all values must be strings) +# # [processors.override.tags] +# # additional_tag = "tag_value" + + +# # Parse a value in a specified field/tag(s) and add the result in a new metric +# [[processors.parser]] +# ## The name of the fields whose value will be parsed. +# parse_fields = [] +# +# ## If true, incoming metrics are not emitted. +# drop_original = false +# +# ## If set to override, emitted metrics will be merged by overriding the +# ## original metric using the newly parsed metrics. 
+# merge = "override" +# +# ## The dataformat to be read from files +# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md +# data_format = "influx" + + +# # Print all metrics that pass through this filter. +# [[processors.printer]] + + +# # Transforms tag and field values with regex pattern +# [[processors.regex]] +# ## Tag and field conversions defined in a separate sub-tables +# # [[processors.regex.tags]] +# # ## Tag to change +# # key = "resp_code" +# # ## Regular expression to match on a tag value +# # pattern = "^(\\d)\\d\\d$" +# # ## Pattern for constructing a new value (${1} represents first subgroup) +# # replacement = "${1}xx" +# +# # [[processors.regex.fields]] +# # key = "request" +# # ## All the power of the Go regular expressions available here +# # ## For example, named subgroups +# # pattern = "^/api(?P/[\\w/]+)\\S*" +# # replacement = "${method}" +# # ## If result_key is present, a new field will be created +# # ## instead of changing existing field +# # result_key = "method" +# +# ## Multiple conversions may be applied for one field sequentially +# ## Let's extract one more value +# # [[processors.regex.fields]] +# # key = "request" +# # pattern = ".*category=(\\w+).*" +# # replacement = "${1}" +# # result_key = "search_category" + + +# # Rename measurements, tags, and fields that pass through this filter. 
+# [[processors.rename]] + + +# # Perform string processing on tags, fields, and measurements +[[processors.rename]] + [[processors.rename.replace]] + measurement = "kubernetes_daemonset" + dest = "daemonsets" + [[processors.rename.replace]] + measurement = "kubernetes_deployment" + dest = "deployments" + [[processors.rename.replace]] + measurement = "kubernetes_node" + dest = "nodes" + [[processors.rename.replace]] + tag = "node_name" + dest = "host" + [[processors.rename.replace]] + measurement = "kubernetes_pod_container" + dest = "containers" + +# ## Convert a tag value to uppercase +# # [[processors.strings.uppercase]] +# # tag = "method" +# +# ## Convert a field value to lowercase and store in a new field +# # [[processors.strings.lowercase]] +# # field = "uri_stem" +# # dest = "uri_stem_normalised" +# +# ## Trim leading and trailing whitespace using the default cutset +# # [[processors.strings.trim]] +# # field = "message" +# +# ## Trim leading characters in cutset +# # [[processors.strings.trim_left]] +# # field = "message" +# # cutset = "\t" +# +# ## Trim trailing characters in cutset +# # [[processors.strings.trim_right]] +# # field = "message" +# # cutset = "\r\n" +# +# ## Trim the given prefix from the field +# # [[processors.strings.trim_prefix]] +# # field = "my_value" +# # prefix = "my_" +# +# ## Trim the given suffix from the field +# # [[processors.strings.trim_suffix]] +# # field = "read_count" +# # suffix = "_count" + + +# # Print all metrics that pass through this filter. +# [[processors.topk]] +# ## How many seconds between aggregations +# # period = 10 +# +# ## How many top metrics to return +# # k = 10 +# +# ## Over which tags should the aggregation be done. Globs can be specified, in +# ## which case any tag matching the glob will aggregated over. 
If set to an +# ## empty list is no aggregation over tags is done +# # group_by = ['*'] +# +# ## Over which fields are the top k are calculated +# # fields = ["value"] +# +# ## What aggregation to use. Options: sum, mean, min, max +# # aggregation = "mean" +# +# ## Instead of the top k largest metrics, return the bottom k lowest metrics +# # bottomk = false +# +# ## The plugin assigns each metric a GroupBy tag generated from its name and +# ## tags. If this setting is different than "" the plugin will add a +# ## tag (which name will be the value of this setting) to each metric with +# ## the value of the calculated GroupBy tag. Useful for debugging +# # add_groupby_tag = "" +# +# ## These settings provide a way to know the position of each metric in +# ## the top k. The 'add_rank_field' setting allows to specify for which +# ## fields the position is required. If the list is non empty, then a field +# ## will be added to each and every metric for each string present in this +# ## setting. This field will contain the ranking of the group that +# ## the metric belonged to when aggregated over that field. +# ## The name of the field will be set to the name of the aggregation field, +# ## suffixed with the string '_topk_rank' +# # add_rank_fields = [] +# +# ## These settings provide a way to know what values the plugin is generating +# ## when aggregating metrics. The 'add_agregate_field' setting allows to +# ## specify for which fields the final aggregation value is required. If the +# ## list is non empty, then a field will be added to each every metric for +# ## each field present in this setting. This field will contain +# ## the computed aggregation for the group that the metric belonged to when +# ## aggregated over that field. 
+# ## The name of the field will be set to the name of the aggregation field, +# ## suffixed with the string '_topk_aggregate' +# # add_aggregate_fields = [] + + + +############################################################################### +# AGGREGATOR PLUGINS # +############################################################################### + +# # Keep the aggregate basicstats of each metric passing through. +# [[aggregators.basicstats]] +# ## General Aggregator Arguments: +# ## The period on which to flush & clear the aggregator. +# period = "30s" +# ## If true, the original metric will be dropped by the +# ## aggregator and will not get sent to the output plugins. +# drop_original = false + + +# # Create aggregate histograms. +# [[aggregators.histogram]] +# ## The period in which to flush the aggregator. +# period = "30s" +# +# ## If true, the original metric will be dropped by the +# ## aggregator and will not get sent to the output plugins. +# drop_original = false +# +# ## Example config that aggregates all fields of the metric. +# # [[aggregators.histogram.config]] +# # ## The set of buckets. +# # buckets = [0.0, 15.6, 34.5, 49.1, 71.5, 80.5, 94.5, 100.0] +# # ## The name of metric. +# # measurement_name = "cpu" +# +# ## Example config that aggregates only specific fields of the metric. +# # [[aggregators.histogram.config]] +# # ## The set of buckets. +# # buckets = [0.0, 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0] +# # ## The name of metric. +# # measurement_name = "diskio" +# # ## The concrete fields of metric +# # fields = ["io_time", "read_time", "write_time"] + + +# # Keep the aggregate min/max of each metric passing through. +# [[aggregators.minmax]] +# ## General Aggregator Arguments: +# ## The period on which to flush & clear the aggregator. +# period = "30s" +# ## If true, the original metric will be dropped by the +# ## aggregator and will not get sent to the output plugins. 
+# drop_original = false + + +# # Count the occurance of values in fields. +# [[aggregators.valuecounter]] +# ## General Aggregator Arguments: +# ## The period on which to flush & clear the aggregator. +# period = "30s" +# ## If true, the original metric will be dropped by the +# ## aggregator and will not get sent to the output plugins. +# drop_original = false +# ## The fields for which the values will be counted +# fields = [] + + + +############################################################################### +# INPUT PLUGINS # +############################################################################### + +# Read metrics about cpu usage +#[[inputs.cpu]] + ## Whether to report per-cpu stats or not +# percpu = false + ## Whether to report total system cpu stats or not +# totalcpu = true + ## If true, collect raw CPU time metrics. +# collect_cpu_time = false + ## If true, compute and report the sum of all non-idle CPU states. +# report_active = true +# fieldpass = ["usage_active","cluster","node","host","device"] +# taginclude = ["cluster","cpu","node"] + + + + # Read metrics from one or many prometheus clients +#[[inputs.prometheus]] + ## An array of urls to scrape metrics from. +# urls = ["https://$METRICS_SERVER_SERVICE_HOST/metrics"] + + ## An array of Kubernetes services to scrape metrics from. + # kubernetes_services = ["http://my-service-dns.my-namespace:9100/metrics"] + + ## Kubernetes config file to create client from. + # kube_config = "/path/to/kubernetes.config" + + ## Scrape Kubernetes pods for the following prometheus annotations: + ## - prometheus.io/scrape: Enable scraping for this pod + ## - prometheus.io/scheme: If the metrics endpoint is secured then you will need to + ## set this to 'https' & most likely set the tls config. + ## - prometheus.io/path: If the metrics path is not /metrics, define it with this annotation. 
+ ## - prometheus.io/port: If port is not 9102 use this annotation + # monitor_kubernetes_pods = true + + ## Use bearer token for authorization +# bearer_token = "/var/run/secrets/kubernetes.io/serviceaccount/token" + + ## Specify timeout duration for slower prometheus clients (default is 3s) +# response_timeout = "15s" + + ## Optional TLS Config +# tls_ca = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + # tls_cert = /path/to/certfile + # tls_key = /path/to/keyfile + ## Use TLS but skip chain & host verification +# insecure_skip_verify = true + # Read stats about given file(s) +[[inputs.filestat]] + ## Files to gather stats about. + ## These accept standard unix glob matching rules, but with the addition of + ## ** as a "super asterisk". See https://github.com/gobwas/glob. + files = ["/var/opt/microsoft/docker-cimprov/log/telegraf.log"] + ## If true, read the entire file and calculate an md5 checksum. + md5 = false +[[inputs.kube_inventory]] + ## URL for the Kubernetes API + #url = "https://127.0.0.1" + url = "$K8SSERVICEHOST" + + ## Namespace to use + # namespace = "default" + + ## Use bearer token for authorization. ('bearer_token' takes priority) + bearer_token = "/var/run/secrets/kubernetes.io/serviceaccount/token" + ## OR + # bearer_token_string = "abc_123" + + ## Set response_timeout (default 5 seconds) + response_timeout = "15s" + + ## Optional Resources to exclude from gathering + ## Leave them with blank with try to gather everything available. + ## Values can be - "daemonsets", deployments", "nodes", "persistentvolumes", + ## "persistentvolumeclaims", "pods", "statefulsets" + # resource_exclude = [ "deployments", "nodes", "statefulsets" ] + + ## Optional Resources to include when gathering + ## Overrides resource_exclude if both set. 
+ # resource_include = [ "deployments", "nodes", "statefulsets" ] + + ## Optional TLS Config + tls_ca = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + # tls_cert = "/path/to/certfile" + # tls_key = "/path/to/keyfile" + ## Use TLS but skip chain & host verification + insecure_skip_verify = true + + fieldpass = ["current_number_scheduled", "desired_number_scheduled", "number_available", "number_unavailable", "number_ready", "replicas_available", "replicas_unavailable", "capacity_cpu_cores", "capacity_memory_bytes", "capacity_pods", "allocatable_pods", "allocatable_cpu_cores", "allocatable_memory_bytes", "restarts_total","resource_requests_cpu_units", "resource_requests_memory_bytes", "resource_limits_cpu_units", "resource_limits_memory_bytes"] + taginclude = ["node_name", "daemonset_name", "namespace", "deployment_name", "container_name", "namespace", "node_name"] \ No newline at end of file diff --git a/installer/conf/telegraf.conf b/installer/conf/telegraf.conf index 12a42d493..2652ae82b 100644 --- a/installer/conf/telegraf.conf +++ b/installer/conf/telegraf.conf @@ -22,6 +22,13 @@ # user = "$USER" # cluster = "$ACS_RESOURCE_NAME" #node = $NODE_IP + AgentVersion = "$AGENT_VERSION" + AKS_RESOURCE_ID = "$AKS_RESOURCE_ID" + Region = "$AKS_REGION" + ClusterName = "$AKS_CLUSTER_NAME" + ClusterType = "AKS" + Computer = "placeholder_hostname" + ControllerType = "$CONTROLLER_TYPE" # Configuration for telegraf agent @@ -115,8 +122,28 @@ #azure_client_secret = "placeholder_azure_client_secret" + #namepass = ["nodes", "pods", "containers","prometheus"] + namedrop = ["filestat"] + tagdrop = ["AgentVersion","AKS_RESOURCE_ID","Region","ClusterName","ClusterType", "Computer", "ControllerType"] +[[outputs.application_insights]] + ## Instrumentation key of the Application Insights resource. + instrumentation_key = "$APPLICATIONINSIGHTS_KEY" + ## Timeout for closing (default: 5s). + # timeout = "5s" + + ## Enable additional diagnostic logging. 
+ # enable_diagnostic_logging = false + + ## Context Tag Sources add Application Insights context tags to a tag value. + ## + ## For list of allowed context tag keys see: + ## https://github.com/Microsoft/ApplicationInsights-Go/blob/master/appinsights/contracts/contexttagkeys.go + # [outputs.application_insights.context_tag_sources] + # "ai.cloud.role" = "kubernetes_container_name" + # "ai.cloud.roleInstance" = "kubernetes_pod_name" + namepass = ["filestat"] ############################################################################### # PROCESSOR PLUGINS # @@ -252,9 +279,6 @@ [[processors.rename.replace]] field = "used" dest = "diskUsedBytes" - [[processors.rename.replace]] - field = "total" - dest = "diskTotalBytes" [[processors.rename.replace]] field = "used_percent" dest = "diskUsedPercentage" @@ -273,21 +297,12 @@ [[processors.rename.replace]] field = "err_out" dest = "networkErrorsOutTotal" - [[processors.rename.replace]] - measurement = "diskio" - dest = "nodes" - [[processors.rename.replace]] - field = "iops_in_progress" - dest = "diskIopsInProgress" [[processors.rename.replace]] measurement = "kubernetes_pod_volume" dest = "pods" [[processors.rename.replace]] field = "used_bytes" dest = "podVolumeUsedBytes" - [[processors.rename.replace]] - field = "capacity_bytes" - dest = "podVolumeCapacityBytes" [[processors.rename.replace]] field = "available_bytes" dest = "podVolumeAvailableBytes" @@ -306,6 +321,7 @@ [[processors.rename.replace]] measurement = "docker_container_status" dest = "containers" + # ## Convert a tag value to uppercase # # [[processors.strings.uppercase]] # # tag = "method" @@ -481,8 +497,8 @@ ## Ignore mount points by filesystem type. 
ignore_fs = ["tmpfs", "devtmpfs", "devfs", "overlay", "aufs", "squashfs"] - fieldpass = ["free", "used", "total", "used_percent"] - taginclude = ["device","fstype","mode","path","host"] + fieldpass = ["free", "used", "used_percent"] + taginclude = ["device","path","host"] # Read metrics about memory usage @@ -508,33 +524,6 @@ fieldpass = ["err_in", "err_out"] taginclude = ["interface","host"] -# Read metrics about disk IO by device -[[inputs.diskio]] - ## By default, telegraf will gather stats for all devices including - ## disk partitions. - ## Setting devices will restrict the stats to the specified devices. - # devices = ["sda", "sdb", "vd*"] - ## Uncomment the following line if you need disk serial numbers. - # skip_serial_number = false - # - ## On systems which support it, device metadata can be added in the form of - ## tags. - ## Currently only Linux is supported via udev properties. You can view - ## available properties for a device by running: - ## 'udevadm info -q property -n /dev/sda' - # device_tags = ["ID_FS_TYPE", "ID_FS_USAGE"] - # - ## Using the same metadata source as device_tags, you can also customize the - ## name of the device via templates. - ## The 'name_templates' parameter is a list of templates to try and apply to - ## the device. The template may contain variables in the form of '$PROPERTY' or - ## '${PROPERTY}'. The first template which does not contain any variables not - ## present for the device is used as the device name tag. - ## The typical use case is for LVM volumes, to get the VG/LV name instead of - ## the near-meaningless DM-0 name. 
- # name_templates = ["$ID_FS_LABEL","$DM_VG_NAME/$DM_LV_NAME"] - fieldpass = ["iops_in_progress"] - taginclude = ["name","host"] # Read metrics from the kubernetes kubelet api [[inputs.kubernetes]] ## URL for the kubelet @@ -553,7 +542,7 @@ # tls_key = /path/to/keyfile ## Use TLS but skip chain & host verification # insecure_skip_verify = false - fieldpass = ["used_bytes", "capacity_bytes", "available_bytes", "tx_errors", "rx_errors" ] + fieldpass = ["used_bytes", "available_bytes", "tx_errors", "rx_errors" ] taginclude = ["volume_name","host","namespace","pod_name"] # Read metrics about docker containers [[inputs.docker]] @@ -600,5 +589,12 @@ # tls_key = "/etc/telegraf/key.pem" ## Use TLS but skip chain & host verification # insecure_skip_verify = false - fieldpass = ["n_containers", "n_containers_running", "n_containers_stopped", "n_containers_paused", "n_images" ,"oomkilled", "exitcode" ] - taginclude = ["host", "container_name", "container_status", "container_image"] \ No newline at end of file + fieldpass = ["n_containers", "n_containers_running", "n_containers_stopped", "n_containers_paused", "n_images" ,"oomkilled" ] + taginclude = ["host", "container_name", "container_status", "container_image"] +[[inputs.filestat]] + ## Files to gather stats about. + ## These accept standard unix glob matching rules, but with the addition of + ## ** as a "super asterisk". See https://github.com/gobwas/glob. + files = ["/var/opt/microsoft/docker-cimprov/log/telegraf.log"] + ## If true, read the entire file and calculate an md5 checksum. 
+ md5 = false diff --git a/installer/datafiles/base_container.data b/installer/datafiles/base_container.data index 155f5ef8c..f3075c64d 100644 --- a/installer/datafiles/base_container.data +++ b/installer/datafiles/base_container.data @@ -92,6 +92,7 @@ MAINTAINER: 'Microsoft Corporation' /etc/opt/microsoft/docker-cimprov/td-agent-bit.conf; installer/conf/td-agent-bit.conf; 644; root; root /etc/opt/microsoft/docker-cimprov/out_oms.conf; installer/conf/out_oms.conf; 644; root; root /etc/opt/microsoft/docker-cimprov/telegraf.conf; installer/conf/telegraf.conf; 644; root; root +/etc/opt/microsoft/docker-cimprov/telegraf-rs.conf; installer/conf/telegraf-rs.conf; 644; root; root /etc/opt/microsoft/docker-cimprov/custom_metrics_regions.conf; installer/conf/custom_metrics_regions.conf; 644; root; root %Links From 185c4cafc867f34073a6bc4b2fc8f80bbc6e93cf Mon Sep 17 00:00:00 2001 From: Vishwanath Narasimhan Date: Thu, 7 Feb 2019 23:42:07 -0800 Subject: [PATCH 17/38] fix oomkilled --- installer/conf/telegraf.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/installer/conf/telegraf.conf b/installer/conf/telegraf.conf index 2652ae82b..6dfcdaeac 100644 --- a/installer/conf/telegraf.conf +++ b/installer/conf/telegraf.conf @@ -589,7 +589,7 @@ # tls_key = "/etc/telegraf/key.pem" ## Use TLS but skip chain & host verification # insecure_skip_verify = false - fieldpass = ["n_containers", "n_containers_running", "n_containers_stopped", "n_containers_paused", "n_images" ,"oomkilled" ] + fieldpass = ["n_containers", "n_containers_running", "n_containers_stopped", "n_containers_paused", "n_images"] taginclude = ["host", "container_name", "container_status", "container_image"] [[inputs.filestat]] ## Files to gather stats about. 
From e76ef3af26dc88be2cc71bfda8f647a1430a4e26 Mon Sep 17 00:00:00 2001 From: Vishwanath Narasimhan Date: Sun, 10 Feb 2019 01:53:02 -0800 Subject: [PATCH 18/38] Add telegraf 403 metric telemetry & non 403 trace telemetry --- installer/conf/td-agent-bit-rs.conf | 25 ++++++++++++++++++++ installer/conf/td-agent-bit.conf | 18 ++++++++++++--- installer/conf/telegraf-rs.conf | 29 ++++++++++++++++++++---- installer/conf/telegraf.conf | 28 +++++++++++++++++++---- installer/datafiles/base_container.data | 2 ++ installer/scripts/Telegraf403Telemery.sh | 3 +++ source/code/go/src/plugins/out_oms.go | 12 ++++++---- source/code/go/src/plugins/telemetry.go | 6 +++-- 8 files changed, 105 insertions(+), 18 deletions(-) create mode 100644 installer/conf/td-agent-bit-rs.conf create mode 100644 installer/scripts/Telegraf403Telemery.sh diff --git a/installer/conf/td-agent-bit-rs.conf b/installer/conf/td-agent-bit-rs.conf new file mode 100644 index 000000000..19239708b --- /dev/null +++ b/installer/conf/td-agent-bit-rs.conf @@ -0,0 +1,25 @@ +[SERVICE] + Flush 30 + Log_Level info + Parsers_File /etc/td-agent-bit/parsers.conf + Log_File /var/opt/microsoft/docker-cimprov/log/fluent-bit.log + +[INPUT] + Name tail + Tag oms.container.log.telegraf.err.* + Path /var/opt/microsoft/docker-cimprov/log/telegraf.log + DB /var/opt/microsoft/docker-cimprov/state/telegraf-log-state.db + Mem_Buf_Limit 30m + Path_Key filepath + Skip_Long_Lines On + +[FILTER] + Name grep + Match oms.container.log.telegraf.err.* + Regex log /^(?:(?!\[azure_monitor\]: failed to write batch: \[403\] 403 Forbidden).)*$/ + +[OUTPUT] + Name oms + EnableTelemetry true + TelemetryPushIntervalSeconds 300 + Match oms.container.log.* \ No newline at end of file diff --git a/installer/conf/td-agent-bit.conf b/installer/conf/td-agent-bit.conf index 29c98bdf1..9771a4c96 100644 --- a/installer/conf/td-agent-bit.conf +++ b/installer/conf/td-agent-bit.conf @@ -23,10 +23,22 @@ Path_Key filepath Skip_Long_Lines On +[INPUT] + Name tail + 
Tag oms.container.log.telegraf.err.* + Path /var/opt/microsoft/docker-cimprov/log/telegraf.log + DB /var/opt/microsoft/docker-cimprov/state/telegraf-log-state.db + Mem_Buf_Limit 30m + Path_Key filepath + Skip_Long_Lines On + +[FILTER] + Name grep + Match oms.container.log.telegraf.err.* + Regex log /^(?:(?!\[azure_monitor\]: failed to write batch: \[403\] 403 Forbidden).)*$/ + [OUTPUT] Name oms EnableTelemetry true TelemetryPushIntervalSeconds 300 - Match oms.container.log.* - AgentVersion ciprod01092019 - + Match oms.container.log.* \ No newline at end of file diff --git a/installer/conf/telegraf-rs.conf b/installer/conf/telegraf-rs.conf index fc8abfe26..85d2d008b 100644 --- a/installer/conf/telegraf-rs.conf +++ b/installer/conf/telegraf-rs.conf @@ -74,9 +74,9 @@ ## Logging configuration: ## Run telegraf with debug log messages. - debug = true + debug = false ## Run telegraf in quiet mode (error log messages only). - quiet = false + quiet = true ## Specify the log file name. The empty string means to log to stderr. logfile = "/var/opt/microsoft/docker-cimprov/log/telegraf.log" @@ -122,7 +122,7 @@ #azure_client_secret = "placeholder_azure_client_secret" #namepass = ["nodes", "pods", "containers","prometheus"] - namedrop = ["filestat"] + namedrop = ["filestat", "telegraf_telemetry"] tagdrop = ["AgentVersion","AKS_RESOURCE_ID","Region","ClusterName","ClusterType", "Computer", "ControllerType"] [[outputs.application_insights]] @@ -142,7 +142,7 @@ # [outputs.application_insights.context_tag_sources] # "ai.cloud.role" = "kubernetes_container_name" # "ai.cloud.roleInstance" = "kubernetes_pod_name" - namepass = ["filestat"] + namepass = ["filestat", "telegraf_telemetry"] ############################################################################### # PROCESSOR PLUGINS # @@ -486,6 +486,7 @@ # Read stats about given file(s) [[inputs.filestat]] ## Files to gather stats about. 
+ interval = "15m" ## These accept standard unix glob matching rules, but with the addition of ## ** as a "super asterisk". See https://github.com/gobwas/glob. files = ["/var/opt/microsoft/docker-cimprov/log/telegraf.log"] @@ -525,4 +526,22 @@ insecure_skip_verify = true fieldpass = ["current_number_scheduled", "desired_number_scheduled", "number_available", "number_unavailable", "number_ready", "replicas_available", "replicas_unavailable", "capacity_cpu_cores", "capacity_memory_bytes", "capacity_pods", "allocatable_pods", "allocatable_cpu_cores", "allocatable_memory_bytes", "restarts_total","resource_requests_cpu_units", "resource_requests_memory_bytes", "resource_limits_cpu_units", "resource_limits_memory_bytes"] - taginclude = ["node_name", "daemonset_name", "namespace", "deployment_name", "container_name", "namespace", "node_name"] \ No newline at end of file + taginclude = ["node_name", "daemonset_name", "namespace", "deployment_name", "container_name", "namespace", "node_name"] +[[inputs.exec]] + ## Commands array + interval = "15m" + commands = [ + "/opt/microsoft/docker-cimprov/bin/Telegraf403Telemetry.sh" + ] + + ## Timeout for each command to complete. + timeout = "15s" + + ## measurement name suffix (for separating different commands) + name_suffix = "_telemetry" + + ## Data format to consume. + ## Each data format has its own unique set of configuration options, read + ## more about them here: + ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md + data_format = "influx" \ No newline at end of file diff --git a/installer/conf/telegraf.conf b/installer/conf/telegraf.conf index 6dfcdaeac..e35bb1cd0 100644 --- a/installer/conf/telegraf.conf +++ b/installer/conf/telegraf.conf @@ -75,9 +75,9 @@ ## Logging configuration: ## Run telegraf with debug log messages. - debug = true + debug = false ## Run telegraf in quiet mode (error log messages only). - quiet = false + quiet = true ## Specify the log file name. 
The empty string means to log to stderr. logfile = "/var/opt/microsoft/docker-cimprov/log/telegraf.log" @@ -123,7 +123,7 @@ #azure_client_secret = "placeholder_azure_client_secret" #namepass = ["nodes", "pods", "containers","prometheus"] - namedrop = ["filestat"] + namedrop = ["filestat", "telegraf_telemetry"] tagdrop = ["AgentVersion","AKS_RESOURCE_ID","Region","ClusterName","ClusterType", "Computer", "ControllerType"] [[outputs.application_insights]] @@ -143,7 +143,7 @@ # [outputs.application_insights.context_tag_sources] # "ai.cloud.role" = "kubernetes_container_name" # "ai.cloud.roleInstance" = "kubernetes_pod_name" - namepass = ["filestat"] + namepass = ["filestat", "telegraf_telemetry"] ############################################################################### # PROCESSOR PLUGINS # @@ -593,8 +593,28 @@ taginclude = ["host", "container_name", "container_status", "container_image"] [[inputs.filestat]] ## Files to gather stats about. + interval = "15m" ## These accept standard unix glob matching rules, but with the addition of ## ** as a "super asterisk". See https://github.com/gobwas/glob. files = ["/var/opt/microsoft/docker-cimprov/log/telegraf.log"] ## If true, read the entire file and calculate an md5 checksum. md5 = false +[[inputs.exec]] + ## Commands array + interval = "15m" + commands = [ + "/opt/microsoft/docker-cimprov/bin/Telegraf403Telemetry.sh" + ] + + ## Timeout for each command to complete. + timeout = "15s" + + ## measurement name suffix (for separating different commands) + name_suffix = "_telemetry" + + ## Data format to consume. 
+ ## Each data format has its own unique set of configuration options, read + ## more about them here: + ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md + data_format = "influx" + diff --git a/installer/datafiles/base_container.data b/installer/datafiles/base_container.data index f3075c64d..e7a3323aa 100644 --- a/installer/datafiles/base_container.data +++ b/installer/datafiles/base_container.data @@ -90,10 +90,12 @@ MAINTAINER: 'Microsoft Corporation' /opt/td-agent-bit/bin/out_oms.so; intermediate/${{BUILD_CONFIGURATION}}/out_oms.so; 755; root; root /etc/opt/microsoft/docker-cimprov/td-agent-bit.conf; installer/conf/td-agent-bit.conf; 644; root; root +/etc/opt/microsoft/docker-cimprov/td-agent-bit-rs.conf; installer/conf/td-agent-bit-rs.conf; 644; root; root /etc/opt/microsoft/docker-cimprov/out_oms.conf; installer/conf/out_oms.conf; 644; root; root /etc/opt/microsoft/docker-cimprov/telegraf.conf; installer/conf/telegraf.conf; 644; root; root /etc/opt/microsoft/docker-cimprov/telegraf-rs.conf; installer/conf/telegraf-rs.conf; 644; root; root /etc/opt/microsoft/docker-cimprov/custom_metrics_regions.conf; installer/conf/custom_metrics_regions.conf; 644; root; root +/opt/microsoft/docker-cimprov/bin/Telegraf403Telemetry.sh; installer/scripts/Telegraf403Telemetry.sh; 755; root; root %Links /opt/omi/lib/libcontainer.${{SHLIB_EXT}}; /opt/microsoft/docker-cimprov/lib/libcontainer.${{SHLIB_EXT}}; 644; root; root diff --git a/installer/scripts/Telegraf403Telemery.sh b/installer/scripts/Telegraf403Telemery.sh new file mode 100644 index 000000000..3754538c6 --- /dev/null +++ b/installer/scripts/Telegraf403Telemery.sh @@ -0,0 +1,3 @@ +#!/bin/sh +count403=$(grep -iF "[azure_monitor]: failed to write batch: [403] 403 Forbidden" /var/opt/microsoft/docker-cimprov/log/telegraf.log | wc -l) +echo "telegraf,AKS_RESOURCE_ID=$AKS_RESOURCE_ID, 403count=$count403" \ No newline at end of file diff --git a/source/code/go/src/plugins/out_oms.go 
b/source/code/go/src/plugins/out_oms.go index 133e0f039..7747fc7bb 100644 --- a/source/code/go/src/plugins/out_oms.go +++ b/source/code/go/src/plugins/out_oms.go @@ -2,11 +2,13 @@ package main import ( "github.com/fluent/fluent-bit-go/output" + "github.com/Microsoft/ApplicationInsights-Go/appinsights" ) import ( "C" "strings" "unsafe" + "os" ) //export FLBPluginRegister @@ -19,7 +21,7 @@ func FLBPluginRegister(ctx unsafe.Pointer) int { // ctx (context) pointer to fluentbit context (state/ c code) func FLBPluginInit(ctx unsafe.Pointer) int { Log("Initializing out_oms go plugin for fluentbit") - agentVersion := output.FLBPluginConfigKey(ctx, "AgentVersion") + agentVersion := os.Getenv("AGENT_VERSION") InitializePlugin(ContainerLogPluginConfFilePath, agentVersion) enableTelemetry := output.FLBPluginConfigKey(ctx, "EnableTelemetry") if strings.Compare(strings.ToLower(enableTelemetry), "true") == 0 { @@ -51,9 +53,11 @@ func FLBPluginFlush(data unsafe.Pointer, length C.int, tag *C.char) int { records = append(records, record) } - incomingTag := C.GoString(tag) - if strings.Contains(strings.ToLower(incomingTag), "oms.container.log.flbplugin") { - return PushToAppInsightsTraces(records) + incomingTag := strings.ToLower(C.GoString(tag)) + if strings.Contains(incomingTag, "oms.container.log.flbplugin") { + return PushToAppInsightsTraces(records, appinsights.Information, incomingTag) + } else if strings.Contains(incomingTag, "oms.container.log.telegraf.err") { + return PushToAppInsightsTraces(records, appinsights.Error, incomingTag) } return PostDataHelper(records) diff --git a/source/code/go/src/plugins/telemetry.go b/source/code/go/src/plugins/telemetry.go index 82f970d3a..9e8dd057c 100644 --- a/source/code/go/src/plugins/telemetry.go +++ b/source/code/go/src/plugins/telemetry.go @@ -159,13 +159,15 @@ func InitializeTelemetryClient(agentVersion string) (int, error) { } // PushToAppInsightsTraces sends the log lines as trace messages to the configured App Insights Instance 
-func PushToAppInsightsTraces(records []map[interface{}]interface{}) int { +func PushToAppInsightsTraces(records []map[interface{}]interface{}, severityLevel int, tag string) int { var logLines []string for _, record := range records { logLines = append(logLines, ToString(record["log"])) } traceEntry := strings.Join(logLines, "\n") - TelemetryClient.TrackTrace(traceEntry, 1) + traceTelemetryItem := appinsights.NewTraceTelemetry(traceEntry, severityLevel) + traceTelemetryItem.Properties["tag"] = tag + TelemetryClient.Track(traceTelemetryItem) return output.FLB_OK } From 57a2797cb24b021e3bdda118b24f645eec198c50 Mon Sep 17 00:00:00 2001 From: Vishwanath Narasimhan Date: Sun, 10 Feb 2019 02:01:00 -0800 Subject: [PATCH 19/38] fix type --- source/code/go/src/plugins/telemetry.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/code/go/src/plugins/telemetry.go b/source/code/go/src/plugins/telemetry.go index 9e8dd057c..03a787354 100644 --- a/source/code/go/src/plugins/telemetry.go +++ b/source/code/go/src/plugins/telemetry.go @@ -159,7 +159,7 @@ func InitializeTelemetryClient(agentVersion string) (int, error) { } // PushToAppInsightsTraces sends the log lines as trace messages to the configured App Insights Instance -func PushToAppInsightsTraces(records []map[interface{}]interface{}, severityLevel int, tag string) int { +func PushToAppInsightsTraces(records []map[interface{}]interface{}, severityLevel contracts.SeverityLevel, tag string) int { var logLines []string for _, record := range records { logLines = append(logLines, ToString(record["log"])) From 68db7f3cf4684dde41286afa81b29b0122195460 Mon Sep 17 00:00:00 2001 From: Vishwanath Narasimhan Date: Sun, 10 Feb 2019 02:04:47 -0800 Subject: [PATCH 20/38] fix package --- source/code/go/src/plugins/telemetry.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/code/go/src/plugins/telemetry.go b/source/code/go/src/plugins/telemetry.go index 03a787354..0b2feec59 100644 --- 
a/source/code/go/src/plugins/telemetry.go +++ b/source/code/go/src/plugins/telemetry.go @@ -159,7 +159,7 @@ func InitializeTelemetryClient(agentVersion string) (int, error) { } // PushToAppInsightsTraces sends the log lines as trace messages to the configured App Insights Instance -func PushToAppInsightsTraces(records []map[interface{}]interface{}, severityLevel contracts.SeverityLevel, tag string) int { +func PushToAppInsightsTraces(records []map[interface{}]interface{}, severityLevel appinsights.contracts.SeverityLevel, tag string) int { var logLines []string for _, record := range records { logLines = append(logLines, ToString(record["log"])) From 1d2bd74e6da4b16ffdde0c73b2317ec4d38f8275 Mon Sep 17 00:00:00 2001 From: Vishwanath Narasimhan Date: Sun, 10 Feb 2019 02:09:31 -0800 Subject: [PATCH 21/38] fix package import --- source/code/go/src/plugins/telemetry.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/source/code/go/src/plugins/telemetry.go b/source/code/go/src/plugins/telemetry.go index 0b2feec59..acf97042b 100644 --- a/source/code/go/src/plugins/telemetry.go +++ b/source/code/go/src/plugins/telemetry.go @@ -9,6 +9,7 @@ import ( "time" "github.com/Microsoft/ApplicationInsights-Go/appinsights" + "github.com/Microsoft/ApplicationInsights-Go/appinsights/contracts" "github.com/fluent/fluent-bit-go/output" ) @@ -159,7 +160,7 @@ func InitializeTelemetryClient(agentVersion string) (int, error) { } // PushToAppInsightsTraces sends the log lines as trace messages to the configured App Insights Instance -func PushToAppInsightsTraces(records []map[interface{}]interface{}, severityLevel appinsights.contracts.SeverityLevel, tag string) int { +func PushToAppInsightsTraces(records []map[interface{}]interface{}, severityLevel contracts.SeverityLevel, tag string) int { var logLines []string for _, record := range records { logLines = append(logLines, ToString(record["log"])) From 9fa30b7dad1de8a0d473693eae8108b971caa4f4 Mon Sep 17 00:00:00 2001 From: 
Vishwanath Narasimhan Date: Sun, 10 Feb 2019 02:16:24 -0800 Subject: [PATCH 22/38] fix filename --- installer/scripts/Telegraf403Telemetry.sh | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 installer/scripts/Telegraf403Telemetry.sh diff --git a/installer/scripts/Telegraf403Telemetry.sh b/installer/scripts/Telegraf403Telemetry.sh new file mode 100644 index 000000000..3754538c6 --- /dev/null +++ b/installer/scripts/Telegraf403Telemetry.sh @@ -0,0 +1,3 @@ +#!/bin/sh +count403=$(grep -iF "[azure_monitor]: failed to write batch: [403] 403 Forbidden" /var/opt/microsoft/docker-cimprov/log/telegraf.log | wc -l) +echo "telegraf,AKS_RESOURCE_ID=$AKS_RESOURCE_ID, 403count=$count403" \ No newline at end of file From 7979e7c5e6887e3a52f85936303970b87e7c4125 Mon Sep 17 00:00:00 2001 From: Vishwanath Narasimhan Date: Sun, 10 Feb 2019 09:18:10 -0800 Subject: [PATCH 23/38] delete unused file --- installer/scripts/Telegraf403Telemery.sh | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 installer/scripts/Telegraf403Telemery.sh diff --git a/installer/scripts/Telegraf403Telemery.sh b/installer/scripts/Telegraf403Telemery.sh deleted file mode 100644 index 3754538c6..000000000 --- a/installer/scripts/Telegraf403Telemery.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/sh -count403=$(grep -iF "[azure_monitor]: failed to write batch: [403] 403 Forbidden" /var/opt/microsoft/docker-cimprov/log/telegraf.log | wc -l) -echo "telegraf,AKS_RESOURCE_ID=$AKS_RESOURCE_ID, 403count=$count403" \ No newline at end of file From 95e7b9945e74298157ee51f30cce9eafe00bfad8 Mon Sep 17 00:00:00 2001 From: Vishwanath Narasimhan Date: Sun, 10 Feb 2019 12:31:31 -0800 Subject: [PATCH 24/38] conf file for rs; fix 403counttotal metric for telegraf, remove host and use nodeName consistently, rename metrics --- installer/conf/out_oms-rs.conf | 6 ++ installer/conf/telegraf-rs.conf | 88 +++++++++++++++++++++-- installer/conf/telegraf.conf | 39 ++++++++-- installer/datafiles/base_container.data | 1 + 
installer/scripts/Telegraf403Telemetry.sh | 4 +- source/code/go/src/plugins/oms.go | 3 +- source/code/go/src/plugins/out_oms.go | 8 ++- 7 files changed, 132 insertions(+), 17 deletions(-) create mode 100644 installer/conf/out_oms-rs.conf diff --git a/installer/conf/out_oms-rs.conf b/installer/conf/out_oms-rs.conf new file mode 100644 index 000000000..e3a32a526 --- /dev/null +++ b/installer/conf/out_oms-rs.conf @@ -0,0 +1,6 @@ +omsadmin_conf_path=/etc/opt/microsoft/omsagent/conf/omsadmin.conf +cert_file_path=/etc/opt/microsoft/omsagent/certs/oms.crt +key_file_path=/etc/opt/microsoft/omsagent/certs/oms.key +container_host_file_path=/var/opt/microsoft/docker-cimprov/state/containerhostname +container_inventory_refresh_interval=86400 +kube_system_containers_refresh_interval=86400 diff --git a/installer/conf/telegraf-rs.conf b/installer/conf/telegraf-rs.conf index 85d2d008b..b01f380e2 100644 --- a/installer/conf/telegraf-rs.conf +++ b/installer/conf/telegraf-rs.conf @@ -275,16 +275,91 @@ [[processors.rename.replace]] measurement = "kubernetes_deployment" dest = "deployments" + [[processors.rename.replace]] + measurement = "kubernetes_deployment" + dest = "statefulsets" [[processors.rename.replace]] measurement = "kubernetes_node" dest = "nodes" - [[processors.rename.replace]] - tag = "node_name" - dest = "host" [[processors.rename.replace]] measurement = "kubernetes_pod_container" dest = "containers" - + [[processors.rename.replace]] + field = "current_number_scheduled" + dest = "currentNumberScheduled" + [[processors.rename.replace]] + field = "desired_number_scheduled" + dest = "desiredNumberScheduled" + [[processors.rename.replace]] + field = "number_available" + dest = "numberAvailable" + [[processors.rename.replace]] + field = "number_unavailable" + dest = "numUnavailable" + [[processors.rename.replace]] + field = "number_ready" + dest = "numReady" + [[processors.rename.replace]] + field = "replicas_available" + dest = "numReplicasAvailable" + 
[[processors.rename.replace]] + field = "replicas_unavailable" + dest = "numReplicasUnavailable" + [[processors.rename.replace]] + field = "capacity_cpu_cores" + dest = "capacityCpuCores" + [[processors.rename.replace]] + field = "capacity_memory_bytes" + dest = "capacityMemoryBytes" + [[processors.rename.replace]] + field = "capacity_pods" + dest = "capacityNumPods" + [[processors.rename.replace]] + field = "allocatable_pods" + dest = "allocatableNumPods" + [[processors.rename.replace]] + field = "allocatable_cpu_cores" + dest = "allocatableCpuCores" + [[processors.rename.replace]] + field = "allocatable_memory_bytes" + dest = "allocatableMemoryBytes" + [[processors.rename.replace]] + field = "restarts_total" + dest = "restartsTotal" + [[processors.rename.replace]] + field = "resource_requests_cpu_units" + dest = "resourceRequestsCpuUnits" + [[processors.rename.replace]] + field = "resource_requests_memory_bytes" + dest = "resourceRequestsMemoryBytes" + [[processors.rename.replace]] + field = "resource_limits_cpu_units" + dest = "resourceLimitsCpuUnits" + [[processors.rename.replace]] + field = "resource_limits_memory_bytes" + dest = "resourceLimitsMemoryBytes" + [[processors.rename.replace]] + field = "spec_replicas" + dest = "numSpecReplicas" + [[processors.rename.replace]] + field = "replicas_current" + dest = "numCurrentReplicas" + [[processors.rename.replace]] + field = "replicas_ready" + dest = "numReadyReplicas" + [[processors.rename.replace]] + tag = "daemonset_name" + dest = "daemonsetName" + [[processors.rename.replace]] + tag = "deployment_name" + dest = "deploymentName" + [[processors.rename.replace]] + tag = "container_name" + dest = "containerName" + [[processors.rename.replace]] + tag = "node_name" + dest = "nodeName" + # ## Convert a tag value to uppercase # # [[processors.strings.uppercase]] # # tag = "method" @@ -525,8 +600,9 @@ ## Use TLS but skip chain & host verification insecure_skip_verify = true - fieldpass = ["current_number_scheduled", 
"desired_number_scheduled", "number_available", "number_unavailable", "number_ready", "replicas_available", "replicas_unavailable", "capacity_cpu_cores", "capacity_memory_bytes", "capacity_pods", "allocatable_pods", "allocatable_cpu_cores", "allocatable_memory_bytes", "restarts_total","resource_requests_cpu_units", "resource_requests_memory_bytes", "resource_limits_cpu_units", "resource_limits_memory_bytes"] - taginclude = ["node_name", "daemonset_name", "namespace", "deployment_name", "container_name", "namespace", "node_name"] + namepass = ["kubernetes_daemonset", "kubernetes_deployment", "kubernetes_node", "kubernetes_pod_container", "kubernetes_statefulset"] + fieldpass = ["current_number_scheduled", "desired_number_scheduled", "number_available", "number_unavailable", "number_ready", "replicas_available", "replicas_unavailable", "capacity_cpu_cores", "capacity_memory_bytes", "capacity_pods", "allocatable_pods", "allocatable_cpu_cores", "allocatable_memory_bytes", "restarts_total","resource_requests_cpu_units", "resource_requests_memory_bytes", "resource_limits_cpu_units", "resource_limits_memory_bytes" , "spec_replicas", "replicas_current", "replicas_ready"] + taginclude = ["nodeName", "daemonset_name", "namespace", "deployment_name", "container_name", "namespace", "node_name"] [[inputs.exec]] ## Commands array interval = "15m" diff --git a/installer/conf/telegraf.conf b/installer/conf/telegraf.conf index e35bb1cd0..a9b27993c 100644 --- a/installer/conf/telegraf.conf +++ b/installer/conf/telegraf.conf @@ -30,6 +30,8 @@ Computer = "placeholder_hostname" ControllerType = "$CONTROLLER_TYPE" + nodeName = "placeholder_hostname" + # Configuration for telegraf agent [agent] @@ -82,9 +84,9 @@ logfile = "/var/opt/microsoft/docker-cimprov/log/telegraf.log" ## Override default hostname, if empty use os.Hostname() - hostname = "placeholder_hostname" + #hostname = "placeholder_hostname" ## If set to true, do no set the "host" tag in the telegraf agent. 
- omit_hostname = false + omit_hostname = true ############################################################################### @@ -144,6 +146,7 @@ # "ai.cloud.role" = "kubernetes_container_name" # "ai.cloud.roleInstance" = "kubernetes_pod_name" namepass = ["filestat", "telegraf_telemetry"] + tagDrop = ["nodeName"] ############################################################################### # PROCESSOR PLUGINS # @@ -315,13 +318,34 @@ [[processors.rename.replace]] field = "rx_errors" dest = "podNetworkRxErrorsTotal" + [[processors.rename.replace]] + tag = "volume_name" + dest = "volumeName" + [[processors.rename.replace]] + tag = "pod_name" + dest = "podName" [[processors.rename.replace]] measurement = "docker" dest = "containers" [[processors.rename.replace]] measurement = "docker_container_status" dest = "containers" - + [[processors.rename.replace]] + field = "n_containers" + dest = "numContainers" + [[processors.rename.replace]] + field = "n_containers_running" + dest = "numContainersRunning" + [[processors.rename.replace]] + field = "n_containers_stopped" + dest = "numContainersStopped" + [[processors.rename.replace]] + field = "n_containers_paused" + dest = "numContainersPaused" + [[processors.rename.replace]] + field = "n_images" + dest = "numContainerImages" + # ## Convert a tag value to uppercase # # [[processors.strings.uppercase]] # # tag = "method" @@ -498,7 +522,7 @@ ## Ignore mount points by filesystem type. 
ignore_fs = ["tmpfs", "devtmpfs", "devfs", "overlay", "aufs", "squashfs"] fieldpass = ["free", "used", "used_percent"] - taginclude = ["device","path","host"] + taginclude = ["device","path","nodeName"] # Read metrics about memory usage @@ -522,7 +546,7 @@ ## #fieldpass = ["bytes_recv", "bytes_sent", "err_in", "err_out"] fieldpass = ["err_in", "err_out"] - taginclude = ["interface","host"] + taginclude = ["interface","nodeName"] # Read metrics from the kubernetes kubelet api [[inputs.kubernetes]] @@ -543,7 +567,7 @@ ## Use TLS but skip chain & host verification # insecure_skip_verify = false fieldpass = ["used_bytes", "available_bytes", "tx_errors", "rx_errors" ] - taginclude = ["volume_name","host","namespace","pod_name"] + taginclude = ["volume_name","nodeName","namespace","pod_name"] # Read metrics about docker containers [[inputs.docker]] ## Docker Endpoint @@ -590,7 +614,8 @@ ## Use TLS but skip chain & host verification # insecure_skip_verify = false fieldpass = ["n_containers", "n_containers_running", "n_containers_stopped", "n_containers_paused", "n_images"] - taginclude = ["host", "container_name", "container_status", "container_image"] + #fieldpass = ["numContainers", "numContainersRunning", "numContainersStopped", "numContainersPaused", "numContainerImages"] + taginclude = ["nodeName"] [[inputs.filestat]] ## Files to gather stats about. 
interval = "15m" diff --git a/installer/datafiles/base_container.data b/installer/datafiles/base_container.data index e7a3323aa..e6e6401d2 100644 --- a/installer/datafiles/base_container.data +++ b/installer/datafiles/base_container.data @@ -92,6 +92,7 @@ MAINTAINER: 'Microsoft Corporation' /etc/opt/microsoft/docker-cimprov/td-agent-bit.conf; installer/conf/td-agent-bit.conf; 644; root; root /etc/opt/microsoft/docker-cimprov/td-agent-bit-rs.conf; installer/conf/td-agent-bit-rs.conf; 644; root; root /etc/opt/microsoft/docker-cimprov/out_oms.conf; installer/conf/out_oms.conf; 644; root; root +/etc/opt/microsoft/docker-cimprov/out_oms-rs.conf; installer/conf/out_oms-rs.conf; 644; root; root /etc/opt/microsoft/docker-cimprov/telegraf.conf; installer/conf/telegraf.conf; 644; root; root /etc/opt/microsoft/docker-cimprov/telegraf-rs.conf; installer/conf/telegraf-rs.conf; 644; root; root /etc/opt/microsoft/docker-cimprov/custom_metrics_regions.conf; installer/conf/custom_metrics_regions.conf; 644; root; root diff --git a/installer/scripts/Telegraf403Telemetry.sh b/installer/scripts/Telegraf403Telemetry.sh index 3754538c6..f4476d9fd 100644 --- a/installer/scripts/Telegraf403Telemetry.sh +++ b/installer/scripts/Telegraf403Telemetry.sh @@ -1,3 +1,3 @@ #!/bin/sh -count403=$(grep -iF "[azure_monitor]: failed to write batch: [403] 403 Forbidden" /var/opt/microsoft/docker-cimprov/log/telegraf.log | wc -l) -echo "telegraf,AKS_RESOURCE_ID=$AKS_RESOURCE_ID, 403count=$count403" \ No newline at end of file +count403=$(grep -iF "[azure_monitor]: failed to write batch: [403] 403 Forbidden" /var/opt/microsoft/docker-cimprov/log/telegraf.log | wc -l | tr -d '\n') +echo "telegraf,AKS_RESOURCE_ID=$AKS_RESOURCE_ID, 403countTotal=$count403" \ No newline at end of file diff --git a/source/code/go/src/plugins/oms.go b/source/code/go/src/plugins/oms.go index 5d9269d1e..bf14ac5e6 100644 --- a/source/code/go/src/plugins/oms.go +++ b/source/code/go/src/plugins/oms.go @@ -26,7 +26,8 @@ import ( 
const DataType = "CONTAINER_LOG_BLOB" // ContainerLogPluginConfFilePath --> config file path for container log plugin -const ContainerLogPluginConfFilePath = "/etc/opt/microsoft/docker-cimprov/out_oms.conf" +const DaemonSetContainerLogPluginConfFilePath = "/etc/opt/microsoft/docker-cimprov/out_oms.conf" +const ReplicaSetSetContainerLogPluginConfFilePath = "/etc/opt/microsoft/docker-cimprov/out_oms-rs.conf" // IPName for Container Log const IPName = "Containers" diff --git a/source/code/go/src/plugins/out_oms.go b/source/code/go/src/plugins/out_oms.go index 7747fc7bb..059a93fe9 100644 --- a/source/code/go/src/plugins/out_oms.go +++ b/source/code/go/src/plugins/out_oms.go @@ -22,7 +22,13 @@ func FLBPluginRegister(ctx unsafe.Pointer) int { func FLBPluginInit(ctx unsafe.Pointer) int { Log("Initializing out_oms go plugin for fluentbit") agentVersion := os.Getenv("AGENT_VERSION") - InitializePlugin(ContainerLogPluginConfFilePath, agentVersion) + if strings.Compare(strings.ToLower(os.Getenv("CONTROLLER_TYPE")), "replicaset") == 0) { + Log("Using %s for plugin config \n", ReplicaSetContainerLogPluginConfFilePath) + InitializePlugin(ReplicaSetContainerLogPluginConfFilePath, agentVersion) + } else { + Log("Using %s for plugin config \n", DaemonSetContainerLogPluginConfFilePath) + InitializePlugin(DaemonSetContainerLogPluginConfFilePath, agentVersion) + } enableTelemetry := output.FLBPluginConfigKey(ctx, "EnableTelemetry") if strings.Compare(strings.ToLower(enableTelemetry), "true") == 0 { telemetryPushInterval := output.FLBPluginConfigKey(ctx, "TelemetryPushIntervalSeconds") From 50d8572391adff44e19edc266ae3cd08fb15c788 Mon Sep 17 00:00:00 2001 From: Vishwanath Narasimhan Date: Sun, 10 Feb 2019 12:37:01 -0800 Subject: [PATCH 25/38] fix statefulsets --- installer/conf/telegraf-rs.conf | 5 ++++- installer/conf/telegraf.conf | 10 +++++----- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/installer/conf/telegraf-rs.conf b/installer/conf/telegraf-rs.conf index 
b01f380e2..ece3152ff 100644 --- a/installer/conf/telegraf-rs.conf +++ b/installer/conf/telegraf-rs.conf @@ -356,6 +356,9 @@ [[processors.rename.replace]] tag = "container_name" dest = "containerName" + [[processors.rename.replace]] + tag = "statefulset_name" + dest = "statefulsetName" [[processors.rename.replace]] tag = "node_name" dest = "nodeName" @@ -602,7 +605,7 @@ namepass = ["kubernetes_daemonset", "kubernetes_deployment", "kubernetes_node", "kubernetes_pod_container", "kubernetes_statefulset"] fieldpass = ["current_number_scheduled", "desired_number_scheduled", "number_available", "number_unavailable", "number_ready", "replicas_available", "replicas_unavailable", "capacity_cpu_cores", "capacity_memory_bytes", "capacity_pods", "allocatable_pods", "allocatable_cpu_cores", "allocatable_memory_bytes", "restarts_total","resource_requests_cpu_units", "resource_requests_memory_bytes", "resource_limits_cpu_units", "resource_limits_memory_bytes" , "spec_replicas", "replicas_current", "replicas_ready"] - taginclude = ["nodeName", "daemonset_name", "namespace", "deployment_name", "container_name", "namespace", "node_name"] + taginclude = ["nodeName", "daemonset_name", "namespace", "deployment_name", "container_name", "namespace", "node_name","statefulset_name"] [[inputs.exec]] ## Commands array interval = "15m" diff --git a/installer/conf/telegraf.conf b/installer/conf/telegraf.conf index a9b27993c..4fd57c1b5 100644 --- a/installer/conf/telegraf.conf +++ b/installer/conf/telegraf.conf @@ -330,19 +330,19 @@ [[processors.rename.replace]] measurement = "docker_container_status" dest = "containers" - [[processors.rename.replace]] + [[processors.rename.replace]] field = "n_containers" dest = "numContainers" - [[processors.rename.replace]] + [[processors.rename.replace]] field = "n_containers_running" dest = "numContainersRunning" - [[processors.rename.replace]] + [[processors.rename.replace]] field = "n_containers_stopped" dest = "numContainersStopped" - 
[[processors.rename.replace]] + [[processors.rename.replace]] field = "n_containers_paused" dest = "numContainersPaused" - [[processors.rename.replace]] + [[processors.rename.replace]] field = "n_images" dest = "numContainerImages" From 2f8f4bfc7f8a55656f6e2c7cead3290143f3f4e8 Mon Sep 17 00:00:00 2001 From: Vishwanath Narasimhan Date: Sun, 10 Feb 2019 12:46:53 -0800 Subject: [PATCH 26/38] fix typo. --- source/code/go/src/plugins/out_oms.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/code/go/src/plugins/out_oms.go b/source/code/go/src/plugins/out_oms.go index 059a93fe9..90ecaf15a 100644 --- a/source/code/go/src/plugins/out_oms.go +++ b/source/code/go/src/plugins/out_oms.go @@ -22,7 +22,7 @@ func FLBPluginRegister(ctx unsafe.Pointer) int { func FLBPluginInit(ctx unsafe.Pointer) int { Log("Initializing out_oms go plugin for fluentbit") agentVersion := os.Getenv("AGENT_VERSION") - if strings.Compare(strings.ToLower(os.Getenv("CONTROLLER_TYPE")), "replicaset") == 0) { + if strings.Compare(strings.ToLower(os.Getenv("CONTROLLER_TYPE")), "replicaset") == 0 { Log("Using %s for plugin config \n", ReplicaSetContainerLogPluginConfFilePath) InitializePlugin(ReplicaSetContainerLogPluginConfFilePath, agentVersion) } else { From dd12b3d77f9d9a2b061867ee0fc9fa7ff6d7d0b1 Mon Sep 17 00:00:00 2001 From: Vishwanath Narasimhan Date: Sun, 10 Feb 2019 12:50:25 -0800 Subject: [PATCH 27/38] fix another typo. 
--- source/code/go/src/plugins/oms.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/code/go/src/plugins/oms.go b/source/code/go/src/plugins/oms.go index bf14ac5e6..d58a33b55 100644 --- a/source/code/go/src/plugins/oms.go +++ b/source/code/go/src/plugins/oms.go @@ -27,7 +27,7 @@ const DataType = "CONTAINER_LOG_BLOB" // ContainerLogPluginConfFilePath --> config file path for container log plugin const DaemonSetContainerLogPluginConfFilePath = "/etc/opt/microsoft/docker-cimprov/out_oms.conf" -const ReplicaSetSetContainerLogPluginConfFilePath = "/etc/opt/microsoft/docker-cimprov/out_oms-rs.conf" +const ReplicaSetContainerLogPluginConfFilePath = "/etc/opt/microsoft/docker-cimprov/out_oms-rs.conf" // IPName for Container Log const IPName = "Containers" From 49c251fa9b5571ae4945b87f5a8a3f17744f41c4 Mon Sep 17 00:00:00 2001 From: Vishwanath Narasimhan Date: Sun, 10 Feb 2019 13:07:48 -0800 Subject: [PATCH 28/38] fix telemetry --- installer/scripts/Telegraf403Telemetry.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/installer/scripts/Telegraf403Telemetry.sh b/installer/scripts/Telegraf403Telemetry.sh index f4476d9fd..cfa996dda 100644 --- a/installer/scripts/Telegraf403Telemetry.sh +++ b/installer/scripts/Telegraf403Telemetry.sh @@ -1,3 +1,3 @@ #!/bin/sh count403=$(grep -iF "[azure_monitor]: failed to write batch: [403] 403 Forbidden" /var/opt/microsoft/docker-cimprov/log/telegraf.log | wc -l | tr -d '\n') -echo "telegraf,AKS_RESOURCE_ID=$AKS_RESOURCE_ID, 403countTotal=$count403" \ No newline at end of file +echo "telegraf,AKS_RESOURCE_ID=${AKS_RESOURCE_ID}, 403countTotal=${count403}i" \ No newline at end of file From 2a3ef70cf26dacbf347ceb353d9d0ab05002c751 Mon Sep 17 00:00:00 2001 From: Vishwanath Narasimhan Date: Sun, 10 Feb 2019 13:31:22 -0800 Subject: [PATCH 29/38] fix casing issue --- installer/conf/telegraf.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/installer/conf/telegraf.conf 
b/installer/conf/telegraf.conf index 4fd57c1b5..99b271ac4 100644 --- a/installer/conf/telegraf.conf +++ b/installer/conf/telegraf.conf @@ -146,7 +146,7 @@ # "ai.cloud.role" = "kubernetes_container_name" # "ai.cloud.roleInstance" = "kubernetes_pod_name" namepass = ["filestat", "telegraf_telemetry"] - tagDrop = ["nodeName"] + tagdrop = ["nodeName"] ############################################################################### # PROCESSOR PLUGINS # From 63426d235b6afbeeae8ba1c09d0dbf364851ee88 Mon Sep 17 00:00:00 2001 From: Vishwanath Narasimhan Date: Sun, 10 Feb 2019 14:31:51 -0800 Subject: [PATCH 30/38] fix comma issue. --- installer/scripts/Telegraf403Telemetry.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/installer/scripts/Telegraf403Telemetry.sh b/installer/scripts/Telegraf403Telemetry.sh index cfa996dda..3022ceaf0 100644 --- a/installer/scripts/Telegraf403Telemetry.sh +++ b/installer/scripts/Telegraf403Telemetry.sh @@ -1,3 +1,3 @@ #!/bin/sh count403=$(grep -iF "[azure_monitor]: failed to write batch: [403] 403 Forbidden" /var/opt/microsoft/docker-cimprov/log/telegraf.log | wc -l | tr -d '\n') -echo "telegraf,AKS_RESOURCE_ID=${AKS_RESOURCE_ID}, 403countTotal=${count403}i" \ No newline at end of file +echo "telegraf,AKS_RESOURCE_ID=${AKS_RESOURCE_ID} 403countTotal=${count403}i" \ No newline at end of file From 6063f7964b6fd5ee8782dd41da4d2fb4de89eb0e Mon Sep 17 00:00:00 2001 From: Vishwanath Narasimhan Date: Sun, 10 Feb 2019 17:28:59 -0800 Subject: [PATCH 31/38] disable telemetry for rs ; fix stateful set name --- installer/conf/td-agent-bit-rs.conf | 2 +- installer/conf/telegraf-rs.conf | 2 +- source/code/go/src/plugins/telemetry.go | 4 +--- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/installer/conf/td-agent-bit-rs.conf b/installer/conf/td-agent-bit-rs.conf index 19239708b..03d97657e 100644 --- a/installer/conf/td-agent-bit-rs.conf +++ b/installer/conf/td-agent-bit-rs.conf @@ -20,6 +20,6 @@ [OUTPUT] Name oms - 
EnableTelemetry true + EnableTelemetry false TelemetryPushIntervalSeconds 300 Match oms.container.log.* \ No newline at end of file diff --git a/installer/conf/telegraf-rs.conf b/installer/conf/telegraf-rs.conf index ece3152ff..b749b5ad9 100644 --- a/installer/conf/telegraf-rs.conf +++ b/installer/conf/telegraf-rs.conf @@ -276,7 +276,7 @@ measurement = "kubernetes_deployment" dest = "deployments" [[processors.rename.replace]] - measurement = "kubernetes_deployment" + measurement = "kubernetes_statefulset" dest = "statefulsets" [[processors.rename.replace]] measurement = "kubernetes_node" diff --git a/source/code/go/src/plugins/telemetry.go b/source/code/go/src/plugins/telemetry.go index acf97042b..bbc7be5eb 100644 --- a/source/code/go/src/plugins/telemetry.go +++ b/source/code/go/src/plugins/telemetry.go @@ -33,8 +33,6 @@ var ( const ( clusterTypeACS = "ACS" clusterTypeAKS = "AKS" - controllerTypeDaemonSet = "DaemonSet" - controllerTypeReplicaSet = "ReplicaSet" envAKSResourceID = "AKS_RESOURCE_ID" envACSResourceName = "ACS_RESOURCE_NAME" envAppInsightsAuth = "APPLICATIONINSIGHTS_AUTH" @@ -125,7 +123,7 @@ func InitializeTelemetryClient(agentVersion string) (int, error) { CommonProperties = make(map[string]string) CommonProperties["Computer"] = Computer CommonProperties["WorkspaceID"] = WorkspaceID - CommonProperties["ControllerType"] = controllerTypeDaemonSet + CommonProperties["ControllerType"] = os.Getenv("CONTROLLER_TYPE") CommonProperties["AgentVersion"] = agentVersion aksResourceID := os.Getenv(envAKSResourceID) From c8965463a06a7efd044d6e2b1b095e596da12c30 Mon Sep 17 00:00:00 2001 From: Vishwanath Narasimhan Date: Mon, 11 Feb 2019 17:04:53 -0800 Subject: [PATCH 32/38] workaround for namespace fix --- installer/conf/telegraf-rs.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/installer/conf/telegraf-rs.conf b/installer/conf/telegraf-rs.conf index b749b5ad9..fea446345 100644 --- a/installer/conf/telegraf-rs.conf +++ 
b/installer/conf/telegraf-rs.conf @@ -576,7 +576,7 @@ url = "$K8SSERVICEHOST" ## Namespace to use - # namespace = "default" + namespace = "" ## Use bearer token for authorization. ('bearer_token' takes priority) bearer_token = "/var/run/secrets/kubernetes.io/serviceaccount/token" From a1991ce8ef27994f354b94954523852e4c6634aa Mon Sep 17 00:00:00 2001 From: Vishwanath Narasimhan Date: Fri, 15 Mar 2019 18:00:50 -0700 Subject: [PATCH 33/38] telegraf integration - v1 --- installer/conf/td-agent-bit-rs.conf | 3 +- installer/conf/td-agent-bit.conf | 23 +++- installer/conf/telegraf.conf | 49 ++++++-- source/code/go/src/plugins/oms.go | 166 ++++++++++++++++++++++++++ source/code/go/src/plugins/out_oms.go | 2 + 5 files changed, 231 insertions(+), 12 deletions(-) diff --git a/installer/conf/td-agent-bit-rs.conf b/installer/conf/td-agent-bit-rs.conf index 03d97657e..7993e7528 100644 --- a/installer/conf/td-agent-bit-rs.conf +++ b/installer/conf/td-agent-bit-rs.conf @@ -9,9 +9,10 @@ Tag oms.container.log.telegraf.err.* Path /var/opt/microsoft/docker-cimprov/log/telegraf.log DB /var/opt/microsoft/docker-cimprov/state/telegraf-log-state.db - Mem_Buf_Limit 30m + Mem_Buf_Limit 2m Path_Key filepath Skip_Long_Lines On + Ignore_Older 1h [FILTER] Name grep diff --git a/installer/conf/td-agent-bit.conf b/installer/conf/td-agent-bit.conf index 9771a4c96..8f2e5b5cd 100644 --- a/installer/conf/td-agent-bit.conf +++ b/installer/conf/td-agent-bit.conf @@ -10,27 +10,38 @@ Path /var/log/containers/*.log DB /var/log/omsagent-fblogs.db Parser docker - Mem_Buf_Limit 30m + Mem_Buf_Limit 5m Path_Key filepath Skip_Long_Lines On + Ignore_Older 5m [INPUT] Name tail Tag oms.container.log.flbplugin.* Path /var/log/containers/omsagent*.log DB /var/opt/microsoft/docker-cimprov/state/omsagent-ai.db - Mem_Buf_Limit 30m + Mem_Buf_Limit 2m Path_Key filepath Skip_Long_Lines On + Ignore_Older 5m [INPUT] Name tail Tag oms.container.log.telegraf.err.* Path /var/opt/microsoft/docker-cimprov/log/telegraf.log DB 
/var/opt/microsoft/docker-cimprov/state/telegraf-log-state.db - Mem_Buf_Limit 30m + Mem_Buf_Limit 2m Path_Key filepath Skip_Long_Lines On + Ignore_Older 5m + +[INPUT] + Name tcp + Tag oms.container.perf.telegraf.* + Listen 0.0.0.0 + Port 25226 + Chunk_Size 32 + Buffer_Size 64 [FILTER] Name grep @@ -41,4 +52,8 @@ Name oms EnableTelemetry true TelemetryPushIntervalSeconds 300 - Match oms.container.log.* \ No newline at end of file + Match oms.container.* + +#[OUTPUT] +# Name file +# Match oms.container.perf.telegraf.* \ No newline at end of file diff --git a/installer/conf/telegraf.conf b/installer/conf/telegraf.conf index 99b271ac4..ccd53bc03 100644 --- a/installer/conf/telegraf.conf +++ b/installer/conf/telegraf.conf @@ -77,9 +77,9 @@ ## Logging configuration: ## Run telegraf with debug log messages. - debug = false + debug = true ## Run telegraf in quiet mode (error log messages only). - quiet = true + quiet = false ## Specify the log file name. The empty string means to log to stderr. logfile = "/var/opt/microsoft/docker-cimprov/log/telegraf.log" @@ -94,29 +94,29 @@ ############################################################################### # Send aggregate metrics to Azure Monitor -[[outputs.azure_monitor]] +#[[outputs.azure_monitor]] ## Timeout for HTTP writes. # timeout = "20s" ## Set the namespace prefix, defaults to "Telegraf/". - namespace_prefix = "Insights.Container/" +# namespace_prefix = "Insights.Container/" ## Azure Monitor doesn't have a string value type, so convert string ## fields to dimensions (a.k.a. tags) if enabled. Azure Monitor allows ## a maximum of 10 dimensions so Telegraf will only send the first 10 ## alphanumeric dimensions. - strings_as_dimensions = true +# strings_as_dimensions = true ## Both region and resource_id must be set or be available via the ## Instance Metadata service on Azure Virtual Machines. # ## Azure Region to publish metrics against. 
## ex: region = "southcentralus" - region = "placeholder_region" +# region = "placeholder_region" # ## The Azure Resource ID against which metric will be logged, e.g. #resource_id = "/subscriptions//resourceGroups//providers/Microsoft.Compute/virtualMachines/" - resource_id = "placeholder_resource_id" +# resource_id = "placeholder_resource_id" #azure_tenant_id = "placeholder_azure_tenant_id" @@ -125,6 +125,41 @@ #azure_client_secret = "placeholder_azure_client_secret" #namepass = ["nodes", "pods", "containers","prometheus"] +# namedrop = ["filestat", "telegraf_telemetry"] +# tagdrop = ["AgentVersion","AKS_RESOURCE_ID","Region","ClusterName","ClusterType", "Computer", "ControllerType"] + +# Generic socket writer capable of handling multiple socket types. +[[outputs.socket_writer]] + ## URL to connect to + address = "tcp://0.0.0.0:25226" + # address = "tcp://example.com:http" + # address = "tcp4://127.0.0.1:8094" + # address = "tcp6://127.0.0.1:8094" + # address = "tcp6://[2001:db8::1]:8094" + # address = "udp://127.0.0.1:8094" + # address = "udp4://127.0.0.1:8094" + # address = "udp6://127.0.0.1:8094" + # address = "unix:///tmp/telegraf.sock" + # address = "unixgram:///tmp/telegraf.sock" + + ## Optional TLS Config + # tls_ca = "/etc/telegraf/ca.pem" + # tls_cert = "/etc/telegraf/cert.pem" + # tls_key = "/etc/telegraf/key.pem" + ## Use TLS but skip chain & host verification + # insecure_skip_verify = false + + ## Period between keep alive probes. + ## Only applies to TCP sockets. + ## 0 disables keep alive probes. + ## Defaults to the OS configuration. + # keep_alive_period = "5m" + + ## Data format to generate. 
+ ## Each data format has its own unique set of configuration options, read + ## more about them here: + ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md + data_format = "json" namedrop = ["filestat", "telegraf_telemetry"] tagdrop = ["AgentVersion","AKS_RESOURCE_ID","Region","ClusterName","ClusterType", "Computer", "ControllerType"] diff --git a/source/code/go/src/plugins/oms.go b/source/code/go/src/plugins/oms.go index d58a33b55..c97da3963 100644 --- a/source/code/go/src/plugins/oms.go +++ b/source/code/go/src/plugins/oms.go @@ -25,6 +25,11 @@ import ( // DataType for Container Log const DataType = "CONTAINER_LOG_BLOB" +//env variable which has ResourceId for LA +const ResourceIdEnv = "AKS_RESOURCE_ID" + +const CustomLogsAPIVersion = "api-version=2016-04-01" + // ContainerLogPluginConfFilePath --> config file path for container log plugin const DaemonSetContainerLogPluginConfFilePath = "/etc/opt/microsoft/docker-cimprov/out_oms.conf" const ReplicaSetContainerLogPluginConfFilePath = "/etc/opt/microsoft/docker-cimprov/out_oms-rs.conf" @@ -41,6 +46,8 @@ var ( HTTPClient http.Client // OMSEndpoint ingestion endpoint OMSEndpoint string + // Custom log ingestion endpoint for OMS + OMSCustomLogsEndpoint string // Computer (Hostname) when ingesting into ContainerLog table Computer string // WorkspaceID log analytics workspace id @@ -60,6 +67,8 @@ var ( ContainerLogTelemetryMutex = &sync.Mutex{} // ClientSet for querying KubeAPIs ClientSet *kubernetes.Clientset + //ResourceId for LA + ResourceId string ) var ( @@ -89,6 +98,18 @@ type DataItem struct { Computer string `json:"Computer"` } +// telegraf metric DataItem represents the object corresponding to the json that is sent by fluentbit tail plugin +type laTelegrafMetric struct { + Namespace string `json:"Namespace"` + Name string `json:"Name"` + Source string `json:"Source"` + TimeStamp string `json:"TimeStamp"` + Tags string `json:"Tags"` + Value float64 `json:"Value"` + ResourceId string 
`json:"ResourceId"` + MetricType string `json:"MetricType"` +} + // ContainerLogBlob represents the object corresponding to the payload that is sent to the ODS end point type ContainerLogBlob struct { DataType string `json:"DataType"` @@ -201,6 +222,147 @@ func updateKubeSystemContainerIDs() { } } +//Azure loganalytics metric values have to be numeric, so string values are dropped +func convert(in interface{}) (float64, bool) { + Log ("got %v", in) + switch v := in.(type) { + case int64: + return float64(v), true + case uint64: + return float64(v), true + case float64: + return v, true + case bool: + if v { + return float64(1), true + } + return float64(0), true + default: + Log ("returning 0 for %v ", in) + return float64(0), false + } +} + +//Translates telegraf time series to 1 or more Azure loganalytics metric +func translate(m map[interface{}]interface{}) ([]*laTelegrafMetric, error) { + + var laMetrics []*laTelegrafMetric + var tags map[interface{}]interface{} + tags = m["tags"].(map[interface{}]interface{}) + tagMap := make(map[string]string) + for k, v := range tags { + key := fmt.Sprintf("%s",k) + if key == "" { + continue + } + tagMap[key] = fmt.Sprintf("%s",v) + } + + var fieldMap map[interface{}]interface{} + fieldMap = m["fields"].(map[interface{}]interface{}) + + var metricType string = "unknown" + + tagJson, _ := json.Marshal(&tagMap) + + for k, v := range fieldMap { + fv, ok := convert(v) + if !ok { + continue + } + i := m["timestamp"].(uint64) + laMetric := laTelegrafMetric{ + Name: fmt.Sprintf("%s",k), + Namespace: fmt.Sprintf("%s",m["name"]), + Source: "telegraf", + TimeStamp: time.Unix(int64(i),0).Format(time.RFC3339), + Tags: fmt.Sprintf("%s", tagJson), + Value: fv, + ResourceId: ResourceId, + MetricType: metricType, + } + + //Log ("la metric:%v", laMetric) + laMetrics = append(laMetrics, &laMetric) + } + return laMetrics, nil +} + +//send metrics from Telegraf to LA +func PostTelegrafMetricsToLA(telegrafRecords []map[interface{}]interface{}) 
int { + var laMetrics []*laTelegrafMetric + for _, record := range telegrafRecords { + //Log ("mymetric:%s", record) + translatedMetrics, err := translate(record) + if err != nil { + Log("PostTelegrafMetricsToLA::Error when translating telegraf metric to log analytics metric %q", err) + } + laMetrics = append(laMetrics, translatedMetrics...) + } + + jsonBytes, err := json.Marshal(&laMetrics) + if err != nil { + Log("PostTelegrafMetricsToLA::Error when marshalling json %q", err) + //SendException(message) + return output.FLB_OK + } + + Log ("got %s metrics", len(laMetrics)) + + //start + req, _ := http.NewRequest("POST", OMSCustomLogsEndpoint, bytes.NewBuffer(jsonBytes)) + + //req.URL.Query().Add("api-version","2016-04-01") + + req.Header.Set("x-ms-date", time.Now().Format(time.RFC3339)) + req.Header.Set("Content-Type", "application/json") + req.Header.Set("Log-Type", "telegrafMetricsV1"); + req.Header.Set("time-generated-field", "timestamp"); + req.Header.Set("x-ms-AzureResourceId", ResourceId) + + start := time.Now() + resp, err := HTTPClient.Do(req) + elapsed := time.Since(start) + + if err != nil { + message := fmt.Sprintf("PostTelegrafMetricsToLA::Error when sending request %s \n", err.Error()) + Log(message) + //SendException(message) + Log("PostTelegrafMetricsToLA::Failed to flush %d records after %s", len(laMetrics), elapsed) + + return output.FLB_RETRY + } + + if resp == nil || resp.StatusCode != 200 { + if resp != nil { + Log("PostTelegrafMetricsToLA::Response Status %s Status Code %d", resp.Status, resp.StatusCode) + } + return output.FLB_RETRY + } + + defer resp.Body.Close() + + numRecords := len(laMetrics) + Log("PostTelegrafMetricsToLA::Successfully flushed %d records in %s", numRecords, elapsed) + //ContainerLogTelemetryMutex.Lock() + //FlushedRecordsCount += float64(numRecords) + //FlushedRecordsTimeTaken += float64(elapsed / time.Millisecond) + + //if maxLatency >= AgentLogProcessingMaxLatencyMs { + // AgentLogProcessingMaxLatencyMs = maxLatency + 
// AgentLogProcessingMaxLatencyMsContainer = maxLatencyContainer + //} + + //ContainerLogTelemetryMutex.Unlock() +//} + + return output.FLB_OK + + + //end + +} + // PostDataHelper sends data to the OMS endpoint func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { @@ -317,6 +479,8 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { return output.FLB_RETRY } + defer resp.Body.Close() + numRecords := len(dataItems) Log("Successfully flushed %d records in %s", numRecords, elapsed) ContainerLogTelemetryMutex.Lock() @@ -358,6 +522,7 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { IgnoreIDSet = make(map[string]bool) ImageIDMap = make(map[string]string) NameIDMap = make(map[string]string) + ResourceId = os.Getenv(ResourceIdEnv) pluginConfig, err := ReadConfiguration(pluginConfPath) if err != nil { @@ -377,6 +542,7 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { log.Fatalln(message) } OMSEndpoint = omsadminConf["OMS_ENDPOINT"] + OMSCustomLogsEndpoint = OMSEndpoint + "?" 
+ CustomLogsAPIVersion WorkspaceID = omsadminConf["WORKSPACE_ID"] Log("OMSEndpoint %s", OMSEndpoint) diff --git a/source/code/go/src/plugins/out_oms.go b/source/code/go/src/plugins/out_oms.go index 90ecaf15a..dccc6774c 100644 --- a/source/code/go/src/plugins/out_oms.go +++ b/source/code/go/src/plugins/out_oms.go @@ -62,6 +62,8 @@ func FLBPluginFlush(data unsafe.Pointer, length C.int, tag *C.char) int { incomingTag := strings.ToLower(C.GoString(tag)) if strings.Contains(incomingTag, "oms.container.log.flbplugin") { return PushToAppInsightsTraces(records, appinsights.Information, incomingTag) + } else if strings.Contains(incomingTag, "oms.container.perf.telegraf") { + return PostTelegrafMetricsToLA(records) } else if strings.Contains(incomingTag, "oms.container.log.telegraf.err") { return PushToAppInsightsTraces(records, appinsights.Error, incomingTag) } From 1c4c714e553cfe1658210dda47048945cc05465e Mon Sep 17 00:00:00 2001 From: Vishwanath Narasimhan Date: Tue, 19 Mar 2019 18:00:35 -0700 Subject: [PATCH 34/38] telemetry changes for telegraf --- installer/conf/telegraf-rs.conf | 10 +-- installer/conf/telegraf.conf | 10 +-- installer/scripts/Telegraf403Telemetry.sh | 4 +- source/code/go/src/plugins/oms.go | 85 +++++++++++++---------- source/code/go/src/plugins/telemetry.go | 15 +++- 5 files changed, 67 insertions(+), 57 deletions(-) diff --git a/installer/conf/telegraf-rs.conf b/installer/conf/telegraf-rs.conf index fea446345..fd430e6b9 100644 --- a/installer/conf/telegraf-rs.conf +++ b/installer/conf/telegraf-rs.conf @@ -142,7 +142,7 @@ # [outputs.application_insights.context_tag_sources] # "ai.cloud.role" = "kubernetes_container_name" # "ai.cloud.roleInstance" = "kubernetes_pod_name" - namepass = ["filestat", "telegraf_telemetry"] + namepass = ["telegraf_telemetry"] ############################################################################### # PROCESSOR PLUGINS # @@ -562,14 +562,6 @@ ## Use TLS but skip chain & host verification # insecure_skip_verify = true # 
Read stats about given file(s) -[[inputs.filestat]] - ## Files to gather stats about. - interval = "15m" - ## These accept standard unix glob matching rules, but with the addition of - ## ** as a "super asterisk". See https://github.com/gobwas/glob. - files = ["/var/opt/microsoft/docker-cimprov/log/telegraf.log"] - ## If true, read the entire file and calculate an md5 checksum. - md5 = false [[inputs.kube_inventory]] ## URL for the Kubernetes API #url = "https://127.0.0.1" diff --git a/installer/conf/telegraf.conf b/installer/conf/telegraf.conf index ccd53bc03..dd86a50a9 100644 --- a/installer/conf/telegraf.conf +++ b/installer/conf/telegraf.conf @@ -180,7 +180,7 @@ # [outputs.application_insights.context_tag_sources] # "ai.cloud.role" = "kubernetes_container_name" # "ai.cloud.roleInstance" = "kubernetes_pod_name" - namepass = ["filestat", "telegraf_telemetry"] + namepass = ["telegraf_telemetry"] tagdrop = ["nodeName"] ############################################################################### @@ -651,14 +651,6 @@ fieldpass = ["n_containers", "n_containers_running", "n_containers_stopped", "n_containers_paused", "n_images"] #fieldpass = ["numContainers", "numContainersRunning", "numContainersStopped", "numContainersPaused", "numContainerImages"] taginclude = ["nodeName"] -[[inputs.filestat]] - ## Files to gather stats about. - interval = "15m" - ## These accept standard unix glob matching rules, but with the addition of - ## ** as a "super asterisk". See https://github.com/gobwas/glob. - files = ["/var/opt/microsoft/docker-cimprov/log/telegraf.log"] - ## If true, read the entire file and calculate an md5 checksum. 
- md5 = false [[inputs.exec]] ## Commands array interval = "15m" diff --git a/installer/scripts/Telegraf403Telemetry.sh b/installer/scripts/Telegraf403Telemetry.sh index 3022ceaf0..c64369798 100644 --- a/installer/scripts/Telegraf403Telemetry.sh +++ b/installer/scripts/Telegraf403Telemetry.sh @@ -1,3 +1,3 @@ #!/bin/sh -count403=$(grep -iF "[azure_monitor]: failed to write batch: [403] 403 Forbidden" /var/opt/microsoft/docker-cimprov/log/telegraf.log | wc -l | tr -d '\n') -echo "telegraf,AKS_RESOURCE_ID=${AKS_RESOURCE_ID} 403countTotal=${count403}i" \ No newline at end of file +countErr=$(grep -iF "Error writing to output [socket_writer]" /var/opt/microsoft/docker-cimprov/log/telegraf.log | wc -l | tr -d '\n') +echo "telegraf,AKS_RESOURCE_ID=${AKS_RESOURCE_ID} telegrafTCPWriteErrorCountTotal=${countErr}i" \ No newline at end of file diff --git a/source/code/go/src/plugins/oms.go b/source/code/go/src/plugins/oms.go index c97da3963..a18a9b37a 100644 --- a/source/code/go/src/plugins/oms.go +++ b/source/code/go/src/plugins/oms.go @@ -224,7 +224,6 @@ func updateKubeSystemContainerIDs() { //Azure loganalytics metric values have to be numeric, so string values are dropped func convert(in interface{}) (float64, bool) { - Log ("got %v", in) switch v := in.(type) { case int64: return float64(v), true @@ -244,7 +243,7 @@ func convert(in interface{}) (float64, bool) { } //Translates telegraf time series to 1 or more Azure loganalytics metric -func translate(m map[interface{}]interface{}) ([]*laTelegrafMetric, error) { +func translateTelegrafMetrics(m map[interface{}]interface{}) ([]*laTelegrafMetric, error) { var laMetrics []*laTelegrafMetric var tags map[interface{}]interface{} @@ -263,7 +262,11 @@ func translate(m map[interface{}]interface{}) ([]*laTelegrafMetric, error) { var metricType string = "unknown" - tagJson, _ := json.Marshal(&tagMap) + tagJson, err := json.Marshal(&tagMap) + + if err != nil { + return nil, err + } for k, v := range fieldMap { fv, ok := convert(v) @@ 
-288,32 +291,49 @@ func translate(m map[interface{}]interface{}) ([]*laTelegrafMetric, error) { return laMetrics, nil } -//send metrics from Telegraf to LA +//send metrics from Telegraf to LA. 1) Translate telegraf timeseries to LA metric(s) 2) Send it to LA as 'ContainerMetrics' fixed type func PostTelegrafMetricsToLA(telegrafRecords []map[interface{}]interface{}) int { var laMetrics []*laTelegrafMetric + + if ( (telegrafRecords== nil) || ! (len(telegrafRecords) > 0) ) { + Log("PostTelegrafMetricsToLA::Error:no timeseries to derive") + return output.FLB_OK + } + for _, record := range telegrafRecords { - //Log ("mymetric:%s", record) - translatedMetrics, err := translate(record) + translatedMetrics, err := translateTelegrafMetrics(record) if err != nil { - Log("PostTelegrafMetricsToLA::Error when translating telegraf metric to log analytics metric %q", err) + message := fmt.Sprintf("PostTelegrafMetricsToLA::Error:when translating telegraf metric to log analytics metric %q", err) + Log(message) + //SendException(message) //This will be too noisy } laMetrics = append(laMetrics, translatedMetrics...) 
} - jsonBytes, err := json.Marshal(&laMetrics) - if err != nil { - Log("PostTelegrafMetricsToLA::Error when marshalling json %q", err) - //SendException(message) + if ( (laMetrics == nil) || !(len(laMetrics) > 0) ) { + Log("PostTelegrafMetricsToLA::Info:no metrics derived from timeseries data") return output.FLB_OK + } else { + message := fmt.Sprintf("PostTelegrafMetricsToLA::Info:derived %v metrics from %v timeseries", len(laMetrics), len(telegrafRecords)) + Log(message) } - Log ("got %s metrics", len(laMetrics)) - //start - req, _ := http.NewRequest("POST", OMSCustomLogsEndpoint, bytes.NewBuffer(jsonBytes)) + jsonBytes, err := json.Marshal(&laMetrics) + + if err != nil { + message := fmt.Sprintf("PostTelegrafMetricsToLA::Error:when marshalling json %q", err) + Log(message) + SendException(message) + return output.FLB_OK + } + //Post metrics data to LA + req, _ := http.NewRequest("POST", OMSCustomLogsEndpoint, bytes.NewBuffer(jsonBytes)) + //req.URL.Query().Add("api-version","2016-04-01") - + + //set headers req.Header.Set("x-ms-date", time.Now().Format(time.RFC3339)) req.Header.Set("Content-Type", "application/json") req.Header.Set("Log-Type", "telegrafMetricsV1"); @@ -325,42 +345,35 @@ func PostTelegrafMetricsToLA(telegrafRecords []map[interface{}]interface{}) int elapsed := time.Since(start) if err != nil { - message := fmt.Sprintf("PostTelegrafMetricsToLA::Error when sending request %s \n", err.Error()) + message := fmt.Sprintf("PostTelegrafMetricsToLA::Error:(retriable) when sending %v metrics. 
duration:%v err:%q \n", len(laMetrics), elapsed, err.Error()) Log(message) - //SendException(message) - Log("PostTelegrafMetricsToLA::Failed to flush %d records after %s", len(laMetrics), elapsed) - + SendException(message) + UpdateNumTelegrafMetricsSentTelemetry(0, 1) return output.FLB_RETRY } if resp == nil || resp.StatusCode != 200 { if resp != nil { - Log("PostTelegrafMetricsToLA::Response Status %s Status Code %d", resp.Status, resp.StatusCode) + Log("PostTelegrafMetricsToLA::Error:(retriable) Response Status %v Status Code %v", resp.Status, resp.StatusCode) } + UpdateNumTelegrafMetricsSentTelemetry(0, 1) return output.FLB_RETRY } defer resp.Body.Close() - numRecords := len(laMetrics) - Log("PostTelegrafMetricsToLA::Successfully flushed %d records in %s", numRecords, elapsed) - //ContainerLogTelemetryMutex.Lock() - //FlushedRecordsCount += float64(numRecords) - //FlushedRecordsTimeTaken += float64(elapsed / time.Millisecond) - - //if maxLatency >= AgentLogProcessingMaxLatencyMs { - // AgentLogProcessingMaxLatencyMs = maxLatency - // AgentLogProcessingMaxLatencyMsContainer = maxLatencyContainer - //} - - //ContainerLogTelemetryMutex.Unlock() -//} + numMetrics := len(laMetrics) + UpdateNumTelegrafMetricsSentTelemetry(numMetrics, 0) + Log("PostTelegrafMetricsToLA::Info:Successfully flushed %v records in %v", numMetrics, elapsed) return output.FLB_OK +} - - //end - +func UpdateNumTelegrafMetricsSentTelemetry(numMetricsSent int, numSendErrors int) { + ContainerLogTelemetryMutex.Lock() + TelegrafMetricsSentCount += float64(numMetricsSent) + TelegrafMetricsSendErrorCount += float64(numSendErrors) + ContainerLogTelemetryMutex.Unlock() } // PostDataHelper sends data to the OMS endpoint diff --git a/source/code/go/src/plugins/telemetry.go b/source/code/go/src/plugins/telemetry.go index bbc7be5eb..370fb63e9 100644 --- a/source/code/go/src/plugins/telemetry.go +++ b/source/code/go/src/plugins/telemetry.go @@ -14,7 +14,7 @@ import ( ) var ( - // FlushedRecordsCount 
indicates the number of flushed records in the current period + // FlushedRecordsCount indicates the number of flushed log records in the current period FlushedRecordsCount float64 // FlushedRecordsTimeTaken indicates the cumulative time taken to flush the records for the current period FlushedRecordsTimeTaken float64 @@ -28,6 +28,10 @@ var ( TelemetryClient appinsights.TelemetryClient // ContainerLogTelemetryTicker sends telemetry periodically ContainerLogTelemetryTicker *time.Ticker + //Tracks the number of telegraf metrics sent successfully between telemetry ticker periods (uses ContainerLogTelemetryTicker) + TelegrafMetricsSentCount float64 + //Tracks the number of send errors between telemetry ticker periods (uses ContainerLogTelemetryTicker) + TelegrafMetricsSendErrorCount float64 ) const ( @@ -39,6 +43,8 @@ const ( metricNameAvgFlushRate = "ContainerLogAvgRecordsFlushedPerSec" metricNameAvgLogGenerationRate = "ContainerLogsGeneratedPerSec" metricNameAgentLogProcessingMaxLatencyMs = "ContainerLogsAgentSideLatencyMs" + metricNameNumberofTelegrafMetricsSentSuccessfully = "TelegrafMetricsSentCount" + metricNameNumberofSendErrorsTelegrafMetrics = "TelegrafMetricsSendErrorCount" defaultTelemetryPushIntervalSeconds = 300 @@ -62,9 +68,14 @@ func SendContainerLogPluginMetrics(telemetryPushIntervalProperty string) { for ; true; <-ContainerLogTelemetryTicker.C { SendEvent(eventNameDaemonSetHeartbeat, make(map[string]string)) elapsed := time.Since(start) + ContainerLogTelemetryMutex.Lock() flushRate := FlushedRecordsCount / FlushedRecordsTimeTaken * 1000 logRate := FlushedRecordsCount / float64(elapsed/time.Second) + telegrafMetricsSentCount := TelegrafMetricsSentCount + telegrafMetricsSendErrorCount := TelegrafMetricsSendErrorCount + TelegrafMetricsSentCount = 0.0 + TelegrafMetricsSendErrorCount = 0.0 FlushedRecordsCount = 0.0 FlushedRecordsTimeTaken = 0.0 logLatencyMs := AgentLogProcessingMaxLatencyMs @@ -80,6 +91,8 @@ func 
SendContainerLogPluginMetrics(telemetryPushIntervalProperty string) { logLatencyMetric := appinsights.NewMetricTelemetry(metricNameAgentLogProcessingMaxLatencyMs, logLatencyMs) logLatencyMetric.Properties["Container"] = logLatencyMsContainer TelemetryClient.Track(logLatencyMetric) + TelemetryClient.Track(appinsights.NewMetricTelemetry(metricNameNumberofTelegrafMetricsSentSuccessfully, telegrafMetricsSentCount)) + TelemetryClient.Track(appinsights.NewMetricTelemetry(metricNameNumberofSendErrorsTelegrafMetrics, telegrafMetricsSendErrorCount)) start = time.Now() } } From f1325de25dd3cebdc4cb5d2a2e81fcc77a1f76cd Mon Sep 17 00:00:00 2001 From: Vishwanath Narasimhan Date: Thu, 21 Mar 2019 16:38:51 -0700 Subject: [PATCH 35/38] telemetry & other changes --- installer/conf/telegraf-rs.conf | 618 ------------------ installer/conf/telegraf.conf | 348 +++------- installer/datafiles/base_container.data | 3 +- installer/scripts/Telegraf403Telemetry.sh | 3 - .../scripts/TelegrafTCPErrorTelemetry.sh | 3 + 5 files changed, 99 insertions(+), 876 deletions(-) delete mode 100644 installer/conf/telegraf-rs.conf delete mode 100644 installer/scripts/Telegraf403Telemetry.sh create mode 100644 installer/scripts/TelegrafTCPErrorTelemetry.sh diff --git a/installer/conf/telegraf-rs.conf b/installer/conf/telegraf-rs.conf deleted file mode 100644 index fd430e6b9..000000000 --- a/installer/conf/telegraf-rs.conf +++ /dev/null @@ -1,618 +0,0 @@ -# Telegraf Configuration -# -# Telegraf is entirely plugin driven. All metrics are gathered from the -# declared inputs, and sent to the declared outputs. -# -# Plugins must be declared in here to be active. -# To deactivate a plugin, comment out the name and any variables. -# -# Use 'telegraf -config telegraf.conf -test' to see what metrics a config -# file would generate. -# -# Environment variables can be used anywhere in this config file, simply prepend -# them with $. 
For strings the variable must be within quotes (ie, "$STR_VAR"), -# for numbers and booleans they should be plain (ie, $INT_VAR, $BOOL_VAR) - - -# Global tags can be specified here in key="value" format. -[global_tags] - # dc = "us-east-1" # will tag all metrics with dc=us-east-1 - # rack = "1a" - ## Environment variables can be used as tags, and throughout the config file - # user = "$USER" - # cluster = "$ACS_RESOURCE_NAME" - #node = $NODE_IP - AgentVersion = "$AGENT_VERSION" - AKS_RESOURCE_ID = "$AKS_RESOURCE_ID" - Region = "$AKS_REGION" - ClusterName = "$AKS_CLUSTER_NAME" - ClusterType = "AKS" - Computer = "placeholder_hostname" - ControllerType = "$CONTROLLER_TYPE" - -# Configuration for telegraf agent -[agent] - ## Default data collection interval for all inputs - interval = "60s" - ## Rounds collection interval to 'interval' - ## ie, if interval="10s" then always collect on :00, :10, :20, etc. - round_interval = true - - ## Telegraf will send metrics to outputs in batches of at most - ## metric_batch_size metrics. - ## This controls the size of writes that Telegraf sends to output plugins. - metric_batch_size = 1000 - - ## For failed writes, telegraf will cache metric_buffer_limit metrics for each - ## output, and will flush this buffer on a successful write. Oldest metrics - ## are dropped first when this buffer fills. - ## This buffer only fills when writes fail to output plugin(s). - metric_buffer_limit = 10000 - - ## Collection jitter is used to jitter the collection by a random amount. - ## Each plugin will sleep for a random time within jitter before collecting. - ## This can be used to avoid many plugins querying things like sysfs at the - ## same time, which can have a measurable effect on the system. - collection_jitter = "0s" - - ## Default flushing interval for all outputs. You shouldn't set this below - ## interval. 
Maximum flush_interval will be flush_interval + flush_jitter - flush_interval = "10s" - ## Jitter the flush interval by a random amount. This is primarily to avoid - ## large write spikes for users running a large number of telegraf instances. - ## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s - flush_jitter = "0s" - - ## By default or when set to "0s", precision will be set to the same - ## timestamp order as the collection interval, with the maximum being 1s. - ## ie, when interval = "10s", precision will be "1s" - ## when interval = "250ms", precision will be "1ms" - ## Precision will NOT be used for service inputs. It is up to each individual - ## service input to set the timestamp at the appropriate precision. - ## Valid time units are "ns", "us" (or "µs"), "ms", "s". - precision = "" - - ## Logging configuration: - ## Run telegraf with debug log messages. - debug = false - ## Run telegraf in quiet mode (error log messages only). - quiet = true - ## Specify the log file name. The empty string means to log to stderr. - logfile = "/var/opt/microsoft/docker-cimprov/log/telegraf.log" - - ## Override default hostname, if empty use os.Hostname() - #hostname = "placeholder_hostname" - ## If set to true, do no set the "host" tag in the telegraf agent. - omit_hostname = true - - -############################################################################### -# OUTPUT PLUGINS # -############################################################################### - -# Send aggregate metrics to Azure Monitor -[[outputs.azure_monitor]] - ## Timeout for HTTP writes. - # timeout = "20s" - - ## Set the namespace prefix, defaults to "Telegraf/". - namespace_prefix = "Insights.Container/" - - ## Azure Monitor doesn't have a string value type, so convert string - ## fields to dimensions (a.k.a. tags) if enabled. Azure Monitor allows - ## a maximum of 10 dimensions so Telegraf will only send the first 10 - ## alphanumeric dimensions. 
- strings_as_dimensions = true - - ## Both region and resource_id must be set or be available via the - ## Instance Metadata service on Azure Virtual Machines. - # - ## Azure Region to publish metrics against. - ## ex: region = "southcentralus" - region = "placeholder_region" - # - ## The Azure Resource ID against which metric will be logged, e.g. - #resource_id = "/subscriptions//resourceGroups//providers/Microsoft.Compute/virtualMachines/" - resource_id = "placeholder_resource_id" - - #azure_tenant_id = "placeholder_azure_tenant_id" - - #azure_client_id = "placeholder_azure_client_id" - - #azure_client_secret = "placeholder_azure_client_secret" - - #namepass = ["nodes", "pods", "containers","prometheus"] - namedrop = ["filestat", "telegraf_telemetry"] - tagdrop = ["AgentVersion","AKS_RESOURCE_ID","Region","ClusterName","ClusterType", "Computer", "ControllerType"] - -[[outputs.application_insights]] - ## Instrumentation key of the Application Insights resource. - instrumentation_key = "$APPLICATIONINSIGHTS_KEY" - - ## Timeout for closing (default: 5s). - # timeout = "5s" - - ## Enable additional diagnostic logging. - # enable_diagnostic_logging = false - - ## Context Tag Sources add Application Insights context tags to a tag value. 
- ## - ## For list of allowed context tag keys see: - ## https://github.com/Microsoft/ApplicationInsights-Go/blob/master/appinsights/contracts/contexttagkeys.go - # [outputs.application_insights.context_tag_sources] - # "ai.cloud.role" = "kubernetes_container_name" - # "ai.cloud.roleInstance" = "kubernetes_pod_name" - namepass = ["telegraf_telemetry"] - -############################################################################### -# PROCESSOR PLUGINS # -############################################################################### - -# # Convert values to another metric value type -# [[processors.converter]] -# ## Tags to convert -# ## -# ## The table key determines the target type, and the array of key-values -# ## select the keys to convert. The array may contain globs. -# ## = [...] -# [processors.converter.tags] -# string = ["device"] -# integer = [] -# unsigned = [] -# boolean = [] -# float = [] -# -# ## Fields to convert -# ## -# ## The table key determines the target type, and the array of key-values -# ## select the keys to convert. The array may contain globs. -# ## = [...] -# [processors.converter.fields] -# tag = ["host"] -# string = [] -# integer = [] -# unsigned = [] -# boolean = [] -# float = [] - - -# # Map enum values according to given table. -# [[processors.enum]] -# [[processors.enum.mapping]] -# ## Name of the field to map -# field = "status" -# -# ## Destination field to be used for the mapped value. By default the source -# ## field is used, overwriting the original value. -# # dest = "status_code" -# -# ## Default value to be used for all values not contained in the mapping -# ## table. When unset, the unmodified value for the field will be used if no -# ## match is found. -# # default = 0 -# -# ## Table of mappings -# [processors.enum.mapping.value_mappings] -# green = 1 -# yellow = 2 -# red = 3 - - -# # Apply metric modifications using override semantics. 
-# [[processors.override]] -# ## All modifications on inputs and aggregators can be overridden: -# # name_override = "new_name" -# # name_prefix = "new_name_prefix" -# # name_suffix = "new_name_suffix" -# -# ## Tags to be added (all values must be strings) -# # [processors.override.tags] -# # additional_tag = "tag_value" - - -# # Parse a value in a specified field/tag(s) and add the result in a new metric -# [[processors.parser]] -# ## The name of the fields whose value will be parsed. -# parse_fields = [] -# -# ## If true, incoming metrics are not emitted. -# drop_original = false -# -# ## If set to override, emitted metrics will be merged by overriding the -# ## original metric using the newly parsed metrics. -# merge = "override" -# -# ## The dataformat to be read from files -# ## Each data format has its own unique set of configuration options, read -# ## more about them here: -# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md -# data_format = "influx" - - -# # Print all metrics that pass through this filter. 
-# [[processors.printer]] - - -# # Transforms tag and field values with regex pattern -# [[processors.regex]] -# ## Tag and field conversions defined in a separate sub-tables -# # [[processors.regex.tags]] -# # ## Tag to change -# # key = "resp_code" -# # ## Regular expression to match on a tag value -# # pattern = "^(\\d)\\d\\d$" -# # ## Pattern for constructing a new value (${1} represents first subgroup) -# # replacement = "${1}xx" -# -# # [[processors.regex.fields]] -# # key = "request" -# # ## All the power of the Go regular expressions available here -# # ## For example, named subgroups -# # pattern = "^/api(?P/[\\w/]+)\\S*" -# # replacement = "${method}" -# # ## If result_key is present, a new field will be created -# # ## instead of changing existing field -# # result_key = "method" -# -# ## Multiple conversions may be applied for one field sequentially -# ## Let's extract one more value -# # [[processors.regex.fields]] -# # key = "request" -# # pattern = ".*category=(\\w+).*" -# # replacement = "${1}" -# # result_key = "search_category" - - -# # Rename measurements, tags, and fields that pass through this filter. 
-# [[processors.rename]] - - -# # Perform string processing on tags, fields, and measurements -[[processors.rename]] - [[processors.rename.replace]] - measurement = "kubernetes_daemonset" - dest = "daemonsets" - [[processors.rename.replace]] - measurement = "kubernetes_deployment" - dest = "deployments" - [[processors.rename.replace]] - measurement = "kubernetes_statefulset" - dest = "statefulsets" - [[processors.rename.replace]] - measurement = "kubernetes_node" - dest = "nodes" - [[processors.rename.replace]] - measurement = "kubernetes_pod_container" - dest = "containers" - [[processors.rename.replace]] - field = "current_number_scheduled" - dest = "currentNumberScheduled" - [[processors.rename.replace]] - field = "desired_number_scheduled" - dest = "desiredNumberScheduled" - [[processors.rename.replace]] - field = "number_available" - dest = "numberAvailable" - [[processors.rename.replace]] - field = "number_unavailable" - dest = "numUnavailable" - [[processors.rename.replace]] - field = "number_ready" - dest = "numReady" - [[processors.rename.replace]] - field = "replicas_available" - dest = "numReplicasAvailable" - [[processors.rename.replace]] - field = "replicas_unavailable" - dest = "numReplicasUnavailable" - [[processors.rename.replace]] - field = "capacity_cpu_cores" - dest = "capacityCpuCores" - [[processors.rename.replace]] - field = "capacity_memory_bytes" - dest = "capacityMemoryBytes" - [[processors.rename.replace]] - field = "capacity_pods" - dest = "capacityNumPods" - [[processors.rename.replace]] - field = "allocatable_pods" - dest = "allocatableNumPods" - [[processors.rename.replace]] - field = "allocatable_cpu_cores" - dest = "allocatableCpuCores" - [[processors.rename.replace]] - field = "allocatable_memory_bytes" - dest = "allocatableMemoryBytes" - [[processors.rename.replace]] - field = "restarts_total" - dest = "restartsTotal" - [[processors.rename.replace]] - field = "resource_requests_cpu_units" - dest = "resourceRequestsCpuUnits" - 
[[processors.rename.replace]] - field = "resource_requests_memory_bytes" - dest = "resourceRequestsMemoryBytes" - [[processors.rename.replace]] - field = "resource_limits_cpu_units" - dest = "resourceLimitsCpuUnits" - [[processors.rename.replace]] - field = "resource_limits_memory_bytes" - dest = "resourceLimitsMemoryBytes" - [[processors.rename.replace]] - field = "spec_replicas" - dest = "numSpecReplicas" - [[processors.rename.replace]] - field = "replicas_current" - dest = "numCurrentReplicas" - [[processors.rename.replace]] - field = "replicas_ready" - dest = "numReadyReplicas" - [[processors.rename.replace]] - tag = "daemonset_name" - dest = "daemonsetName" - [[processors.rename.replace]] - tag = "deployment_name" - dest = "deploymentName" - [[processors.rename.replace]] - tag = "container_name" - dest = "containerName" - [[processors.rename.replace]] - tag = "statefulset_name" - dest = "statefulsetName" - [[processors.rename.replace]] - tag = "node_name" - dest = "nodeName" - -# ## Convert a tag value to uppercase -# # [[processors.strings.uppercase]] -# # tag = "method" -# -# ## Convert a field value to lowercase and store in a new field -# # [[processors.strings.lowercase]] -# # field = "uri_stem" -# # dest = "uri_stem_normalised" -# -# ## Trim leading and trailing whitespace using the default cutset -# # [[processors.strings.trim]] -# # field = "message" -# -# ## Trim leading characters in cutset -# # [[processors.strings.trim_left]] -# # field = "message" -# # cutset = "\t" -# -# ## Trim trailing characters in cutset -# # [[processors.strings.trim_right]] -# # field = "message" -# # cutset = "\r\n" -# -# ## Trim the given prefix from the field -# # [[processors.strings.trim_prefix]] -# # field = "my_value" -# # prefix = "my_" -# -# ## Trim the given suffix from the field -# # [[processors.strings.trim_suffix]] -# # field = "read_count" -# # suffix = "_count" - - -# # Print all metrics that pass through this filter. 
-# [[processors.topk]] -# ## How many seconds between aggregations -# # period = 10 -# -# ## How many top metrics to return -# # k = 10 -# -# ## Over which tags should the aggregation be done. Globs can be specified, in -# ## which case any tag matching the glob will aggregated over. If set to an -# ## empty list is no aggregation over tags is done -# # group_by = ['*'] -# -# ## Over which fields are the top k are calculated -# # fields = ["value"] -# -# ## What aggregation to use. Options: sum, mean, min, max -# # aggregation = "mean" -# -# ## Instead of the top k largest metrics, return the bottom k lowest metrics -# # bottomk = false -# -# ## The plugin assigns each metric a GroupBy tag generated from its name and -# ## tags. If this setting is different than "" the plugin will add a -# ## tag (which name will be the value of this setting) to each metric with -# ## the value of the calculated GroupBy tag. Useful for debugging -# # add_groupby_tag = "" -# -# ## These settings provide a way to know the position of each metric in -# ## the top k. The 'add_rank_field' setting allows to specify for which -# ## fields the position is required. If the list is non empty, then a field -# ## will be added to each and every metric for each string present in this -# ## setting. This field will contain the ranking of the group that -# ## the metric belonged to when aggregated over that field. -# ## The name of the field will be set to the name of the aggregation field, -# ## suffixed with the string '_topk_rank' -# # add_rank_fields = [] -# -# ## These settings provide a way to know what values the plugin is generating -# ## when aggregating metrics. The 'add_agregate_field' setting allows to -# ## specify for which fields the final aggregation value is required. If the -# ## list is non empty, then a field will be added to each every metric for -# ## each field present in this setting. 
This field will contain -# ## the computed aggregation for the group that the metric belonged to when -# ## aggregated over that field. -# ## The name of the field will be set to the name of the aggregation field, -# ## suffixed with the string '_topk_aggregate' -# # add_aggregate_fields = [] - - - -############################################################################### -# AGGREGATOR PLUGINS # -############################################################################### - -# # Keep the aggregate basicstats of each metric passing through. -# [[aggregators.basicstats]] -# ## General Aggregator Arguments: -# ## The period on which to flush & clear the aggregator. -# period = "30s" -# ## If true, the original metric will be dropped by the -# ## aggregator and will not get sent to the output plugins. -# drop_original = false - - -# # Create aggregate histograms. -# [[aggregators.histogram]] -# ## The period in which to flush the aggregator. -# period = "30s" -# -# ## If true, the original metric will be dropped by the -# ## aggregator and will not get sent to the output plugins. -# drop_original = false -# -# ## Example config that aggregates all fields of the metric. -# # [[aggregators.histogram.config]] -# # ## The set of buckets. -# # buckets = [0.0, 15.6, 34.5, 49.1, 71.5, 80.5, 94.5, 100.0] -# # ## The name of metric. -# # measurement_name = "cpu" -# -# ## Example config that aggregates only specific fields of the metric. -# # [[aggregators.histogram.config]] -# # ## The set of buckets. -# # buckets = [0.0, 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0] -# # ## The name of metric. -# # measurement_name = "diskio" -# # ## The concrete fields of metric -# # fields = ["io_time", "read_time", "write_time"] - - -# # Keep the aggregate min/max of each metric passing through. -# [[aggregators.minmax]] -# ## General Aggregator Arguments: -# ## The period on which to flush & clear the aggregator. 
-# period = "30s" -# ## If true, the original metric will be dropped by the -# ## aggregator and will not get sent to the output plugins. -# drop_original = false - - -# # Count the occurance of values in fields. -# [[aggregators.valuecounter]] -# ## General Aggregator Arguments: -# ## The period on which to flush & clear the aggregator. -# period = "30s" -# ## If true, the original metric will be dropped by the -# ## aggregator and will not get sent to the output plugins. -# drop_original = false -# ## The fields for which the values will be counted -# fields = [] - - - -############################################################################### -# INPUT PLUGINS # -############################################################################### - -# Read metrics about cpu usage -#[[inputs.cpu]] - ## Whether to report per-cpu stats or not -# percpu = false - ## Whether to report total system cpu stats or not -# totalcpu = true - ## If true, collect raw CPU time metrics. -# collect_cpu_time = false - ## If true, compute and report the sum of all non-idle CPU states. -# report_active = true -# fieldpass = ["usage_active","cluster","node","host","device"] -# taginclude = ["cluster","cpu","node"] - - - - # Read metrics from one or many prometheus clients -#[[inputs.prometheus]] - ## An array of urls to scrape metrics from. -# urls = ["https://$METRICS_SERVER_SERVICE_HOST/metrics"] - - ## An array of Kubernetes services to scrape metrics from. - # kubernetes_services = ["http://my-service-dns.my-namespace:9100/metrics"] - - ## Kubernetes config file to create client from. - # kube_config = "/path/to/kubernetes.config" - - ## Scrape Kubernetes pods for the following prometheus annotations: - ## - prometheus.io/scrape: Enable scraping for this pod - ## - prometheus.io/scheme: If the metrics endpoint is secured then you will need to - ## set this to 'https' & most likely set the tls config. 
- ## - prometheus.io/path: If the metrics path is not /metrics, define it with this annotation. - ## - prometheus.io/port: If port is not 9102 use this annotation - # monitor_kubernetes_pods = true - - ## Use bearer token for authorization -# bearer_token = "/var/run/secrets/kubernetes.io/serviceaccount/token" - - ## Specify timeout duration for slower prometheus clients (default is 3s) -# response_timeout = "15s" - - ## Optional TLS Config -# tls_ca = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" - # tls_cert = /path/to/certfile - # tls_key = /path/to/keyfile - ## Use TLS but skip chain & host verification -# insecure_skip_verify = true - # Read stats about given file(s) -[[inputs.kube_inventory]] - ## URL for the Kubernetes API - #url = "https://127.0.0.1" - url = "$K8SSERVICEHOST" - - ## Namespace to use - namespace = "" - - ## Use bearer token for authorization. ('bearer_token' takes priority) - bearer_token = "/var/run/secrets/kubernetes.io/serviceaccount/token" - ## OR - # bearer_token_string = "abc_123" - - ## Set response_timeout (default 5 seconds) - response_timeout = "15s" - - ## Optional Resources to exclude from gathering - ## Leave them with blank with try to gather everything available. - ## Values can be - "daemonsets", deployments", "nodes", "persistentvolumes", - ## "persistentvolumeclaims", "pods", "statefulsets" - # resource_exclude = [ "deployments", "nodes", "statefulsets" ] - - ## Optional Resources to include when gathering - ## Overrides resource_exclude if both set. 
- # resource_include = [ "deployments", "nodes", "statefulsets" ] - - ## Optional TLS Config - tls_ca = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" - # tls_cert = "/path/to/certfile" - # tls_key = "/path/to/keyfile" - ## Use TLS but skip chain & host verification - insecure_skip_verify = true - - namepass = ["kubernetes_daemonset", "kubernetes_deployment", "kubernetes_node", "kubernetes_pod_container", "kubernetes_statefulset"] - fieldpass = ["current_number_scheduled", "desired_number_scheduled", "number_available", "number_unavailable", "number_ready", "replicas_available", "replicas_unavailable", "capacity_cpu_cores", "capacity_memory_bytes", "capacity_pods", "allocatable_pods", "allocatable_cpu_cores", "allocatable_memory_bytes", "restarts_total","resource_requests_cpu_units", "resource_requests_memory_bytes", "resource_limits_cpu_units", "resource_limits_memory_bytes" , "spec_replicas", "replicas_current", "replicas_ready"] - taginclude = ["nodeName", "daemonset_name", "namespace", "deployment_name", "container_name", "namespace", "node_name","statefulset_name"] -[[inputs.exec]] - ## Commands array - interval = "15m" - commands = [ - "/opt/microsoft/docker-cimprov/bin/Telegraf403Telemetry.sh" - ] - - ## Timeout for each command to complete. - timeout = "15s" - - ## measurement name suffix (for separating different commands) - name_suffix = "_telemetry" - - ## Data format to consume. - ## Each data format has its own unique set of configuration options, read - ## more about them here: - ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md - data_format = "influx" \ No newline at end of file diff --git a/installer/conf/telegraf.conf b/installer/conf/telegraf.conf index dd86a50a9..70f74093c 100644 --- a/installer/conf/telegraf.conf +++ b/installer/conf/telegraf.conf @@ -16,17 +16,13 @@ # Global tags can be specified here in key="value" format. 
[global_tags] - # dc = "us-east-1" # will tag all metrics with dc=us-east-1 - # rack = "1a" - ## Environment variables can be used as tags, and throughout the config file - # user = "$USER" - # cluster = "$ACS_RESOURCE_NAME" - #node = $NODE_IP + #Below are entirely used for telemetry AgentVersion = "$AGENT_VERSION" - AKS_RESOURCE_ID = "$AKS_RESOURCE_ID" - Region = "$AKS_REGION" - ClusterName = "$AKS_CLUSTER_NAME" - ClusterType = "AKS" + AKS_RESOURCE_ID = "$TELEMETRY_AKS_RESOURCE_ID" + ACS_RESOURCE_NAME = "$TELEMETRY_ACS_RESOURCE_NAME" + Region = "$TELEMETRY_AKS_REGION" + ClusterName = "$TELEMETRY_CLUSTER_NAME" + ClusterType = "$TELEMETRY_CLUSTER_TYPE" Computer = "placeholder_hostname" ControllerType = "$CONTROLLER_TYPE" @@ -60,7 +56,7 @@ ## Default flushing interval for all outputs. You shouldn't set this below ## interval. Maximum flush_interval will be flush_interval + flush_jitter - flush_interval = "10s" + flush_interval = "60s" ## Jitter the flush interval by a random amount. This is primarily to avoid ## large write spikes for users running a large number of telegraf instances. ## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s @@ -77,9 +73,9 @@ ## Logging configuration: ## Run telegraf with debug log messages. - debug = true + debug = false ## Run telegraf in quiet mode (error log messages only). - quiet = false + quiet = true ## Specify the log file name. The empty string means to log to stderr. logfile = "/var/opt/microsoft/docker-cimprov/log/telegraf.log" @@ -93,41 +89,6 @@ # OUTPUT PLUGINS # ############################################################################### -# Send aggregate metrics to Azure Monitor -#[[outputs.azure_monitor]] - ## Timeout for HTTP writes. - # timeout = "20s" - - ## Set the namespace prefix, defaults to "Telegraf/". -# namespace_prefix = "Insights.Container/" - - ## Azure Monitor doesn't have a string value type, so convert string - ## fields to dimensions (a.k.a. tags) if enabled. 
Azure Monitor allows - ## a maximum of 10 dimensions so Telegraf will only send the first 10 - ## alphanumeric dimensions. -# strings_as_dimensions = true - - ## Both region and resource_id must be set or be available via the - ## Instance Metadata service on Azure Virtual Machines. - # - ## Azure Region to publish metrics against. - ## ex: region = "southcentralus" -# region = "placeholder_region" - # - ## The Azure Resource ID against which metric will be logged, e.g. - #resource_id = "/subscriptions//resourceGroups//providers/Microsoft.Compute/virtualMachines/" -# resource_id = "placeholder_resource_id" - - #azure_tenant_id = "placeholder_azure_tenant_id" - - #azure_client_id = "placeholder_azure_client_id" - - #azure_client_secret = "placeholder_azure_client_secret" - - #namepass = ["nodes", "pods", "containers","prometheus"] -# namedrop = ["filestat", "telegraf_telemetry"] -# tagdrop = ["AgentVersion","AKS_RESOURCE_ID","Region","ClusterName","ClusterType", "Computer", "ControllerType"] - # Generic socket writer capable of handling multiple socket types. [[outputs.socket_writer]] ## URL to connect to @@ -160,12 +121,12 @@ ## more about them here: ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md data_format = "json" - namedrop = ["filestat", "telegraf_telemetry"] - tagdrop = ["AgentVersion","AKS_RESOURCE_ID","Region","ClusterName","ClusterType", "Computer", "ControllerType"] + namedrop = ["telegraf_telemetry"] + tagdrop = ["AgentVersion","AKS_RESOURCE_ID", "ACS_RESOURCE_NAME", "Region","ClusterName","ClusterType", "Computer", "ControllerType"] [[outputs.application_insights]] ## Instrumentation key of the Application Insights resource. - instrumentation_key = "$APPLICATIONINSIGHTS_KEY" + instrumentation_key = "$TELEMETRY_APPLICATIONINSIGHTS_KEY" ## Timeout for closing (default: 5s). 
# timeout = "5s" @@ -187,125 +148,6 @@ # PROCESSOR PLUGINS # ############################################################################### -# # Convert values to another metric value type -# [[processors.converter]] -# ## Tags to convert -# ## -# ## The table key determines the target type, and the array of key-values -# ## select the keys to convert. The array may contain globs. -# ## = [...] -# [processors.converter.tags] -# string = ["device"] -# integer = [] -# unsigned = [] -# boolean = [] -# float = [] -# -# ## Fields to convert -# ## -# ## The table key determines the target type, and the array of key-values -# ## select the keys to convert. The array may contain globs. -# ## = [...] -# [processors.converter.fields] -# tag = ["host"] -# string = [] -# integer = [] -# unsigned = [] -# boolean = [] -# float = [] - - -# # Map enum values according to given table. -# [[processors.enum]] -# [[processors.enum.mapping]] -# ## Name of the field to map -# field = "status" -# -# ## Destination field to be used for the mapped value. By default the source -# ## field is used, overwriting the original value. -# # dest = "status_code" -# -# ## Default value to be used for all values not contained in the mapping -# ## table. When unset, the unmodified value for the field will be used if no -# ## match is found. -# # default = 0 -# -# ## Table of mappings -# [processors.enum.mapping.value_mappings] -# green = 1 -# yellow = 2 -# red = 3 - - -# # Apply metric modifications using override semantics. 
-# [[processors.override]] -# ## All modifications on inputs and aggregators can be overridden: -# # name_override = "new_name" -# # name_prefix = "new_name_prefix" -# # name_suffix = "new_name_suffix" -# -# ## Tags to be added (all values must be strings) -# # [processors.override.tags] -# # additional_tag = "tag_value" - - -# # Parse a value in a specified field/tag(s) and add the result in a new metric -# [[processors.parser]] -# ## The name of the fields whose value will be parsed. -# parse_fields = [] -# -# ## If true, incoming metrics are not emitted. -# drop_original = false -# -# ## If set to override, emitted metrics will be merged by overriding the -# ## original metric using the newly parsed metrics. -# merge = "override" -# -# ## The dataformat to be read from files -# ## Each data format has its own unique set of configuration options, read -# ## more about them here: -# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md -# data_format = "influx" - - -# # Print all metrics that pass through this filter. 
-# [[processors.printer]] - - -# # Transforms tag and field values with regex pattern -# [[processors.regex]] -# ## Tag and field conversions defined in a separate sub-tables -# # [[processors.regex.tags]] -# # ## Tag to change -# # key = "resp_code" -# # ## Regular expression to match on a tag value -# # pattern = "^(\\d)\\d\\d$" -# # ## Pattern for constructing a new value (${1} represents first subgroup) -# # replacement = "${1}xx" -# -# # [[processors.regex.fields]] -# # key = "request" -# # ## All the power of the Go regular expressions available here -# # ## For example, named subgroups -# # pattern = "^/api(?P/[\\w/]+)\\S*" -# # replacement = "${method}" -# # ## If result_key is present, a new field will be created -# # ## instead of changing existing field -# # result_key = "method" -# -# ## Multiple conversions may be applied for one field sequentially -# ## Let's extract one more value -# # [[processors.regex.fields]] -# # key = "request" -# # pattern = ".*category=(\\w+).*" -# # replacement = "${1}" -# # result_key = "search_category" - - -# # Rename measurements, tags, and fields that pass through this filter. 
-# [[processors.rename]] - - # # Perform string processing on tags, fields, and measurements [[processors.rename]] [[processors.rename.replace]] @@ -320,66 +162,66 @@ [[processors.rename.replace]] field = "used_percent" dest = "diskUsedPercentage" - [[processors.rename.replace]] - measurement = "net" - dest = "nodes" - [[processors.rename.replace]] - field = "bytes_recv" - dest = "networkBytesReceivedTotal" - [[processors.rename.replace]] - field = "bytes_sent" - dest = "networkBytesSentTotal" - [[processors.rename.replace]] - field = "err_in" - dest = "networkErrorsInTotal" - [[processors.rename.replace]] - field = "err_out" - dest = "networkErrorsOutTotal" - [[processors.rename.replace]] - measurement = "kubernetes_pod_volume" - dest = "pods" - [[processors.rename.replace]] - field = "used_bytes" - dest = "podVolumeUsedBytes" - [[processors.rename.replace]] - field = "available_bytes" - dest = "podVolumeAvailableBytes" - [[processors.rename.replace]] - measurement = "kubernetes_pod_network" - dest = "pods" - [[processors.rename.replace]] - field = "tx_errors" - dest = "podNetworkTxErrorsTotal" - [[processors.rename.replace]] - field = "rx_errors" - dest = "podNetworkRxErrorsTotal" - [[processors.rename.replace]] - tag = "volume_name" - dest = "volumeName" - [[processors.rename.replace]] - tag = "pod_name" - dest = "podName" - [[processors.rename.replace]] - measurement = "docker" - dest = "containers" - [[processors.rename.replace]] - measurement = "docker_container_status" - dest = "containers" - [[processors.rename.replace]] - field = "n_containers" - dest = "numContainers" - [[processors.rename.replace]] - field = "n_containers_running" - dest = "numContainersRunning" - [[processors.rename.replace]] - field = "n_containers_stopped" - dest = "numContainersStopped" - [[processors.rename.replace]] - field = "n_containers_paused" - dest = "numContainersPaused" - [[processors.rename.replace]] - field = "n_images" - dest = "numContainerImages" + 
#[[processors.rename.replace]] + # measurement = "net" + # dest = "nodes" + #[[processors.rename.replace]] + # field = "bytes_recv" + # dest = "networkBytesReceivedTotal" + #[[processors.rename.replace]] + # field = "bytes_sent" + # dest = "networkBytesSentTotal" + #[[processors.rename.replace]] + # field = "err_in" + # dest = "networkErrorsInTotal" + #[[processors.rename.replace]] + # field = "err_out" + # dest = "networkErrorsOutTotal" + #[[processors.rename.replace]] + # measurement = "kubernetes_pod_volume" + # dest = "pods" + #[[processors.rename.replace]] + # field = "used_bytes" + # dest = "podVolumeUsedBytes" + #[[processors.rename.replace]] + # field = "available_bytes" + # dest = "podVolumeAvailableBytes" + #[[processors.rename.replace]] + # measurement = "kubernetes_pod_network" + # dest = "pods" + #[[processors.rename.replace]] + # field = "tx_errors" + # dest = "podNetworkTxErrorsTotal" + #[[processors.rename.replace]] + # field = "rx_errors" + # dest = "podNetworkRxErrorsTotal" + #[[processors.rename.replace]] + # tag = "volume_name" + # dest = "volumeName" + #[[processors.rename.replace]] + # tag = "pod_name" + # dest = "podName" + #[[processors.rename.replace]] + # measurement = "docker" + # dest = "containers" + #[[processors.rename.replace]] + # measurement = "docker_container_status" + # dest = "containers" + #[[processors.rename.replace]] + # field = "n_containers" + # dest = "numContainers" + #[[processors.rename.replace]] + # field = "n_containers_running" + # dest = "numContainersRunning" + #[[processors.rename.replace]] + # field = "n_containers_stopped" + # dest = "numContainersStopped" + #[[processors.rename.replace]] + # field = "n_containers_paused" + # dest = "numContainersPaused" + #[[processors.rename.replace]] + # field = "n_images" + # dest = "numContainerImages" # ## Convert a tag value to uppercase # # [[processors.strings.uppercase]] @@ -567,7 +409,7 @@ # Read metrics about network interface usage -[[inputs.net]] +#[[inputs.net]] 
## By default, telegraf gathers stats from any up interface (excluding loopback) ## Setting interfaces will tell it to gather these explicit interfaces, ## regardless of status. @@ -577,17 +419,17 @@ ## On linux systems telegraf also collects protocol stats. ## Setting ignore_protocol_stats to true will skip reporting of protocol metrics. ## - ignore_protocol_stats = true +# ignore_protocol_stats = true ## #fieldpass = ["bytes_recv", "bytes_sent", "err_in", "err_out"] - fieldpass = ["err_in", "err_out"] - taginclude = ["interface","nodeName"] + #fieldpass = ["err_in", "err_out"] + #taginclude = ["interface","nodeName"] # Read metrics from the kubernetes kubelet api -[[inputs.kubernetes]] +#[[inputs.kubernetes]] ## URL for the kubelet #url = "http://1.1.1.1:10255" - url = "http://placeholder_nodeip:10255" +# url = "http://placeholder_nodeip:10255" ## Use bearer token for authorization # bearer_token = /path/to/bearer/token @@ -601,46 +443,46 @@ # tls_key = /path/to/keyfile ## Use TLS but skip chain & host verification # insecure_skip_verify = false - fieldpass = ["used_bytes", "available_bytes", "tx_errors", "rx_errors" ] - taginclude = ["volume_name","nodeName","namespace","pod_name"] +# fieldpass = ["used_bytes", "available_bytes", "tx_errors", "rx_errors" ] +# taginclude = ["volume_name","nodeName","namespace","pod_name"] # Read metrics about docker containers -[[inputs.docker]] +#[[inputs.docker]] ## Docker Endpoint ## To use TCP, set endpoint = "tcp://[ip]:[port]" ## To use environment variables (ie, docker-machine), set endpoint = "ENV" - endpoint = "unix:///var/run/host/docker.sock" +# endpoint = "unix:///var/run/host/docker.sock" ## Set to true to collect Swarm metrics(desired_replicas, running_replicas) - gather_services = false +# gather_services = false ## Only collect metrics for these containers, collect all if empty - container_names = [] +# container_names = [] ## Containers to include and exclude. Globs accepted. 
## Note that an empty array for both will include all containers - container_name_include = [] - container_name_exclude = [] +# container_name_include = [] +# container_name_exclude = [] ## Container states to include and exclude. Globs accepted. ## When empty only containers in the "running" state will be captured. - container_state_include = ['*'] +# container_state_include = ['*'] # container_state_exclude = [] ## Timeout for docker list, info, and stats commands - timeout = "5s" +# timeout = "5s" ## Whether to report for each container per-device blkio (8:0, 8:1...) and ## network (eth0, eth1, ...) stats or not - perdevice = true +# perdevice = true ## Whether to report for each container total blkio and network stats or not - total = true +# total = true ## Which environment variables should we use as a tag ##tag_env = ["JAVA_HOME", "HEAP_SIZE"] ## docker labels to include and exclude as tags. Globs accepted. ## Note that an empty array for both will include all labels as tags - docker_label_include = [] - docker_label_exclude = [] +# docker_label_include = [] +# docker_label_exclude = [] ## Optional TLS Config # tls_ca = "/etc/telegraf/ca.pem" @@ -648,14 +490,14 @@ # tls_key = "/etc/telegraf/key.pem" ## Use TLS but skip chain & host verification # insecure_skip_verify = false - fieldpass = ["n_containers", "n_containers_running", "n_containers_stopped", "n_containers_paused", "n_images"] +# fieldpass = ["n_containers", "n_containers_running", "n_containers_stopped", "n_containers_paused", "n_images"] #fieldpass = ["numContainers", "numContainersRunning", "numContainersStopped", "numContainersPaused", "numContainerImages"] - taginclude = ["nodeName"] +# taginclude = ["nodeName"] [[inputs.exec]] ## Commands array interval = "15m" commands = [ - "/opt/microsoft/docker-cimprov/bin/Telegraf403Telemetry.sh" + "/opt/microsoft/docker-cimprov/bin/TelegrafTCPErrorTelemetry.sh" ] ## Timeout for each command to complete. 
diff --git a/installer/datafiles/base_container.data b/installer/datafiles/base_container.data index e6e6401d2..b11a0f2e4 100644 --- a/installer/datafiles/base_container.data +++ b/installer/datafiles/base_container.data @@ -94,9 +94,8 @@ MAINTAINER: 'Microsoft Corporation' /etc/opt/microsoft/docker-cimprov/out_oms.conf; installer/conf/out_oms.conf; 644; root; root /etc/opt/microsoft/docker-cimprov/out_oms-rs.conf; installer/conf/out_oms-rs.conf; 644; root; root /etc/opt/microsoft/docker-cimprov/telegraf.conf; installer/conf/telegraf.conf; 644; root; root -/etc/opt/microsoft/docker-cimprov/telegraf-rs.conf; installer/conf/telegraf-rs.conf; 644; root; root /etc/opt/microsoft/docker-cimprov/custom_metrics_regions.conf; installer/conf/custom_metrics_regions.conf; 644; root; root -/opt/microsoft/docker-cimprov/bin/Telegraf403Telemetry.sh; installer/scripts/Telegraf403Telemetry.sh; 755; root; root +/opt/microsoft/docker-cimprov/bin/TelegrafTCPErrorTelemetry.sh; installer/scripts/TelegrafTCPErrorTelemetry.sh; 755; root; root %Links /opt/omi/lib/libcontainer.${{SHLIB_EXT}}; /opt/microsoft/docker-cimprov/lib/libcontainer.${{SHLIB_EXT}}; 644; root; root diff --git a/installer/scripts/Telegraf403Telemetry.sh b/installer/scripts/Telegraf403Telemetry.sh deleted file mode 100644 index c64369798..000000000 --- a/installer/scripts/Telegraf403Telemetry.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/sh -countErr=$(grep -iF "Error writing to output [socket_writer]" /var/opt/microsoft/docker-cimprov/log/telegraf.log | wc -l | tr -d '\n') -echo "telegraf,AKS_RESOURCE_ID=${AKS_RESOURCE_ID} telegrafTCPWriteErrorCountTotal=${countErr}i" \ No newline at end of file diff --git a/installer/scripts/TelegrafTCPErrorTelemetry.sh b/installer/scripts/TelegrafTCPErrorTelemetry.sh new file mode 100644 index 000000000..ee8bf74a1 --- /dev/null +++ b/installer/scripts/TelegrafTCPErrorTelemetry.sh @@ -0,0 +1,3 @@ +#!/bin/sh +countErr=$(grep -iF "[socket_writer]" 
/var/opt/microsoft/docker-cimprov/log/telegraf.log | wc -l | tr -d '\n') +echo "telegraf,AKS_RESOURCE_ID=${AKS_RESOURCE_ID} telegrafTCPWriteErrorCountTotal=${countErr}i" \ No newline at end of file From 574077850e5178b738d616699052bc60d3161e03 Mon Sep 17 00:00:00 2001 From: Vishwanath Narasimhan Date: Thu, 21 Mar 2019 16:54:38 -0700 Subject: [PATCH 36/38] remove custom metric regions as we dont need anymore --- installer/conf/custom_metrics_regions.conf | 7 ------- installer/conf/td-agent-bit.conf | 2 +- installer/datafiles/base_container.data | 1 - 3 files changed, 1 insertion(+), 9 deletions(-) delete mode 100644 installer/conf/custom_metrics_regions.conf diff --git a/installer/conf/custom_metrics_regions.conf b/installer/conf/custom_metrics_regions.conf deleted file mode 100644 index bf548abdd..000000000 --- a/installer/conf/custom_metrics_regions.conf +++ /dev/null @@ -1,7 +0,0 @@ -eastus -southcentralus -westcentralus -westus2 -southeastasia -northeurope -westeurope \ No newline at end of file diff --git a/installer/conf/td-agent-bit.conf b/installer/conf/td-agent-bit.conf index 8f2e5b5cd..2d87fe136 100644 --- a/installer/conf/td-agent-bit.conf +++ b/installer/conf/td-agent-bit.conf @@ -46,7 +46,7 @@ [FILTER] Name grep Match oms.container.log.telegraf.err.* - Regex log /^(?:(?!\[azure_monitor\]: failed to write batch: \[403\] 403 Forbidden).)*$/ + #Regex log /^(?:(?!\[azure_monitor\]: failed to write batch: \[403\] 403 Forbidden).)*$/ [OUTPUT] Name oms diff --git a/installer/datafiles/base_container.data b/installer/datafiles/base_container.data index b11a0f2e4..89d63047a 100644 --- a/installer/datafiles/base_container.data +++ b/installer/datafiles/base_container.data @@ -94,7 +94,6 @@ MAINTAINER: 'Microsoft Corporation' /etc/opt/microsoft/docker-cimprov/out_oms.conf; installer/conf/out_oms.conf; 644; root; root /etc/opt/microsoft/docker-cimprov/out_oms-rs.conf; installer/conf/out_oms-rs.conf; 644; root; root /etc/opt/microsoft/docker-cimprov/telegraf.conf; 
installer/conf/telegraf.conf; 644; root; root -/etc/opt/microsoft/docker-cimprov/custom_metrics_regions.conf; installer/conf/custom_metrics_regions.conf; 644; root; root /opt/microsoft/docker-cimprov/bin/TelegrafTCPErrorTelemetry.sh; installer/scripts/TelegrafTCPErrorTelemetry.sh; 755; root; root %Links From 9d534d0dfdc18c2a3917c22741ee90b8729e629a Mon Sep 17 00:00:00 2001 From: Vishwanath Narasimhan Date: Thu, 21 Mar 2019 16:58:20 -0700 Subject: [PATCH 37/38] remove un-needed files --- installer/conf/td-agent-bit-rs.conf | 26 ------------------------- installer/datafiles/base_container.data | 1 - 2 files changed, 27 deletions(-) delete mode 100644 installer/conf/td-agent-bit-rs.conf diff --git a/installer/conf/td-agent-bit-rs.conf b/installer/conf/td-agent-bit-rs.conf deleted file mode 100644 index 7993e7528..000000000 --- a/installer/conf/td-agent-bit-rs.conf +++ /dev/null @@ -1,26 +0,0 @@ -[SERVICE] - Flush 30 - Log_Level info - Parsers_File /etc/td-agent-bit/parsers.conf - Log_File /var/opt/microsoft/docker-cimprov/log/fluent-bit.log - -[INPUT] - Name tail - Tag oms.container.log.telegraf.err.* - Path /var/opt/microsoft/docker-cimprov/log/telegraf.log - DB /var/opt/microsoft/docker-cimprov/state/telegraf-log-state.db - Mem_Buf_Limit 2m - Path_Key filepath - Skip_Long_Lines On - Ignore_Older 1h - -[FILTER] - Name grep - Match oms.container.log.telegraf.err.* - Regex log /^(?:(?!\[azure_monitor\]: failed to write batch: \[403\] 403 Forbidden).)*$/ - -[OUTPUT] - Name oms - EnableTelemetry false - TelemetryPushIntervalSeconds 300 - Match oms.container.log.* \ No newline at end of file diff --git a/installer/datafiles/base_container.data b/installer/datafiles/base_container.data index 6e0cdde22..4ee32e580 100644 --- a/installer/datafiles/base_container.data +++ b/installer/datafiles/base_container.data @@ -96,7 +96,6 @@ MAINTAINER: 'Microsoft Corporation' /opt/td-agent-bit/bin/out_oms.so; intermediate/${{BUILD_CONFIGURATION}}/out_oms.so; 755; root; root 
/etc/opt/microsoft/docker-cimprov/td-agent-bit.conf; installer/conf/td-agent-bit.conf; 644; root; root -/etc/opt/microsoft/docker-cimprov/td-agent-bit-rs.conf; installer/conf/td-agent-bit-rs.conf; 644; root; root /etc/opt/microsoft/docker-cimprov/out_oms.conf; installer/conf/out_oms.conf; 644; root; root /etc/opt/microsoft/docker-cimprov/out_oms-rs.conf; installer/conf/out_oms-rs.conf; 644; root; root /etc/opt/microsoft/docker-cimprov/telegraf.conf; installer/conf/telegraf.conf; 644; root; root From bfb7331adf764ceeb054d8113bcf85699f2a8c52 Mon Sep 17 00:00:00 2001 From: Vishwanath Narasimhan Date: Thu, 21 Mar 2019 16:59:31 -0700 Subject: [PATCH 38/38] fixes --- installer/conf/out_oms-rs.conf | 6 ------ installer/datafiles/base_container.data | 1 - 2 files changed, 7 deletions(-) delete mode 100644 installer/conf/out_oms-rs.conf diff --git a/installer/conf/out_oms-rs.conf b/installer/conf/out_oms-rs.conf deleted file mode 100644 index e3a32a526..000000000 --- a/installer/conf/out_oms-rs.conf +++ /dev/null @@ -1,6 +0,0 @@ -omsadmin_conf_path=/etc/opt/microsoft/omsagent/conf/omsadmin.conf -cert_file_path=/etc/opt/microsoft/omsagent/certs/oms.crt -key_file_path=/etc/opt/microsoft/omsagent/certs/oms.key -container_host_file_path=/var/opt/microsoft/docker-cimprov/state/containerhostname -container_inventory_refresh_interval=86400 -kube_system_containers_refresh_interval=86400 diff --git a/installer/datafiles/base_container.data b/installer/datafiles/base_container.data index 4ee32e580..2d6fd7b01 100644 --- a/installer/datafiles/base_container.data +++ b/installer/datafiles/base_container.data @@ -97,7 +97,6 @@ MAINTAINER: 'Microsoft Corporation' /opt/td-agent-bit/bin/out_oms.so; intermediate/${{BUILD_CONFIGURATION}}/out_oms.so; 755; root; root /etc/opt/microsoft/docker-cimprov/td-agent-bit.conf; installer/conf/td-agent-bit.conf; 644; root; root /etc/opt/microsoft/docker-cimprov/out_oms.conf; installer/conf/out_oms.conf; 644; root; root 
-/etc/opt/microsoft/docker-cimprov/out_oms-rs.conf; installer/conf/out_oms-rs.conf; 644; root; root /etc/opt/microsoft/docker-cimprov/telegraf.conf; installer/conf/telegraf.conf; 644; root; root /opt/microsoft/docker-cimprov/bin/TelegrafTCPErrorTelemetry.sh; installer/scripts/TelegrafTCPErrorTelemetry.sh; 755; root; root