diff --git a/installer/conf/td-agent-bit-rs.conf b/installer/conf/td-agent-bit-rs.conf new file mode 100644 index 000000000..740f8a951 --- /dev/null +++ b/installer/conf/td-agent-bit-rs.conf @@ -0,0 +1,29 @@ +[SERVICE] + Flush 30 + Log_Level info + Parsers_File /etc/td-agent-bit/parsers.conf + Log_File /var/opt/microsoft/docker-cimprov/log/fluent-bit.log + +[INPUT] + Name tail + Tag oms.container.log.telegraf.err.* + Path /var/opt/microsoft/docker-cimprov/log/telegraf.log + DB /var/opt/microsoft/docker-cimprov/state/telegraf-log-state.db + Mem_Buf_Limit 2m + Path_Key filepath + Skip_Long_Lines On + Ignore_Older 5m + +[INPUT] + Name tcp + Tag oms.container.perf.telegraf.* + Listen 0.0.0.0 + Port 25226 + Chunk_Size 32 + Buffer_Size 64 + +[OUTPUT] + Name oms + EnableTelemetry true + TelemetryPushIntervalSeconds 300 + Match oms.container.* diff --git a/installer/conf/td-agent-bit.conf b/installer/conf/td-agent-bit.conf index 88bacaca2..50967e61f 100644 --- a/installer/conf/td-agent-bit.conf +++ b/installer/conf/td-agent-bit.conf @@ -43,11 +43,6 @@ Chunk_Size 32 Buffer_Size 64 -[FILTER] - Name grep - Match oms.container.log.telegraf.err.* - #Regex log /^(?:(?!\[azure_monitor\]: failed to write batch: \[403\] 403 Forbidden).)*$/ - [OUTPUT] Name oms EnableTelemetry true diff --git a/installer/conf/telegraf-rs.conf b/installer/conf/telegraf-rs.conf new file mode 100644 index 000000000..cb9a36685 --- /dev/null +++ b/installer/conf/telegraf-rs.conf @@ -0,0 +1,567 @@ +# Telegraf Configuration +# +# Telegraf is entirely plugin driven. All metrics are gathered from the +# declared inputs, and sent to the declared outputs. +# +# Plugins must be declared in here to be active. +# To deactivate a plugin, comment out the name and any variables. +# +# Use 'telegraf -config telegraf.conf -test' to see what metrics a config +# file would generate. +# +# Environment variables can be used anywhere in this config file, simply prepend +# them with $. For strings the variable must be within quotes (ie, "$STR_VAR"), +# for numbers and booleans they should be plain (ie, $INT_VAR, $BOOL_VAR) + + +# Global tags can be specified here in key="value" format. +[global_tags] + #Below are entirely used for telemetry + #AgentVersion = "$AGENT_VERSION" + #AKS_RESOURCE_ID = "$TELEMETRY_AKS_RESOURCE_ID" + #ACS_RESOURCE_NAME = "$TELEMETRY_ACS_RESOURCE_NAME" + #Region = "$TELEMETRY_AKS_REGION" + #ClusterName = "$TELEMETRY_CLUSTER_NAME" + #ClusterType = "$TELEMETRY_CLUSTER_TYPE" + #Computer = "placeholder_hostname" + #ControllerType = "$CONTROLLER_TYPE" + + #hostName = "placeholder_hostname" + + +# Configuration for telegraf agent +[agent] + ## Default data collection interval for all inputs + interval = "60s" + ## Rounds collection interval to 'interval' + ## ie, if interval="10s" then always collect on :00, :10, :20, etc. + round_interval = true + + ## Telegraf will send metrics to outputs in batches of at most + ## metric_batch_size metrics. + ## This controls the size of writes that Telegraf sends to output plugins. + metric_batch_size = 1000 + + ## For failed writes, telegraf will cache metric_buffer_limit metrics for each + ## output, and will flush this buffer on a successful write. Oldest metrics + ## are dropped first when this buffer fills. + ## This buffer only fills when writes fail to output plugin(s). + metric_buffer_limit = 10000 + + ## Collection jitter is used to jitter the collection by a random amount. + ## Each plugin will sleep for a random time within jitter before collecting. + ## This can be used to avoid many plugins querying things like sysfs at the + ## same time, which can have a measurable effect on the system. + collection_jitter = "0s" + + ## Default flushing interval for all outputs. You shouldn't set this below + ## interval. Maximum flush_interval will be flush_interval + flush_jitter + flush_interval = "60s" + ## Jitter the flush interval by a random amount. This is primarily to avoid + ## large write spikes for users running a large number of telegraf instances. + ## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s + flush_jitter = "0s" + + ## By default or when set to "0s", precision will be set to the same + ## timestamp order as the collection interval, with the maximum being 1s. + ## ie, when interval = "10s", precision will be "1s" + ## when interval = "250ms", precision will be "1ms" + ## Precision will NOT be used for service inputs. It is up to each individual + ## service input to set the timestamp at the appropriate precision. + ## Valid time units are "ns", "us" (or "µs"), "ms", "s". + precision = "" + + ## Logging configuration: + ## Run telegraf with debug log messages. + debug = false + ## Run telegraf in quiet mode (error log messages only). + quiet = true + ## Specify the log file name. The empty string means to log to stderr. + logfile = "/var/opt/microsoft/docker-cimprov/log/telegraf.log" + + ## Override default hostname, if empty use os.Hostname() + #hostname = "placeholder_hostname" + ## If set to true, do no set the "host" tag in the telegraf agent. + omit_hostname = true + + +############################################################################### +# OUTPUT PLUGINS # +############################################################################### + +# Generic socket writer capable of handling multiple socket types. +[[outputs.socket_writer]] + ## URL to connect to + address = "tcp://0.0.0.0:25226" + # address = "tcp://example.com:http" + # address = "tcp4://127.0.0.1:8094" + # address = "tcp6://127.0.0.1:8094" + # address = "tcp6://[2001:db8::1]:8094" + # address = "udp://127.0.0.1:8094" + # address = "udp4://127.0.0.1:8094" + # address = "udp6://127.0.0.1:8094" + # address = "unix:///tmp/telegraf.sock" + # address = "unixgram:///tmp/telegraf.sock" + + ## Optional TLS Config + # tls_ca = "/etc/telegraf/ca.pem" + # tls_cert = "/etc/telegraf/cert.pem" + # tls_key = "/etc/telegraf/key.pem" + ## Use TLS but skip chain & host verification + # insecure_skip_verify = false + + ## Period between keep alive probes. + ## Only applies to TCP sockets. + ## 0 disables keep alive probes. + ## Defaults to the OS configuration. + # keep_alive_period = "5m" + + ## Data format to generate. + ## Each data format has its own unique set of configuration options, read + ## more about them here: + ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md + data_format = "json" + namedrop = ["telegraf_telemetry"] + #tagdrop = ["AgentVersion","AKS_RESOURCE_ID", "ACS_RESOURCE_NAME", "Region","ClusterName","ClusterType", "Computer", "ControllerType"] + +[[outputs.application_insights]] + ## Instrumentation key of the Application Insights resource. + instrumentation_key = "$TELEMETRY_APPLICATIONINSIGHTS_KEY" + + ## Timeout for closing (default: 5s). + # timeout = "5s" + + ## Enable additional diagnostic logging. + # enable_diagnostic_logging = false + + ## Context Tag Sources add Application Insights context tags to a tag value. + ## + ## For list of allowed context tag keys see: + ## https://github.com/Microsoft/ApplicationInsights-Go/blob/master/appinsights/contracts/contexttagkeys.go + # [outputs.application_insights.context_tag_sources] + # "ai.cloud.role" = "kubernetes_container_name" + # "ai.cloud.roleInstance" = "kubernetes_pod_name" + namepass = ["telegraf_telemetry"] + #tagdrop = ["nodeName"] + +############################################################################### +# PROCESSOR PLUGINS # +############################################################################### + +# # Perform string processing on tags, fields, and measurements +#[[processors.rename]] + #[[processors.rename.replace]] + # measurement = "disk" + # dest = "nodes" +# [[processors.rename.replace]] +# field = "free" +# dest = "freeBytes" +# [[processors.rename.replace]] +# field = "used" +# dest = "usedBytes" +# [[processors.rename.replace]] +# field = "used_percent" +# dest = "usedPercentage" + #[[processors.rename.replace]] + # measurement = "net" + # dest = "nodes" + #[[processors.rename.replace]] + # field = "bytes_recv" + # dest = "networkBytesReceivedTotal" + #[[processors.rename.replace]] + # field = "bytes_sent" + # dest = "networkBytesSentTotal" + #[[processors.rename.replace]] + # field = "err_in" + # dest = "networkErrorsInTotal" + #[[processors.rename.replace]] + # field = "err_out" + # dest = "networkErrorsOutTotal" + #[[processors.rename.replace]] + # measurement = "kubernetes_pod_volume" + # dest = "pods" + #[[processors.rename.replace]] + # field = "used_bytes" + # dest = "podVolumeUsedBytes" + #[[processors.rename.replace]] + # field = "available_bytes" + # dest = "podVolumeAvailableBytes" + #[[processors.rename.replace]] + # measurement = "kubernetes_pod_network" + # dest = "pods" + #[[processors.rename.replace]] + # field = "tx_errors" + # dest = "podNetworkTxErrorsTotal" + #[[processors.rename.replace]] + # field = "rx_errors" + # dest = "podNetworkRxErrorsTotal" + #[[processors.rename.replace]] + # tag = "volume_name" + # dest = "volumeName" + #[[processors.rename.replace]] + # tag = "pod_name" + # dest = "podName" + #[[processors.rename.replace]] + # measurement = "docker" + # dest = "containers" + #[[processors.rename.replace]] + # measurement = "docker_container_status" + # dest = "containers" + #[[processors.rename.replace]] + # field = "n_containers" + # dest = "numContainers" + #[[processors.rename.replace]] + # field = "n_containers_running" + # dest = "numContainersRunning" + #[[processors.rename.replace]] + # field = "n_containers_stopped" + # dest = "numContainersStopped" + #[[processors.rename.replace]] + # field = "n_containers_paused" + # dest = "numContainersPaused" + #[[processors.rename.replace]] + # field = "n_images" + # dest = "numContainerImages" + +# ## Convert a tag value to uppercase +# # [[processors.strings.uppercase]] +# # tag = "method" +# +# ## Convert a field value to lowercase and store in a new field +# # [[processors.strings.lowercase]] +# # field = "uri_stem" +# # dest = "uri_stem_normalised" +# +# ## Trim leading and trailing whitespace using the default cutset +# # [[processors.strings.trim]] +# # field = "message" +# +# ## Trim leading characters in cutset +# # [[processors.strings.trim_left]] +# # field = "message" +# # cutset = "\t" +# +# ## Trim trailing characters in cutset +# # [[processors.strings.trim_right]] +# # field = "message" +# # cutset = "\r\n" +# +# ## Trim the given prefix from the field +# # [[processors.strings.trim_prefix]] +# # field = "my_value" +# # prefix = "my_" +# +# ## Trim the given suffix from the field +# # [[processors.strings.trim_suffix]] +# # field = "read_count" +# # suffix = "_count" + + +# # Print all metrics that pass through this filter. +# [[processors.topk]] +# ## How many seconds between aggregations +# # period = 10 +# +# ## How many top metrics to return +# # k = 10 +# +# ## Over which tags should the aggregation be done. Globs can be specified, in +# ## which case any tag matching the glob will aggregated over. If set to an +# ## empty list is no aggregation over tags is done +# # group_by = ['*'] +# +# ## Over which fields are the top k are calculated +# # fields = ["value"] +# +# ## What aggregation to use. Options: sum, mean, min, max +# # aggregation = "mean" +# +# ## Instead of the top k largest metrics, return the bottom k lowest metrics +# # bottomk = false +# +# ## The plugin assigns each metric a GroupBy tag generated from its name and +# ## tags. If this setting is different than "" the plugin will add a +# ## tag (which name will be the value of this setting) to each metric with +# ## the value of the calculated GroupBy tag. Useful for debugging +# # add_groupby_tag = "" +# +# ## These settings provide a way to know the position of each metric in +# ## the top k. The 'add_rank_field' setting allows to specify for which +# ## fields the position is required. If the list is non empty, then a field +# ## will be added to each and every metric for each string present in this +# ## setting. This field will contain the ranking of the group that +# ## the metric belonged to when aggregated over that field. +# ## The name of the field will be set to the name of the aggregation field, +# ## suffixed with the string '_topk_rank' +# # add_rank_fields = [] +# +# ## These settings provide a way to know what values the plugin is generating +# ## when aggregating metrics. The 'add_agregate_field' setting allows to +# ## specify for which fields the final aggregation value is required. If the +# ## list is non empty, then a field will be added to each every metric for +# ## each field present in this setting. This field will contain +# ## the computed aggregation for the group that the metric belonged to when +# ## aggregated over that field. +# ## The name of the field will be set to the name of the aggregation field, +# ## suffixed with the string '_topk_aggregate' +# # add_aggregate_fields = [] + + + +############################################################################### +# AGGREGATOR PLUGINS # +############################################################################### + +# # Keep the aggregate basicstats of each metric passing through. +# [[aggregators.basicstats]] +# ## General Aggregator Arguments: +# ## The period on which to flush & clear the aggregator. +# period = "30s" +# ## If true, the original metric will be dropped by the +# ## aggregator and will not get sent to the output plugins. +# drop_original = false + + +# # Create aggregate histograms. +# [[aggregators.histogram]] +# ## The period in which to flush the aggregator. +# period = "30s" +# +# ## If true, the original metric will be dropped by the +# ## aggregator and will not get sent to the output plugins. +# drop_original = false +# +# ## Example config that aggregates all fields of the metric. +# # [[aggregators.histogram.config]] +# # ## The set of buckets. +# # buckets = [0.0, 15.6, 34.5, 49.1, 71.5, 80.5, 94.5, 100.0] +# # ## The name of metric. +# # measurement_name = "cpu" +# +# ## Example config that aggregates only specific fields of the metric. +# # [[aggregators.histogram.config]] +# # ## The set of buckets. +# # buckets = [0.0, 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0] +# # ## The name of metric. +# # measurement_name = "diskio" +# # ## The concrete fields of metric +# # fields = ["io_time", "read_time", "write_time"] + + +# # Keep the aggregate min/max of each metric passing through. +# [[aggregators.minmax]] +# ## General Aggregator Arguments: +# ## The period on which to flush & clear the aggregator. +# period = "30s" +# ## If true, the original metric will be dropped by the +# ## aggregator and will not get sent to the output plugins. +# drop_original = false + + +# # Count the occurance of values in fields. +# [[aggregators.valuecounter]] +# ## General Aggregator Arguments: +# ## The period on which to flush & clear the aggregator. +# period = "30s" +# ## If true, the original metric will be dropped by the +# ## aggregator and will not get sent to the output plugins. +# drop_original = false +# ## The fields for which the values will be counted +# fields = [] + + + +############################################################################### +# INPUT PLUGINS # +############################################################################### + +# Read metrics about cpu usage +#[[inputs.cpu]] + ## Whether to report per-cpu stats or not +# percpu = false + ## Whether to report total system cpu stats or not +# totalcpu = true + ## If true, collect raw CPU time metrics. +# collect_cpu_time = false + ## If true, compute and report the sum of all non-idle CPU states. +# report_active = true +# fieldpass = ["usage_active","cluster","node","host","device"] +# taginclude = ["cluster","cpu","node"] + + + +# Read metrics about disk usage by mount point +#[[inputs.disk]] + ## By default stats will be gathered for all mount points. + ## Set mount_points will restrict the stats to only the specified mount points. + # mount_points = ["/"] + + ## Ignore mount points by filesystem type. +# ignore_fs = ["tmpfs", "devtmpfs", "devfs", "overlay", "aufs", "squashfs"] +# fieldpass = ["free", "used", "used_percent"] +# taginclude = ["device","path","hostName"] + # Below due to Bug - https://github.com/influxdata/telegraf/issues/5615 + # ORDER matters here!! - i.e the below should be the LAST modifier +# [inputs.disk.tagdrop] +# path = ["/var/lib/kubelet*", "/dev/termination-log", "/var/log", "/etc/hosts", "/etc/resolv.conf", "/etc/hostname", "/etc/kubernetes/host", "/var/lib/docker/containers"] + + +# Read metrics about memory usage +#[[inputs.mem]] +# fieldpass = ["used_percent", "cluster", "node","host","device"] +# taginclude = ["cluster","node"] + + +# Read metrics about network interface usage +#[[inputs.net]] + ## By default, telegraf gathers stats from any up interface (excluding loopback) + ## Setting interfaces will tell it to gather these explicit interfaces, + ## regardless of status. + ## + # interfaces = ["eth0"] + ## + ## On linux systems telegraf also collects protocol stats. + ## Setting ignore_protocol_stats to true will skip reporting of protocol metrics. + ## +# ignore_protocol_stats = true + ## + #fieldpass = ["bytes_recv", "bytes_sent", "err_in", "err_out"] + #fieldpass = ["err_in", "err_out"] + #taginclude = ["interface","nodeName"] + +# Read metrics from the kubernetes kubelet api +#[[inputs.kubernetes]] + ## URL for the kubelet + #url = "http://1.1.1.1:10255" +# url = "http://placeholder_nodeip:10255" + + ## Use bearer token for authorization + # bearer_token = /path/to/bearer/token + + ## Set response_timeout (default 5 seconds) + # response_timeout = "5s" + + ## Optional TLS Config + # tls_ca = /path/to/cafile + # tls_cert = /path/to/certfile + # tls_key = /path/to/keyfile + ## Use TLS but skip chain & host verification + # insecure_skip_verify = false +# fieldpass = ["used_bytes", "available_bytes", "tx_errors", "rx_errors" ] +# taginclude = ["volume_name","nodeName","namespace","pod_name"] +# Read metrics about docker containers +#[[inputs.docker]] + ## Docker Endpoint + ## To use TCP, set endpoint = "tcp://[ip]:[port]" + ## To use environment variables (ie, docker-machine), set endpoint = "ENV" +# endpoint = "unix:///var/run/host/docker.sock" + + ## Set to true to collect Swarm metrics(desired_replicas, running_replicas) +# gather_services = false + + ## Only collect metrics for these containers, collect all if empty +# container_names = [] + + ## Containers to include and exclude. Globs accepted. + ## Note that an empty array for both will include all containers +# container_name_include = [] +# container_name_exclude = [] + + ## Container states to include and exclude. Globs accepted. + ## When empty only containers in the "running" state will be captured. +# container_state_include = ['*'] + # container_state_exclude = [] + + ## Timeout for docker list, info, and stats commands +# timeout = "5s" + + ## Whether to report for each container per-device blkio (8:0, 8:1...) and + ## network (eth0, eth1, ...) stats or not +# perdevice = true + ## Whether to report for each container total blkio and network stats or not +# total = true + ## Which environment variables should we use as a tag + ##tag_env = ["JAVA_HOME", "HEAP_SIZE"] + + ## docker labels to include and exclude as tags. Globs accepted. + ## Note that an empty array for both will include all labels as tags +# docker_label_include = [] +# docker_label_exclude = [] + + ## Optional TLS Config + # tls_ca = "/etc/telegraf/ca.pem" + # tls_cert = "/etc/telegraf/cert.pem" + # tls_key = "/etc/telegraf/key.pem" + ## Use TLS but skip chain & host verification + # insecure_skip_verify = false +# fieldpass = ["n_containers", "n_containers_running", "n_containers_stopped", "n_containers_paused", "n_images"] + #fieldpass = ["numContainers", "numContainersRunning", "numContainersStopped", "numContainersPaused", "numContainerImages"] +# taginclude = ["nodeName"] +#[[inputs.prometheus]] + ## An array of urls to scrape metrics from. +# urls = ["https://$KUBERNETES_SERVICE_HOST:$KUBERNETES_SERVICE_PORT/metrics"] +# fieldpass = ["apiserver_request_count"] + +# metric_version = 2 +# url_tag = "scrapeUrl" + + ## An array of Kubernetes services to scrape metrics from. + # kubernetes_services = ["http://my-service-dns.my-namespace:9100/metrics"] + + ## Kubernetes config file to create client from. + # kube_config = "/path/to/kubernetes.config" + + ## Scrape Kubernetes pods for the following prometheus annotations: + ## - prometheus.io/scrape: Enable scraping for this pod + ## - prometheus.io/scheme: If the metrics endpoint is secured then you will need to + ## set this to `https` & most likely set the tls config. + ## - prometheus.io/path: If the metrics path is not /metrics, define it with this annotation. + ## - prometheus.io/port: If port is not 9102 use this annotation + # monitor_kubernetes_pods = true + + ## Use bearer token for authorization. ('bearer_token' takes priority) +# bearer_token = "/var/run/secrets/kubernetes.io/serviceaccount/token" + ## OR + # bearer_token_string = "abc_123" + + ## Specify timeout duration for slower prometheus clients (default is 3s) +# response_timeout = "15s" + + ## Optional TLS Config +# tls_ca = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + #tls_cert = /path/to/certfile + # tls_key = /path/to/keyfile + ## Use TLS but skip chain & host verification +# insecure_skip_verify = true + #tagexclude = ["AgentVersion","AKS_RESOURCE_ID","ACS_RESOURCE_NAME", "Region", "ClusterName", "ClusterType", "Computer", "ControllerType"] +# [inputs.prometheus.tagpass] + +[[inputs.exec]] + ## Commands array + interval = "15m" + commands = [ + "/opt/microsoft/docker-cimprov/bin/TelegrafTCPErrorTelemetry.sh" + ] + + ## Timeout for each command to complete. + timeout = "15s" + + ## measurement name suffix (for separating different commands) + name_suffix = "_telemetry" + + ## Data format to consume. + ## Each data format has its own unique set of configuration options, read + ## more about them here: + ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md + data_format = "influx" + #tagexclude = ["hostName"] + [inputs.exec.tags] + AgentVersion = "$AGENT_VERSION" + AKS_RESOURCE_ID = "$TELEMETRY_AKS_RESOURCE_ID" + ACS_RESOURCE_NAME = "$TELEMETRY_ACS_RESOURCE_NAME" + Region = "$TELEMETRY_AKS_REGION" + ClusterName = "$TELEMETRY_CLUSTER_NAME" + ClusterType = "$TELEMETRY_CLUSTER_TYPE" + Computer = "placeholder_hostname" + ControllerType = "$CONTROLLER_TYPE" + diff --git a/installer/conf/telegraf.conf b/installer/conf/telegraf.conf index 355c88b3d..e7c0d6509 100644 --- a/installer/conf/telegraf.conf +++ b/installer/conf/telegraf.conf @@ -17,14 +17,14 @@ # Global tags can be specified here in key="value" format. [global_tags] #Below are entirely used for telemetry - AgentVersion = "$AGENT_VERSION" - AKS_RESOURCE_ID = "$TELEMETRY_AKS_RESOURCE_ID" - ACS_RESOURCE_NAME = "$TELEMETRY_ACS_RESOURCE_NAME" - Region = "$TELEMETRY_AKS_REGION" - ClusterName = "$TELEMETRY_CLUSTER_NAME" - ClusterType = "$TELEMETRY_CLUSTER_TYPE" - Computer = "placeholder_hostname" - ControllerType = "$CONTROLLER_TYPE" + #AgentVersion = "$AGENT_VERSION" + #AKS_RESOURCE_ID = "$TELEMETRY_AKS_RESOURCE_ID" + #ACS_RESOURCE_NAME = "$TELEMETRY_ACS_RESOURCE_NAME" + #Region = "$TELEMETRY_AKS_REGION" + #ClusterName = "$TELEMETRY_CLUSTER_NAME" + #ClusterType = "$TELEMETRY_CLUSTER_TYPE" + #Computer = "placeholder_hostname" + #ControllerType = "$CONTROLLER_TYPE" hostName = "placeholder_hostname" @@ -122,7 +122,7 @@ ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md data_format = "json" namedrop = ["telegraf_telemetry"] - tagdrop = ["AgentVersion","AKS_RESOURCE_ID", "ACS_RESOURCE_NAME", "Region","ClusterName","ClusterType", "Computer", "ControllerType"] + #tagdrop = ["AgentVersion","AKS_RESOURCE_ID", "ACS_RESOURCE_NAME", "Region","ClusterName","ClusterType", "Computer", "ControllerType"] [[outputs.application_insights]] ## Instrumentation key of the Application Insights resource. @@ -392,6 +392,7 @@ # Read metrics about disk usage by mount point [[inputs.disk]] + name_prefix="container.azm.ms/" ## By default stats will be gathered for all mount points. ## Set mount_points will restrict the stats to only the specified mount points. # mount_points = ["/"] @@ -411,9 +412,40 @@ # fieldpass = ["used_percent", "cluster", "node","host","device"] # taginclude = ["cluster","node"] +# Read metrics about disk IO by device +[[inputs.diskio]] + name_prefix="container.azm.ms/" + ## By default, telegraf will gather stats for all devices including + ## disk partitions. + ## Setting devices will restrict the stats to the specified devices. + devices = ["sd[a-z][0-9]"] + ## Uncomment the following line if you need disk serial numbers. + # skip_serial_number = false + # + ## On systems which support it, device metadata can be added in the form of + ## tags. + ## Currently only Linux is supported via udev properties. You can view + ## available properties for a device by running: + ## 'udevadm info -q property -n /dev/sda' + ## Note: Most, but not all, udev properties can be accessed this way. Properties + ## that are currently inaccessible include DEVTYPE, DEVNAME, and DEVPATH. + # device_tags = ["ID_FS_TYPE", "ID_FS_USAGE"] + # + ## Using the same metadata source as device_tags, you can also customize the + ## name of the device via templates. + ## The 'name_templates' parameter is a list of templates to try and apply to + ## the device. The template may contain variables in the form of '$PROPERTY' or + ## '${PROPERTY}'. The first template which does not contain any variables not + ## present for the device is used as the device name tag. + ## The typical use case is for LVM volumes, to get the VG/LV name instead of + ## the near-meaningless DM-0 name. + # name_templates = ["$ID_FS_LABEL","$DM_VG_NAME/$DM_LV_NAME"] + fieldpass = ["reads", "read_bytes", "read_time", "writes", "write_bytes", "write_time", "io_time", "iops_in_progress"] + taginclude = ["name","hostName"] # Read metrics about network interface usage -#[[inputs.net]] +[[inputs.net]] + name_prefix="container.azm.ms/" ## By default, telegraf gathers stats from any up interface (excluding loopback) ## Setting interfaces will tell it to gather these explicit interfaces, ## regardless of status. @@ -423,11 +455,10 @@ ## On linux systems telegraf also collects protocol stats. ## Setting ignore_protocol_stats to true will skip reporting of protocol metrics. ## -# ignore_protocol_stats = true + ignore_protocol_stats = true ## - #fieldpass = ["bytes_recv", "bytes_sent", "err_in", "err_out"] - #fieldpass = ["err_in", "err_out"] - #taginclude = ["interface","nodeName"] + fieldpass = ["bytes_recv", "bytes_sent", "err_in", "err_out"] + taginclude = ["interface","hostName"] # Read metrics from the kubernetes kubelet api #[[inputs.kubernetes]] @@ -497,6 +528,45 @@ # fieldpass = ["n_containers", "n_containers_running", "n_containers_stopped", "n_containers_paused", "n_images"] #fieldpass = ["numContainers", "numContainersRunning", "numContainersStopped", "numContainersPaused", "numContainerImages"] # taginclude = ["nodeName"] +[[inputs.prometheus]] + name_prefix="container.azm.ms/" + ## An array of urls to scrape metrics from. + urls = ["http://$NODE_IP:10255/metrics"] + fieldpass = ["kubelet_docker_operations", "kubelet_docker_operations_errors"] + + metric_version = 2 + url_tag = "scrapeUrl" + + ## An array of Kubernetes services to scrape metrics from. + # kubernetes_services = ["http://my-service-dns.my-namespace:9100/metrics"] + + ## Kubernetes config file to create client from. + # kube_config = "/path/to/kubernetes.config" + + ## Scrape Kubernetes pods for the following prometheus annotations: + ## - prometheus.io/scrape: Enable scraping for this pod + ## - prometheus.io/scheme: If the metrics endpoint is secured then you will need to + ## set this to `https` & most likely set the tls config. + ## - prometheus.io/path: If the metrics path is not /metrics, define it with this annotation. + ## - prometheus.io/port: If port is not 9102 use this annotation + # monitor_kubernetes_pods = true + + ## Use bearer token for authorization. ('bearer_token' takes priority) + bearer_token = "/var/run/secrets/kubernetes.io/serviceaccount/token" + ## OR + # bearer_token_string = "abc_123" + + ## Specify timeout duration for slower prometheus clients (default is 3s) + response_timeout = "15s" + + ## Optional TLS Config + tls_ca = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + #tls_cert = /path/to/certfile + # tls_key = /path/to/keyfile + ## Use TLS but skip chain & host verification + insecure_skip_verify = true + #tagexclude = ["AgentVersion","AKS_RESOURCE_ID","ACS_RESOURCE_NAME", "Region", "ClusterName", "ClusterType", "Computer", "ControllerType"] + [[inputs.exec]] ## Commands array interval = "15m" @@ -516,4 +586,13 @@ ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md data_format = "influx" tagexclude = ["hostName"] + [inputs.exec.tags] + AgentVersion = "$AGENT_VERSION" + AKS_RESOURCE_ID = "$TELEMETRY_AKS_RESOURCE_ID" + ACS_RESOURCE_NAME = "$TELEMETRY_ACS_RESOURCE_NAME" + Region = "$TELEMETRY_AKS_REGION" + ClusterName = "$TELEMETRY_CLUSTER_NAME" + ClusterType = "$TELEMETRY_CLUSTER_TYPE" + Computer = "placeholder_hostname" + ControllerType = "$CONTROLLER_TYPE" diff --git a/installer/datafiles/base_container.data b/installer/datafiles/base_container.data index 996c7501a..234785b64 100644 --- a/installer/datafiles/base_container.data +++ b/installer/datafiles/base_container.data @@ -97,8 +97,10 @@ MAINTAINER: 'Microsoft Corporation' /opt/td-agent-bit/bin/out_oms.so; intermediate/${{BUILD_CONFIGURATION}}/out_oms.so; 755; root; root /etc/opt/microsoft/docker-cimprov/td-agent-bit.conf; installer/conf/td-agent-bit.conf; 644; root; root +/etc/opt/microsoft/docker-cimprov/td-agent-bit-rs.conf; installer/conf/td-agent-bit-rs.conf; 644; root; root /etc/opt/microsoft/docker-cimprov/out_oms.conf; installer/conf/out_oms.conf; 644; root; root /etc/opt/microsoft/docker-cimprov/telegraf.conf; installer/conf/telegraf.conf; 644; root; root +/etc/opt/microsoft/docker-cimprov/telegraf-rs.conf; installer/conf/telegraf-rs.conf; 644; root; root /opt/microsoft/docker-cimprov/bin/TelegrafTCPErrorTelemetry.sh; installer/scripts/TelegrafTCPErrorTelemetry.sh; 755; root; root %Links diff --git a/source/code/go/src/plugins/oms.go b/source/code/go/src/plugins/oms.go index 269d16111..166f427be 100644 --- a/source/code/go/src/plugins/oms.go +++ b/source/code/go/src/plugins/oms.go @@ -47,7 +47,7 @@ const TelegrafTagClusterID = "clusterId" // ContainerLogPluginConfFilePath --> config file path for container log plugin const DaemonSetContainerLogPluginConfFilePath = "/etc/opt/microsoft/docker-cimprov/out_oms.conf" -const ReplicaSetContainerLogPluginConfFilePath = "/etc/opt/microsoft/docker-cimprov/out_oms-rs.conf" +const ReplicaSetContainerLogPluginConfFilePath = "/etc/opt/microsoft/docker-cimprov/out_oms.conf" // IPName for Container Log const IPName = "Containers" @@ -680,6 +680,10 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { PluginConfiguration = pluginConfig CreateHTTPClient() - go updateKubeSystemContainerIDs() - go updateContainerImageNameMaps() + if strings.Compare(strings.ToLower(os.Getenv("CONTROLLER_TYPE")), "daemonset") == 0 { + go updateKubeSystemContainerIDs() + go updateContainerImageNameMaps() + } else { + Log("Running in replicaset. Disabling kube-system container cache collection & updates \n") + } } diff --git a/source/code/go/src/plugins/telemetry.go b/source/code/go/src/plugins/telemetry.go index f507e4ab9..1e3d73fcd 100644 --- a/source/code/go/src/plugins/telemetry.go +++ b/source/code/go/src/plugins/telemetry.go @@ -66,9 +66,7 @@ func SendContainerLogPluginMetrics(telemetryPushIntervalProperty string) { SendEvent(eventNameContainerLogInit, make(map[string]string)) for ; true; <-ContainerLogTelemetryTicker.C { - SendEvent(eventNameDaemonSetHeartbeat, make(map[string]string)) elapsed := time.Since(start) - ContainerLogTelemetryMutex.Lock() flushRate := FlushedRecordsCount / FlushedRecordsTimeTaken * 1000 logRate := FlushedRecordsCount / float64(elapsed/time.Second) @@ -84,13 +82,17 @@ func SendContainerLogPluginMetrics(telemetryPushIntervalProperty string) { AgentLogProcessingMaxLatencyMsContainer = "" ContainerLogTelemetryMutex.Unlock() - flushRateMetric := appinsights.NewMetricTelemetry(metricNameAvgFlushRate, flushRate) - TelemetryClient.Track(flushRateMetric) - logRateMetric := appinsights.NewMetricTelemetry(metricNameAvgLogGenerationRate, logRate) - TelemetryClient.Track(logRateMetric) - logLatencyMetric := appinsights.NewMetricTelemetry(metricNameAgentLogProcessingMaxLatencyMs, logLatencyMs) - logLatencyMetric.Properties["Container"] = logLatencyMsContainer - TelemetryClient.Track(logLatencyMetric) + if strings.Compare(strings.ToLower(os.Getenv("CONTROLLER_TYPE")), "daemonset") == 0 { + SendEvent(eventNameDaemonSetHeartbeat, make(map[string]string)) + flushRateMetric := appinsights.NewMetricTelemetry(metricNameAvgFlushRate, flushRate) + TelemetryClient.Track(flushRateMetric) + logRateMetric := appinsights.NewMetricTelemetry(metricNameAvgLogGenerationRate, logRate) + TelemetryClient.Track(logRateMetric) + logLatencyMetric := appinsights.NewMetricTelemetry(metricNameAgentLogProcessingMaxLatencyMs, logLatencyMs) + logLatencyMetric.Properties["Container"] = logLatencyMsContainer + TelemetryClient.Track(logLatencyMetric) + } + TelemetryClient.Track(appinsights.NewMetricTelemetry(metricNameNumberofTelegrafMetricsSentSuccessfully, telegrafMetricsSentCount)) TelemetryClient.Track(appinsights.NewMetricTelemetry(metricNameNumberofSendErrorsTelegrafMetrics, telegrafMetricsSendErrorCount)) start = time.Now()