diff --git a/installer/conf/kube.conf b/installer/conf/kube.conf index 454df6e91..0dfa3710e 100644 --- a/installer/conf/kube.conf +++ b/installer/conf/kube.conf @@ -47,12 +47,44 @@ log_level debug +#cadvisor perf- Windows nodes + + type wincadvisorperf + tag oms.api.wincadvisorperf + run_interval 60s + log_level debug + + type filter_inventory2mdm custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westEurope log_level info +#custom_metrics_mdm filter plugin for perf data from windows nodes + + type filter_cadvisor2mdm + custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westEurope + metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes + log_level info + + + + type out_mdm + log_level debug + num_threads 5 + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/out_mdm_cdvisorperf*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + max_retry_wait 9m + retry_mdm_post_wait_minutes 60 + + type out_oms log_level debug @@ -168,3 +200,18 @@ max_retry_wait 9m retry_mdm_post_wait_minutes 60 + + + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_api_wincadvisorperf*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + max_retry_wait 9m + \ No newline at end of file diff --git a/installer/conf/td-agent-bit.conf b/installer/conf/td-agent-bit.conf index 78a7b2dde..88bacaca2 100644 --- a/installer/conf/td-agent-bit.conf +++ b/installer/conf/td-agent-bit.conf @@ -23,10 +23,33 @@ Mem_Buf_Limit 2m Path_Key filepath Skip_Long_Lines On + Ignore_Older 5m + +[INPUT] + Name tail + Tag oms.container.log.telegraf.err.* + Path /var/opt/microsoft/docker-cimprov/log/telegraf.log + DB 
/var/opt/microsoft/docker-cimprov/state/telegraf-log-state.db + Mem_Buf_Limit 2m + Path_Key filepath + Skip_Long_Lines On + Ignore_Older 5m + +[INPUT] + Name tcp + Tag oms.container.perf.telegraf.* + Listen 0.0.0.0 + Port 25226 + Chunk_Size 32 + Buffer_Size 64 + +[FILTER] + Name grep + Match oms.container.log.telegraf.err.* + #Regex log /^(?:(?!\[azure_monitor\]: failed to write batch: \[403\] 403 Forbidden).)*$/ [OUTPUT] Name oms EnableTelemetry true TelemetryPushIntervalSeconds 300 - Match oms.container.log.* - AgentVersion ciprod03122019 \ No newline at end of file + Match oms.container.* diff --git a/installer/conf/telegraf.conf b/installer/conf/telegraf.conf new file mode 100644 index 000000000..355c88b3d --- /dev/null +++ b/installer/conf/telegraf.conf @@ -0,0 +1,519 @@ +# Telegraf Configuration +# +# Telegraf is entirely plugin driven. All metrics are gathered from the +# declared inputs, and sent to the declared outputs. +# +# Plugins must be declared in here to be active. +# To deactivate a plugin, comment out the name and any variables. +# +# Use 'telegraf -config telegraf.conf -test' to see what metrics a config +# file would generate. +# +# Environment variables can be used anywhere in this config file, simply prepend +# them with $. For strings the variable must be within quotes (ie, "$STR_VAR"), +# for numbers and booleans they should be plain (ie, $INT_VAR, $BOOL_VAR) + + +# Global tags can be specified here in key="value" format. 
+[global_tags] + #Below are entirely used for telemetry + AgentVersion = "$AGENT_VERSION" + AKS_RESOURCE_ID = "$TELEMETRY_AKS_RESOURCE_ID" + ACS_RESOURCE_NAME = "$TELEMETRY_ACS_RESOURCE_NAME" + Region = "$TELEMETRY_AKS_REGION" + ClusterName = "$TELEMETRY_CLUSTER_NAME" + ClusterType = "$TELEMETRY_CLUSTER_TYPE" + Computer = "placeholder_hostname" + ControllerType = "$CONTROLLER_TYPE" + + hostName = "placeholder_hostname" + + +# Configuration for telegraf agent +[agent] + ## Default data collection interval for all inputs + interval = "60s" + ## Rounds collection interval to 'interval' + ## ie, if interval="10s" then always collect on :00, :10, :20, etc. + round_interval = true + + ## Telegraf will send metrics to outputs in batches of at most + ## metric_batch_size metrics. + ## This controls the size of writes that Telegraf sends to output plugins. + metric_batch_size = 1000 + + ## For failed writes, telegraf will cache metric_buffer_limit metrics for each + ## output, and will flush this buffer on a successful write. Oldest metrics + ## are dropped first when this buffer fills. + ## This buffer only fills when writes fail to output plugin(s). + metric_buffer_limit = 10000 + + ## Collection jitter is used to jitter the collection by a random amount. + ## Each plugin will sleep for a random time within jitter before collecting. + ## This can be used to avoid many plugins querying things like sysfs at the + ## same time, which can have a measurable effect on the system. + collection_jitter = "0s" + + ## Default flushing interval for all outputs. You shouldn't set this below + ## interval. Maximum flush_interval will be flush_interval + flush_jitter + flush_interval = "60s" + ## Jitter the flush interval by a random amount. This is primarily to avoid + ## large write spikes for users running a large number of telegraf instances. 
+ ## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s + flush_jitter = "0s" + + ## By default or when set to "0s", precision will be set to the same + ## timestamp order as the collection interval, with the maximum being 1s. + ## ie, when interval = "10s", precision will be "1s" + ## when interval = "250ms", precision will be "1ms" + ## Precision will NOT be used for service inputs. It is up to each individual + ## service input to set the timestamp at the appropriate precision. + ## Valid time units are "ns", "us" (or "µs"), "ms", "s". + precision = "" + + ## Logging configuration: + ## Run telegraf with debug log messages. + debug = false + ## Run telegraf in quiet mode (error log messages only). + quiet = true + ## Specify the log file name. The empty string means to log to stderr. + logfile = "/var/opt/microsoft/docker-cimprov/log/telegraf.log" + + ## Override default hostname, if empty use os.Hostname() + #hostname = "placeholder_hostname" + ## If set to true, do not set the "host" tag in the telegraf agent. + omit_hostname = true + + +############################################################################### +# OUTPUT PLUGINS # +############################################################################### + +# Generic socket writer capable of handling multiple socket types. 
+[[outputs.socket_writer]] + ## URL to connect to + address = "tcp://0.0.0.0:25226" + # address = "tcp://example.com:http" + # address = "tcp4://127.0.0.1:8094" + # address = "tcp6://127.0.0.1:8094" + # address = "tcp6://[2001:db8::1]:8094" + # address = "udp://127.0.0.1:8094" + # address = "udp4://127.0.0.1:8094" + # address = "udp6://127.0.0.1:8094" + # address = "unix:///tmp/telegraf.sock" + # address = "unixgram:///tmp/telegraf.sock" + + ## Optional TLS Config + # tls_ca = "/etc/telegraf/ca.pem" + # tls_cert = "/etc/telegraf/cert.pem" + # tls_key = "/etc/telegraf/key.pem" + ## Use TLS but skip chain & host verification + # insecure_skip_verify = false + + ## Period between keep alive probes. + ## Only applies to TCP sockets. + ## 0 disables keep alive probes. + ## Defaults to the OS configuration. + # keep_alive_period = "5m" + + ## Data format to generate. + ## Each data format has its own unique set of configuration options, read + ## more about them here: + ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md + data_format = "json" + namedrop = ["telegraf_telemetry"] + tagdrop = ["AgentVersion","AKS_RESOURCE_ID", "ACS_RESOURCE_NAME", "Region","ClusterName","ClusterType", "Computer", "ControllerType"] + +[[outputs.application_insights]] + ## Instrumentation key of the Application Insights resource. + instrumentation_key = "$TELEMETRY_APPLICATIONINSIGHTS_KEY" + + ## Timeout for closing (default: 5s). + # timeout = "5s" + + ## Enable additional diagnostic logging. + # enable_diagnostic_logging = false + + ## Context Tag Sources add Application Insights context tags to a tag value. 
+ ## + ## For list of allowed context tag keys see: + ## https://github.com/Microsoft/ApplicationInsights-Go/blob/master/appinsights/contracts/contexttagkeys.go + # [outputs.application_insights.context_tag_sources] + # "ai.cloud.role" = "kubernetes_container_name" + # "ai.cloud.roleInstance" = "kubernetes_pod_name" + namepass = ["telegraf_telemetry"] + #tagdrop = ["nodeName"] + +############################################################################### +# PROCESSOR PLUGINS # +############################################################################### + +# # Perform string processing on tags, fields, and measurements +#[[processors.rename]] + #[[processors.rename.replace]] + # measurement = "disk" + # dest = "nodes" +# [[processors.rename.replace]] +# field = "free" +# dest = "freeBytes" +# [[processors.rename.replace]] +# field = "used" +# dest = "usedBytes" +# [[processors.rename.replace]] +# field = "used_percent" +# dest = "usedPercentage" + #[[processors.rename.replace]] + # measurement = "net" + # dest = "nodes" + #[[processors.rename.replace]] + # field = "bytes_recv" + # dest = "networkBytesReceivedTotal" + #[[processors.rename.replace]] + # field = "bytes_sent" + # dest = "networkBytesSentTotal" + #[[processors.rename.replace]] + # field = "err_in" + # dest = "networkErrorsInTotal" + #[[processors.rename.replace]] + # field = "err_out" + # dest = "networkErrorsOutTotal" + #[[processors.rename.replace]] + # measurement = "kubernetes_pod_volume" + # dest = "pods" + #[[processors.rename.replace]] + # field = "used_bytes" + # dest = "podVolumeUsedBytes" + #[[processors.rename.replace]] + # field = "available_bytes" + # dest = "podVolumeAvailableBytes" + #[[processors.rename.replace]] + # measurement = "kubernetes_pod_network" + # dest = "pods" + #[[processors.rename.replace]] + # field = "tx_errors" + # dest = "podNetworkTxErrorsTotal" + #[[processors.rename.replace]] + # field = "rx_errors" + # dest = "podNetworkRxErrorsTotal" + 
#[[processors.rename.replace]] + # tag = "volume_name" + # dest = "volumeName" + #[[processors.rename.replace]] + # tag = "pod_name" + # dest = "podName" + #[[processors.rename.replace]] + # measurement = "docker" + # dest = "containers" + #[[processors.rename.replace]] + # measurement = "docker_container_status" + # dest = "containers" + #[[processors.rename.replace]] + # field = "n_containers" + # dest = "numContainers" + #[[processors.rename.replace]] + # field = "n_containers_running" + # dest = "numContainersRunning" + #[[processors.rename.replace]] + # field = "n_containers_stopped" + # dest = "numContainersStopped" + #[[processors.rename.replace]] + # field = "n_containers_paused" + # dest = "numContainersPaused" + #[[processors.rename.replace]] + # field = "n_images" + # dest = "numContainerImages" + +# ## Convert a tag value to uppercase +# # [[processors.strings.uppercase]] +# # tag = "method" +# +# ## Convert a field value to lowercase and store in a new field +# # [[processors.strings.lowercase]] +# # field = "uri_stem" +# # dest = "uri_stem_normalised" +# +# ## Trim leading and trailing whitespace using the default cutset +# # [[processors.strings.trim]] +# # field = "message" +# +# ## Trim leading characters in cutset +# # [[processors.strings.trim_left]] +# # field = "message" +# # cutset = "\t" +# +# ## Trim trailing characters in cutset +# # [[processors.strings.trim_right]] +# # field = "message" +# # cutset = "\r\n" +# +# ## Trim the given prefix from the field +# # [[processors.strings.trim_prefix]] +# # field = "my_value" +# # prefix = "my_" +# +# ## Trim the given suffix from the field +# # [[processors.strings.trim_suffix]] +# # field = "read_count" +# # suffix = "_count" + + +# # Print all metrics that pass through this filter. +# [[processors.topk]] +# ## How many seconds between aggregations +# # period = 10 +# +# ## How many top metrics to return +# # k = 10 +# +# ## Over which tags should the aggregation be done. 
Globs can be specified, in +# ## which case any tag matching the glob will aggregated over. If set to an +# ## empty list is no aggregation over tags is done +# # group_by = ['*'] +# +# ## Over which fields are the top k are calculated +# # fields = ["value"] +# +# ## What aggregation to use. Options: sum, mean, min, max +# # aggregation = "mean" +# +# ## Instead of the top k largest metrics, return the bottom k lowest metrics +# # bottomk = false +# +# ## The plugin assigns each metric a GroupBy tag generated from its name and +# ## tags. If this setting is different than "" the plugin will add a +# ## tag (which name will be the value of this setting) to each metric with +# ## the value of the calculated GroupBy tag. Useful for debugging +# # add_groupby_tag = "" +# +# ## These settings provide a way to know the position of each metric in +# ## the top k. The 'add_rank_field' setting allows to specify for which +# ## fields the position is required. If the list is non empty, then a field +# ## will be added to each and every metric for each string present in this +# ## setting. This field will contain the ranking of the group that +# ## the metric belonged to when aggregated over that field. +# ## The name of the field will be set to the name of the aggregation field, +# ## suffixed with the string '_topk_rank' +# # add_rank_fields = [] +# +# ## These settings provide a way to know what values the plugin is generating +# ## when aggregating metrics. The 'add_agregate_field' setting allows to +# ## specify for which fields the final aggregation value is required. If the +# ## list is non empty, then a field will be added to each every metric for +# ## each field present in this setting. This field will contain +# ## the computed aggregation for the group that the metric belonged to when +# ## aggregated over that field. 
+# ## The name of the field will be set to the name of the aggregation field, +# ## suffixed with the string '_topk_aggregate' +# # add_aggregate_fields = [] + + + +############################################################################### +# AGGREGATOR PLUGINS # +############################################################################### + +# # Keep the aggregate basicstats of each metric passing through. +# [[aggregators.basicstats]] +# ## General Aggregator Arguments: +# ## The period on which to flush & clear the aggregator. +# period = "30s" +# ## If true, the original metric will be dropped by the +# ## aggregator and will not get sent to the output plugins. +# drop_original = false + + +# # Create aggregate histograms. +# [[aggregators.histogram]] +# ## The period in which to flush the aggregator. +# period = "30s" +# +# ## If true, the original metric will be dropped by the +# ## aggregator and will not get sent to the output plugins. +# drop_original = false +# +# ## Example config that aggregates all fields of the metric. +# # [[aggregators.histogram.config]] +# # ## The set of buckets. +# # buckets = [0.0, 15.6, 34.5, 49.1, 71.5, 80.5, 94.5, 100.0] +# # ## The name of metric. +# # measurement_name = "cpu" +# +# ## Example config that aggregates only specific fields of the metric. +# # [[aggregators.histogram.config]] +# # ## The set of buckets. +# # buckets = [0.0, 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0] +# # ## The name of metric. +# # measurement_name = "diskio" +# # ## The concrete fields of metric +# # fields = ["io_time", "read_time", "write_time"] + + +# # Keep the aggregate min/max of each metric passing through. +# [[aggregators.minmax]] +# ## General Aggregator Arguments: +# ## The period on which to flush & clear the aggregator. +# period = "30s" +# ## If true, the original metric will be dropped by the +# ## aggregator and will not get sent to the output plugins. 
+# drop_original = false + + +# # Count the occurance of values in fields. +# [[aggregators.valuecounter]] +# ## General Aggregator Arguments: +# ## The period on which to flush & clear the aggregator. +# period = "30s" +# ## If true, the original metric will be dropped by the +# ## aggregator and will not get sent to the output plugins. +# drop_original = false +# ## The fields for which the values will be counted +# fields = [] + + + +############################################################################### +# INPUT PLUGINS # +############################################################################### + +# Read metrics about cpu usage +#[[inputs.cpu]] + ## Whether to report per-cpu stats or not +# percpu = false + ## Whether to report total system cpu stats or not +# totalcpu = true + ## If true, collect raw CPU time metrics. +# collect_cpu_time = false + ## If true, compute and report the sum of all non-idle CPU states. +# report_active = true +# fieldpass = ["usage_active","cluster","node","host","device"] +# taginclude = ["cluster","cpu","node"] + + + +# Read metrics about disk usage by mount point +[[inputs.disk]] + ## By default stats will be gathered for all mount points. + ## Set mount_points will restrict the stats to only the specified mount points. + # mount_points = ["/"] + + ## Ignore mount points by filesystem type. + ignore_fs = ["tmpfs", "devtmpfs", "devfs", "overlay", "aufs", "squashfs"] + fieldpass = ["free", "used", "used_percent"] + taginclude = ["device","path","hostName"] + # Below due to Bug - https://github.com/influxdata/telegraf/issues/5615 + # ORDER matters here!! 
- i.e the below should be the LAST modifier + [inputs.disk.tagdrop] + path = ["/var/lib/kubelet*", "/dev/termination-log", "/var/log", "/etc/hosts", "/etc/resolv.conf", "/etc/hostname", "/etc/kubernetes/host", "/var/lib/docker/containers"] + + +# Read metrics about memory usage +#[[inputs.mem]] +# fieldpass = ["used_percent", "cluster", "node","host","device"] +# taginclude = ["cluster","node"] + + +# Read metrics about network interface usage +#[[inputs.net]] + ## By default, telegraf gathers stats from any up interface (excluding loopback) + ## Setting interfaces will tell it to gather these explicit interfaces, + ## regardless of status. + ## + # interfaces = ["eth0"] + ## + ## On linux systems telegraf also collects protocol stats. + ## Setting ignore_protocol_stats to true will skip reporting of protocol metrics. + ## +# ignore_protocol_stats = true + ## + #fieldpass = ["bytes_recv", "bytes_sent", "err_in", "err_out"] + #fieldpass = ["err_in", "err_out"] + #taginclude = ["interface","nodeName"] + +# Read metrics from the kubernetes kubelet api +#[[inputs.kubernetes]] + ## URL for the kubelet + #url = "http://1.1.1.1:10255" +# url = "http://placeholder_nodeip:10255" + + ## Use bearer token for authorization + # bearer_token = /path/to/bearer/token + + ## Set response_timeout (default 5 seconds) + # response_timeout = "5s" + + ## Optional TLS Config + # tls_ca = /path/to/cafile + # tls_cert = /path/to/certfile + # tls_key = /path/to/keyfile + ## Use TLS but skip chain & host verification + # insecure_skip_verify = false +# fieldpass = ["used_bytes", "available_bytes", "tx_errors", "rx_errors" ] +# taginclude = ["volume_name","nodeName","namespace","pod_name"] +# Read metrics about docker containers +#[[inputs.docker]] + ## Docker Endpoint + ## To use TCP, set endpoint = "tcp://[ip]:[port]" + ## To use environment variables (ie, docker-machine), set endpoint = "ENV" +# endpoint = "unix:///var/run/host/docker.sock" + + ## Set to true to collect Swarm 
metrics(desired_replicas, running_replicas) +# gather_services = false + + ## Only collect metrics for these containers, collect all if empty +# container_names = [] + + ## Containers to include and exclude. Globs accepted. + ## Note that an empty array for both will include all containers +# container_name_include = [] +# container_name_exclude = [] + + ## Container states to include and exclude. Globs accepted. + ## When empty only containers in the "running" state will be captured. +# container_state_include = ['*'] + # container_state_exclude = [] + + ## Timeout for docker list, info, and stats commands +# timeout = "5s" + + ## Whether to report for each container per-device blkio (8:0, 8:1...) and + ## network (eth0, eth1, ...) stats or not +# perdevice = true + ## Whether to report for each container total blkio and network stats or not +# total = true + ## Which environment variables should we use as a tag + ##tag_env = ["JAVA_HOME", "HEAP_SIZE"] + + ## docker labels to include and exclude as tags. Globs accepted. + ## Note that an empty array for both will include all labels as tags +# docker_label_include = [] +# docker_label_exclude = [] + + ## Optional TLS Config + # tls_ca = "/etc/telegraf/ca.pem" + # tls_cert = "/etc/telegraf/cert.pem" + # tls_key = "/etc/telegraf/key.pem" + ## Use TLS but skip chain & host verification + # insecure_skip_verify = false +# fieldpass = ["n_containers", "n_containers_running", "n_containers_stopped", "n_containers_paused", "n_images"] + #fieldpass = ["numContainers", "numContainersRunning", "numContainersStopped", "numContainersPaused", "numContainerImages"] +# taginclude = ["nodeName"] +[[inputs.exec]] + ## Commands array + interval = "15m" + commands = [ + "/opt/microsoft/docker-cimprov/bin/TelegrafTCPErrorTelemetry.sh" + ] + + ## Timeout for each command to complete. + timeout = "15s" + + ## measurement name suffix (for separating different commands) + name_suffix = "_telemetry" + + ## Data format to consume. 
+ ## Each data format has its own unique set of configuration options, read + ## more about them here: + ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md + data_format = "influx" + tagexclude = ["hostName"] + diff --git a/installer/datafiles/base_container.data b/installer/datafiles/base_container.data index c263aa505..996c7501a 100644 --- a/installer/datafiles/base_container.data +++ b/installer/datafiles/base_container.data @@ -34,6 +34,7 @@ MAINTAINER: 'Microsoft Corporation' /opt/microsoft/omsagent/plugin/CAdvisorMetricsAPIClient.rb; source/code/plugin/CAdvisorMetricsAPIClient.rb; 644; root; root /opt/microsoft/omsagent/plugin/in_kube_perf.rb; source/code/plugin/in_kube_perf.rb; 644; root; root /opt/microsoft/omsagent/plugin/in_cadvisor_perf.rb; source/code/plugin/in_cadvisor_perf.rb; 644; root; root +/opt/microsoft/omsagent/plugin/in_win_cadvisor_perf.rb; source/code/plugin/in_win_cadvisor_perf.rb; 644; root; root /opt/microsoft/omsagent/plugin/in_kube_services.rb; source/code/plugin/in_kube_services.rb; 644; root; root /opt/microsoft/omsagent/plugin/in_kube_nodes.rb; source/code/plugin/in_kube_nodes.rb; 644; root; root /opt/microsoft/omsagent/plugin/filter_inventory2mdm.rb; source/code/plugin/filter_inventory2mdm.rb; 644; root; root @@ -97,6 +98,8 @@ MAINTAINER: 'Microsoft Corporation' /opt/td-agent-bit/bin/out_oms.so; intermediate/${{BUILD_CONFIGURATION}}/out_oms.so; 755; root; root /etc/opt/microsoft/docker-cimprov/td-agent-bit.conf; installer/conf/td-agent-bit.conf; 644; root; root /etc/opt/microsoft/docker-cimprov/out_oms.conf; installer/conf/out_oms.conf; 644; root; root +/etc/opt/microsoft/docker-cimprov/telegraf.conf; installer/conf/telegraf.conf; 644; root; root +/opt/microsoft/docker-cimprov/bin/TelegrafTCPErrorTelemetry.sh; installer/scripts/TelegrafTCPErrorTelemetry.sh; 755; root; root %Links /opt/omi/lib/libcontainer.${{SHLIB_EXT}}; /opt/microsoft/docker-cimprov/lib/libcontainer.${{SHLIB_EXT}}; 644; root; root @@ 
-136,6 +139,7 @@ MAINTAINER: 'Microsoft Corporation' /opt/td-agent-bit; 755; root; root;sysdir /opt/td-agent-bit/bin; 755; root; root;sysdir +/etc/telegraf; 755; root; root;sysdir /opt/microsoft/omsagent/plugin/lib; 755; root; root; sysdir /opt/microsoft/omsagent/plugin/lib/application_insights; 755; root; root; sysdir diff --git a/installer/scripts/TelegrafTCPErrorTelemetry.sh b/installer/scripts/TelegrafTCPErrorTelemetry.sh new file mode 100644 index 000000000..2bd58b202 --- /dev/null +++ b/installer/scripts/TelegrafTCPErrorTelemetry.sh @@ -0,0 +1,3 @@ +#!/bin/sh +countErr=$(grep -iF "socket_writer" /var/opt/microsoft/docker-cimprov/log/telegraf.log | wc -l | tr -d '\n') +echo "telegraf,Source=telegrafErrLog telegrafTCPWriteErrorCountTotal=${countErr}i" \ No newline at end of file diff --git a/source/code/go/src/plugins/oms.go b/source/code/go/src/plugins/oms.go index 36cf20273..269d16111 100644 --- a/source/code/go/src/plugins/oms.go +++ b/source/code/go/src/plugins/oms.go @@ -23,10 +23,31 @@ import ( ) // DataType for Container Log -const DataType = "CONTAINER_LOG_BLOB" +const ContainerLogDataType = "CONTAINER_LOG_BLOB" + +// DataType for Insights metric +const InsightsMetricsDataType = "INSIGHTS_METRICS_BLOB" + +//env variable which has ResourceId for LA +const ResourceIdEnv = "AKS_RESOURCE_ID" + +//env variable which has ResourceName for NON-AKS +const ResourceNameEnv = "ACS_RESOURCE_NAME" + +// Origin prefix for telegraf Metrics (used as prefix for origin field & prefix for azure monitor specific tags) +const TelegrafMetricOriginPrefix = "container.azm.ms" +// Origin suffix for telegraf Metrics (used as suffix for origin field) +const TelegrafMetricOriginSuffix = "telegraf" +// Namespace prefix for telegraf Metrics (used as prefix for Namespace field) +//const TelegrafMetricNamespacePrefix = "plugin" +// clusterName tag +const TelegrafTagClusterName = "clusterName" +// clusterId tag +const TelegrafTagClusterID = "clusterId" // ContainerLogPluginConfFilePath 
--> config file path for container log plugin -const ContainerLogPluginConfFilePath = "/etc/opt/microsoft/docker-cimprov/out_oms.conf" +const DaemonSetContainerLogPluginConfFilePath = "/etc/opt/microsoft/docker-cimprov/out_oms.conf" +const ReplicaSetContainerLogPluginConfFilePath = "/etc/opt/microsoft/docker-cimprov/out_oms-rs.conf" // IPName for Container Log const IPName = "Containers" @@ -44,6 +65,12 @@ var ( Computer string // WorkspaceID log analytics workspace id WorkspaceID string + // ResourceID for resource-centric log analytics data + ResourceID string + // Resource-centric flag (will be true if we determine that the above ResourceID is non-empty - default is false) + ResourceCentric bool + //ResourceName + ResourceName string ) var ( @@ -88,6 +115,26 @@ type DataItem struct { Computer string `json:"Computer"` } +// laTelegrafMetric represents a single metric record, translated from a telegraf time series, that is sent to Log Analytics +type laTelegrafMetric struct { + // 'golden' fields + Origin string `json:"Origin"` + Namespace string `json:"Namespace"` + Name string `json:"Name"` + Value float64 `json:"Value"` + Tags string `json:"Tags"` + // specific required fields for LA + CollectionTime string `json:"CollectionTime"` //mapped to TimeGenerated + Computer string `json:"Computer"` +} + +// InsightsMetricsBlob represents the object corresponding to the payload that is sent to the ODS end point +type InsightsMetricsBlob struct { + DataType string `json:"DataType"` + IPName string `json:"IPName"` + DataItems []laTelegrafMetric `json:"DataItems"` +} + // ContainerLogBlob represents the object corresponding to the payload that is sent to the ODS end point type ContainerLogBlob struct { DataType string `json:"DataType"` @@ -203,6 +250,174 @@ func updateKubeSystemContainerIDs() { } } +//Azure loganalytics metric values have to be numeric, so string values are dropped +func convert(in interface{}) (float64, bool) { + switch v := in.(type) { + case int64: + return float64(v), 
true + case uint64: + return float64(v), true + case float64: + return v, true + case bool: + if v { + return float64(1), true + } + return float64(0), true + default: + Log ("returning 0 for %v ", in) + return float64(0), false + } +} + +//Translates telegraf time series to one or more Azure loganalytics metric(s) +func translateTelegrafMetrics(m map[interface{}]interface{}) ([]*laTelegrafMetric, error) { + + var laMetrics []*laTelegrafMetric + var tags map[interface{}]interface{} + tags = m["tags"].(map[interface{}]interface{}) + tagMap := make(map[string]string) + for k, v := range tags { + key := fmt.Sprintf("%s",k) + if key == "" { + continue + } + tagMap[key] = fmt.Sprintf("%s",v) + } + + //add azure monitor tags + tagMap[fmt.Sprintf("%s/%s", TelegrafMetricOriginPrefix, TelegrafTagClusterID)] = ResourceID + tagMap[fmt.Sprintf("%s/%s", TelegrafMetricOriginPrefix, TelegrafTagClusterName)] = ResourceName + + var fieldMap map[interface{}]interface{} + fieldMap = m["fields"].(map[interface{}]interface{}) + + tagJson, err := json.Marshal(&tagMap) + + if err != nil { + return nil, err + } + + for k, v := range fieldMap { + fv, ok := convert(v) + if !ok { + continue + } + i := m["timestamp"].(uint64) + laMetric := laTelegrafMetric{ + Origin: fmt.Sprintf("%s/%s", TelegrafMetricOriginPrefix, TelegrafMetricOriginSuffix), + //Namespace: fmt.Sprintf("%s/%s", TelegrafMetricNamespacePrefix, m["name"]), + Namespace: fmt.Sprintf("%s", m["name"]), + Name: fmt.Sprintf("%s",k), + Value: fv, + Tags: fmt.Sprintf("%s", tagJson), + CollectionTime: time.Unix(int64(i),0).Format(time.RFC3339), + Computer: Computer, //this is the collection agent's computer name, not necessarily to which computer the metric applies to + } + + //Log ("la metric:%v", laMetric) + laMetrics = append(laMetrics, &laMetric) + } + return laMetrics, nil +} + +//send metrics from Telegraf to LA. 
1) Translate telegraf timeseries to LA metric(s) 2) Send it to LA as 'InsightsMetrics' fixed type +func PostTelegrafMetricsToLA(telegrafRecords []map[interface{}]interface{}) int { + var laMetrics []*laTelegrafMetric + + if ( (telegrafRecords== nil) || ! (len(telegrafRecords) > 0) ) { + Log("PostTelegrafMetricsToLA::Error:no timeseries to derive") + return output.FLB_OK + } + + for _, record := range telegrafRecords { + translatedMetrics, err := translateTelegrafMetrics(record) + if err != nil { + message := fmt.Sprintf("PostTelegrafMetricsToLA::Error:when translating telegraf metric to log analytics metric %q", err) + Log(message) + //SendException(message) //This will be too noisy + } + laMetrics = append(laMetrics, translatedMetrics...) + } + + if ( (laMetrics == nil) || !(len(laMetrics) > 0) ) { + Log("PostTelegrafMetricsToLA::Info:no metrics derived from timeseries data") + return output.FLB_OK + } else { + message := fmt.Sprintf("PostTelegrafMetricsToLA::Info:derived %v metrics from %v timeseries", len(laMetrics), len(telegrafRecords)) + Log(message) + } + + var metrics []laTelegrafMetric + var i int + + for i=0; i < len(laMetrics); i++ { + metrics = append(metrics, *laMetrics[i]) + } + + laTelegrafMetrics := InsightsMetricsBlob{ + DataType: InsightsMetricsDataType, + IPName: IPName, + DataItems: metrics} + + jsonBytes, err := json.Marshal(laTelegrafMetrics) + + if err != nil { + message := fmt.Sprintf("PostTelegrafMetricsToLA::Error:when marshalling json %q", err) + Log(message) + SendException(message) + return output.FLB_OK + } + + //Post metrics data to LA + req, _ := http.NewRequest("POST", OMSEndpoint, bytes.NewBuffer(jsonBytes)) + + //req.URL.Query().Add("api-version","2016-04-01") + + //set headers + req.Header.Set("x-ms-date", time.Now().Format(time.RFC3339)) + + //expensive to do string len for every request, so use a flag + if ResourceCentric == true { + req.Header.Set("x-ms-AzureResourceId", ResourceID) + } + + start := time.Now() + resp, err := 
HTTPClient.Do(req) + elapsed := time.Since(start) + + if err != nil { + message := fmt.Sprintf("PostTelegrafMetricsToLA::Error:(retriable) when sending %v metrics. duration:%v err:%q \n", len(laMetrics), elapsed, err.Error()) + Log(message) + SendException(message) + UpdateNumTelegrafMetricsSentTelemetry(0, 1) + return output.FLB_RETRY + } + + if resp == nil { + UpdateNumTelegrafMetricsSentTelemetry(0, 1) + return output.FLB_RETRY + } + defer resp.Body.Close() //close the body on every return path below, including the non-200 retry + if resp.StatusCode != 200 { + Log("PostTelegrafMetricsToLA::Error:(retriable) Response Status %v Status Code %v", resp.Status, resp.StatusCode) + UpdateNumTelegrafMetricsSentTelemetry(0, 1) + return output.FLB_RETRY + } + numMetrics := len(laMetrics) + UpdateNumTelegrafMetricsSentTelemetry(numMetrics, 0) + Log("PostTelegrafMetricsToLA::Info:Successfully flushed %v records in %v", numMetrics, elapsed) + + return output.FLB_OK +} + +func UpdateNumTelegrafMetricsSentTelemetry(numMetricsSent int, numSendErrors int) { + ContainerLogTelemetryMutex.Lock() + TelegrafMetricsSentCount += float64(numMetricsSent) + TelegrafMetricsSendErrorCount += float64(numSendErrors) + ContainerLogTelemetryMutex.Unlock() +} + // PostDataHelper sends data to the OMS endpoint func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { @@ -281,7 +496,7 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { if len(dataItems) > 0 { logEntry := ContainerLogBlob{ - DataType: DataType, + DataType: ContainerLogDataType, IPName: IPName, DataItems: dataItems} @@ -294,6 +509,10 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { } req, _ := http.NewRequest("POST", OMSEndpoint, bytes.NewBuffer(marshalled)) req.Header.Set("Content-Type", "application/json") + //expensive to do string len for every request, so use a flag + if ResourceCentric == true { + req.Header.Set("x-ms-AzureResourceId", ResourceID) + } resp, err := HTTPClient.Do(req) elapsed := time.Since(start) @@ -376,9 +595,30 @@ func InitializePlugin(pluginConfPath string, 
agentVersion string) { log.Fatalln(message) } OMSEndpoint = omsadminConf["OMS_ENDPOINT"] - WorkspaceID = omsadminConf["WORKSPACE_ID"] Log("OMSEndpoint %s", OMSEndpoint) + WorkspaceID = omsadminConf["WORKSPACE_ID"] + ResourceID = os.Getenv("customResourceId") + + if len(ResourceID) > 0 { + //AKS Scenario + ResourceCentric = true + splitted := strings.Split(ResourceID, "/") + ResourceName = splitted[len(splitted)-1] + Log("ResourceCentric: True") + Log("ResourceID=%s",ResourceID) + Log("ResourceName=%s",ResourceID) + } + + if ResourceCentric == false { + //AKS-Engine/hybrid scenario + ResourceName = os.Getenv(ResourceNameEnv) + ResourceID = ResourceName + Log("ResourceCentric: False") + Log("ResourceID=%s",ResourceID) + Log("ResourceName=%s",ResourceName) + } + // Initialize image,name map refresh ticker containerInventoryRefreshInterval, err := strconv.Atoi(pluginConfig["container_inventory_refresh_interval"]) if err != nil { diff --git a/source/code/go/src/plugins/out_oms.go b/source/code/go/src/plugins/out_oms.go index 133e0f039..dccc6774c 100644 --- a/source/code/go/src/plugins/out_oms.go +++ b/source/code/go/src/plugins/out_oms.go @@ -2,11 +2,13 @@ package main import ( "github.com/fluent/fluent-bit-go/output" + "github.com/Microsoft/ApplicationInsights-Go/appinsights" ) import ( "C" "strings" "unsafe" + "os" ) //export FLBPluginRegister @@ -19,8 +21,14 @@ func FLBPluginRegister(ctx unsafe.Pointer) int { // ctx (context) pointer to fluentbit context (state/ c code) func FLBPluginInit(ctx unsafe.Pointer) int { Log("Initializing out_oms go plugin for fluentbit") - agentVersion := output.FLBPluginConfigKey(ctx, "AgentVersion") - InitializePlugin(ContainerLogPluginConfFilePath, agentVersion) + agentVersion := os.Getenv("AGENT_VERSION") + if strings.Compare(strings.ToLower(os.Getenv("CONTROLLER_TYPE")), "replicaset") == 0 { + Log("Using %s for plugin config \n", ReplicaSetContainerLogPluginConfFilePath) + InitializePlugin(ReplicaSetContainerLogPluginConfFilePath, 
agentVersion) + } else { + Log("Using %s for plugin config \n", DaemonSetContainerLogPluginConfFilePath) + InitializePlugin(DaemonSetContainerLogPluginConfFilePath, agentVersion) + } enableTelemetry := output.FLBPluginConfigKey(ctx, "EnableTelemetry") if strings.Compare(strings.ToLower(enableTelemetry), "true") == 0 { telemetryPushInterval := output.FLBPluginConfigKey(ctx, "TelemetryPushIntervalSeconds") @@ -51,9 +59,13 @@ func FLBPluginFlush(data unsafe.Pointer, length C.int, tag *C.char) int { records = append(records, record) } - incomingTag := C.GoString(tag) - if strings.Contains(strings.ToLower(incomingTag), "oms.container.log.flbplugin") { - return PushToAppInsightsTraces(records) + incomingTag := strings.ToLower(C.GoString(tag)) + if strings.Contains(incomingTag, "oms.container.log.flbplugin") { + return PushToAppInsightsTraces(records, appinsights.Information, incomingTag) + } else if strings.Contains(incomingTag, "oms.container.perf.telegraf") { + return PostTelegrafMetricsToLA(records) + } else if strings.Contains(incomingTag, "oms.container.log.telegraf.err") { + return PushToAppInsightsTraces(records, appinsights.Error, incomingTag) } return PostDataHelper(records) diff --git a/source/code/go/src/plugins/telemetry.go b/source/code/go/src/plugins/telemetry.go index a64ca2218..f507e4ab9 100644 --- a/source/code/go/src/plugins/telemetry.go +++ b/source/code/go/src/plugins/telemetry.go @@ -9,11 +9,12 @@ import ( "time" "github.com/Microsoft/ApplicationInsights-Go/appinsights" + "github.com/Microsoft/ApplicationInsights-Go/appinsights/contracts" "github.com/fluent/fluent-bit-go/output" ) var ( - // FlushedRecordsCount indicates the number of flushed records in the current period + // FlushedRecordsCount indicates the number of flushed log records in the current period FlushedRecordsCount float64 // FlushedRecordsTimeTaken indicates the cumulative time taken to flush the records for the current period FlushedRecordsTimeTaken float64 @@ -27,19 +28,23 @@ var ( 
TelemetryClient appinsights.TelemetryClient // ContainerLogTelemetryTicker sends telemetry periodically ContainerLogTelemetryTicker *time.Ticker + //Tracks the number of telegraf metrics sent successfully between telemetry ticker periods (uses ContainerLogTelemetryTicker) + TelegrafMetricsSentCount float64 + //Tracks the number of send errors between telemetry ticker periods (uses ContainerLogTelemetryTicker) + TelegrafMetricsSendErrorCount float64 ) const ( clusterTypeACS = "ACS" clusterTypeAKS = "AKS" - controllerTypeDaemonSet = "DaemonSet" - controllerTypeReplicaSet = "ReplicaSet" envAKSResourceID = "AKS_RESOURCE_ID" envACSResourceName = "ACS_RESOURCE_NAME" envAppInsightsAuth = "APPLICATIONINSIGHTS_AUTH" metricNameAvgFlushRate = "ContainerLogAvgRecordsFlushedPerSec" metricNameAvgLogGenerationRate = "ContainerLogsGeneratedPerSec" metricNameAgentLogProcessingMaxLatencyMs = "ContainerLogsAgentSideLatencyMs" + metricNameNumberofTelegrafMetricsSentSuccessfully = "TelegrafMetricsSentCount" + metricNameNumberofSendErrorsTelegrafMetrics = "TelegrafMetricsSendErrorCount" defaultTelemetryPushIntervalSeconds = 300 @@ -63,9 +68,14 @@ func SendContainerLogPluginMetrics(telemetryPushIntervalProperty string) { for ; true; <-ContainerLogTelemetryTicker.C { SendEvent(eventNameDaemonSetHeartbeat, make(map[string]string)) elapsed := time.Since(start) + ContainerLogTelemetryMutex.Lock() flushRate := FlushedRecordsCount / FlushedRecordsTimeTaken * 1000 logRate := FlushedRecordsCount / float64(elapsed/time.Second) + telegrafMetricsSentCount := TelegrafMetricsSentCount + telegrafMetricsSendErrorCount := TelegrafMetricsSendErrorCount + TelegrafMetricsSentCount = 0.0 + TelegrafMetricsSendErrorCount = 0.0 FlushedRecordsCount = 0.0 FlushedRecordsTimeTaken = 0.0 logLatencyMs := AgentLogProcessingMaxLatencyMs @@ -81,6 +91,8 @@ func SendContainerLogPluginMetrics(telemetryPushIntervalProperty string) { logLatencyMetric := 
appinsights.NewMetricTelemetry(metricNameAgentLogProcessingMaxLatencyMs, logLatencyMs) logLatencyMetric.Properties["Container"] = logLatencyMsContainer TelemetryClient.Track(logLatencyMetric) + TelemetryClient.Track(appinsights.NewMetricTelemetry(metricNameNumberofTelegrafMetricsSentSuccessfully, telegrafMetricsSentCount)) + TelemetryClient.Track(appinsights.NewMetricTelemetry(metricNameNumberofSendErrorsTelegrafMetrics, telegrafMetricsSendErrorCount)) start = time.Now() } } @@ -129,7 +141,7 @@ func InitializeTelemetryClient(agentVersion string) (int, error) { CommonProperties = make(map[string]string) CommonProperties["Computer"] = Computer CommonProperties["WorkspaceID"] = WorkspaceID - CommonProperties["ControllerType"] = controllerTypeDaemonSet + CommonProperties["ControllerType"] = os.Getenv("CONTROLLER_TYPE") CommonProperties["AgentVersion"] = agentVersion aksResourceID := os.Getenv(envAKSResourceID) @@ -164,13 +176,15 @@ func InitializeTelemetryClient(agentVersion string) (int, error) { } // PushToAppInsightsTraces sends the log lines as trace messages to the configured App Insights Instance -func PushToAppInsightsTraces(records []map[interface{}]interface{}) int { +func PushToAppInsightsTraces(records []map[interface{}]interface{}, severityLevel contracts.SeverityLevel, tag string) int { var logLines []string for _, record := range records { logLines = append(logLines, ToString(record["log"])) } traceEntry := strings.Join(logLines, "\n") - TelemetryClient.TrackTrace(traceEntry, 1) + traceTelemetryItem := appinsights.NewTraceTelemetry(traceEntry, severityLevel) + traceTelemetryItem.Properties["tag"] = tag + TelemetryClient.Track(traceTelemetryItem) return output.FLB_OK } diff --git a/source/code/plugin/ApplicationInsightsUtility.rb b/source/code/plugin/ApplicationInsightsUtility.rb index 5c5e92a6c..5dc2bfab8 100644 --- a/source/code/plugin/ApplicationInsightsUtility.rb +++ b/source/code/plugin/ApplicationInsightsUtility.rb @@ -2,209 +2,222 @@ # 
frozen_string_literal: true class ApplicationInsightsUtility - require_relative 'lib/application_insights' - require_relative 'omslog' - require_relative 'DockerApiClient' - require_relative 'oms_common' - require 'json' - require 'base64' + require_relative "lib/application_insights" + require_relative "omslog" + require_relative "DockerApiClient" + require_relative "oms_common" + require "json" + require "base64" - @@HeartBeat = 'HeartBeatEvent' - @@Exception = 'ExceptionEvent' - @@AcsClusterType = 'ACS' - @@AksClusterType = 'AKS' - @OmsAdminFilePath = '/etc/opt/microsoft/omsagent/conf/omsadmin.conf' - @@EnvAcsResourceName = 'ACS_RESOURCE_NAME' - @@EnvAksRegion = 'AKS_REGION' - @@EnvAgentVersion = 'AGENT_VERSION' - @@EnvApplicationInsightsKey = 'APPLICATIONINSIGHTS_AUTH' - @@EnvControllerType = 'CONTROLLER_TYPE' + @@HeartBeat = "HeartBeatEvent" + @@Exception = "ExceptionEvent" + @@AcsClusterType = "ACS" + @@AksClusterType = "AKS" + @OmsAdminFilePath = "/etc/opt/microsoft/omsagent/conf/omsadmin.conf" + @@EnvAcsResourceName = "ACS_RESOURCE_NAME" + @@EnvAksRegion = "AKS_REGION" + @@EnvAgentVersion = "AGENT_VERSION" + @@EnvApplicationInsightsKey = "APPLICATIONINSIGHTS_AUTH" + @@EnvControllerType = "CONTROLLER_TYPE" - @@CustomProperties = {} - @@Tc = nil - @@hostName = (OMS::Common.get_hostname) + @@CustomProperties = {} + @@Tc = nil + @@hostName = (OMS::Common.get_hostname) - def initialize - end + def initialize + end - class << self - #Set default properties for telemetry event - def initializeUtility() - begin - resourceInfo = ENV['AKS_RESOURCE_ID'] - if resourceInfo.nil? || resourceInfo.empty? 
- @@CustomProperties["ACSResourceName"] = ENV[@@EnvAcsResourceName] - @@CustomProperties["ClusterType"] = @@AcsClusterType - @@CustomProperties["SubscriptionID"] = "" - @@CustomProperties["ResourceGroupName"] = "" - @@CustomProperties["ClusterName"] = "" - @@CustomProperties["Region"] = "" - else - @@CustomProperties["AKS_RESOURCE_ID"] = resourceInfo - begin - splitStrings = resourceInfo.split('/') - subscriptionId = splitStrings[2] - resourceGroupName = splitStrings[4] - clusterName = splitStrings[8] - rescue => errorStr - $log.warn("Exception in AppInsightsUtility: parsing AKS resourceId: #{resourceInfo}, error: #{errorStr}") - end - @@CustomProperties["ClusterType"] = @@AksClusterType - @@CustomProperties["SubscriptionID"] = subscriptionId - @@CustomProperties["ResourceGroupName"] = resourceGroupName - @@CustomProperties["ClusterName"] = clusterName - @@CustomProperties["Region"] = ENV[@@EnvAksRegion] - end + class << self + #Set default properties for telemetry event + def initializeUtility() + begin + resourceInfo = ENV["AKS_RESOURCE_ID"] + if resourceInfo.nil? || resourceInfo.empty? 
+ @@CustomProperties["ACSResourceName"] = ENV[@@EnvAcsResourceName] + @@CustomProperties["ClusterType"] = @@AcsClusterType + @@CustomProperties["SubscriptionID"] = "" + @@CustomProperties["ResourceGroupName"] = "" + @@CustomProperties["ClusterName"] = "" + @@CustomProperties["Region"] = "" + else + @@CustomProperties["AKS_RESOURCE_ID"] = resourceInfo + begin + splitStrings = resourceInfo.split("/") + subscriptionId = splitStrings[2] + resourceGroupName = splitStrings[4] + clusterName = splitStrings[8] + rescue => errorStr + $log.warn("Exception in AppInsightsUtility: parsing AKS resourceId: #{resourceInfo}, error: #{errorStr}") + end + @@CustomProperties["ClusterType"] = @@AksClusterType + @@CustomProperties["SubscriptionID"] = subscriptionId + @@CustomProperties["ResourceGroupName"] = resourceGroupName + @@CustomProperties["ClusterName"] = clusterName + @@CustomProperties["Region"] = ENV[@@EnvAksRegion] + end - getDockerInfo() - @@CustomProperties['WorkspaceID'] = getWorkspaceId - @@CustomProperties['AgentVersion'] = ENV[@@EnvAgentVersion] - @@CustomProperties['ControllerType'] = ENV[@@EnvControllerType] - encodedAppInsightsKey = ENV[@@EnvApplicationInsightsKey] + #Commenting it for now from initilize method, we need to pivot all telemetry off of kubenode docker version + #getDockerInfo() + @@CustomProperties["WorkspaceID"] = getWorkspaceId + @@CustomProperties["AgentVersion"] = ENV[@@EnvAgentVersion] + @@CustomProperties["ControllerType"] = ENV[@@EnvControllerType] + encodedAppInsightsKey = ENV[@@EnvApplicationInsightsKey] - #Check if telemetry is turned off - telemetryOffSwitch = ENV['DISABLE_TELEMETRY'] - if telemetryOffSwitch && !telemetryOffSwitch.nil? && !telemetryOffSwitch.empty? && telemetryOffSwitch.downcase == "true".downcase - $log.warn("AppInsightsUtility: Telemetry is disabled") - @@Tc = ApplicationInsights::TelemetryClient.new - elsif !encodedAppInsightsKey.nil? 
- decodedAppInsightsKey = Base64.decode64(encodedAppInsightsKey) - @@Tc = ApplicationInsights::TelemetryClient.new decodedAppInsightsKey - - end - rescue => errorStr - $log.warn("Exception in AppInsightsUtility: initilizeUtility - error: #{errorStr}") - end + #Check if telemetry is turned off + telemetryOffSwitch = ENV["DISABLE_TELEMETRY"] + if telemetryOffSwitch && !telemetryOffSwitch.nil? && !telemetryOffSwitch.empty? && telemetryOffSwitch.downcase == "true".downcase + $log.warn("AppInsightsUtility: Telemetry is disabled") + @@Tc = ApplicationInsights::TelemetryClient.new + elsif !encodedAppInsightsKey.nil? + decodedAppInsightsKey = Base64.decode64(encodedAppInsightsKey) + @@Tc = ApplicationInsights::TelemetryClient.new decodedAppInsightsKey end + rescue => errorStr + $log.warn("Exception in AppInsightsUtility: initilizeUtility - error: #{errorStr}") + end + end - def getDockerInfo() - dockerInfo = DockerApiClient.dockerInfo - if (!dockerInfo.nil? && !dockerInfo.empty?) - @@CustomProperties['DockerVersion'] = dockerInfo['Version'] - @@CustomProperties['DockerApiVersion'] = dockerInfo['ApiVersion'] - end - end + def getDockerInfo() + dockerInfo = DockerApiClient.dockerInfo + if (!dockerInfo.nil? && !dockerInfo.empty?) + @@CustomProperties["DockerVersion"] = dockerInfo["Version"] + #@@CustomProperties["DockerApiVersion"] = dockerInfo["ApiVersion"] + end + end - def sendHeartBeatEvent(pluginName) - begin - eventName = pluginName + @@HeartBeat - if !(@@Tc.nil?) - @@Tc.track_event eventName , :properties => @@CustomProperties - @@Tc.flush - $log.info("AppInsights Heartbeat Telemetry sent successfully") - end - rescue =>errorStr - $log.warn("Exception in AppInsightsUtility: sendHeartBeatEvent - error: #{errorStr}") - end + def sendHeartBeatEvent(pluginName) + begin + eventName = pluginName + @@HeartBeat + if !(@@Tc.nil?) 
+ @@Tc.track_event eventName, :properties => @@CustomProperties + @@Tc.flush + $log.info("AppInsights Heartbeat Telemetry sent successfully") end + rescue => errorStr + $log.warn("Exception in AppInsightsUtility: sendHeartBeatEvent - error: #{errorStr}") + end + end - def sendLastProcessedContainerInventoryCountMetric(pluginName, properties) - begin - if !(@@Tc.nil?) - @@Tc.track_metric 'LastProcessedContainerInventoryCount', properties['ContainerCount'], - :kind => ApplicationInsights::Channel::Contracts::DataPointType::MEASUREMENT, - :properties => @@CustomProperties - @@Tc.flush - $log.info("AppInsights Container Count Telemetry sent successfully") - end - rescue => errorStr - $log.warn("Exception in AppInsightsUtility: sendCustomMetric - error: #{errorStr}") - end + def sendLastProcessedContainerInventoryCountMetric(pluginName, properties) + begin + if !(@@Tc.nil?) + @@Tc.track_metric "LastProcessedContainerInventoryCount", properties["ContainerCount"], + :kind => ApplicationInsights::Channel::Contracts::DataPointType::MEASUREMENT, + :properties => @@CustomProperties + @@Tc.flush + $log.info("AppInsights Container Count Telemetry sent successfully") end + rescue => errorStr + $log.warn("Exception in AppInsightsUtility: sendCustomMetric - error: #{errorStr}") + end + end - def sendCustomEvent(eventName, properties) - begin - if @@CustomProperties.empty? || @@CustomProperties.nil? - initializeUtility() - end - if !(@@Tc.nil?) - @@Tc.track_event eventName, :properties => @@CustomProperties - @@Tc.flush - $log.info("AppInsights Custom Event #{eventName} sent successfully") - end - rescue => errorStr - $log.warn("Exception in AppInsightsUtility: sendCustomEvent - error: #{errorStr}") - end + def sendCustomEvent(eventName, properties) + begin + if @@CustomProperties.empty? || @@CustomProperties.nil? 
+ initializeUtility() + end + telemetryProps = {} + # add common dimensions + @@CustomProperties.each { |k, v| telemetryProps[k] = v } + # add passed-in dimensions if any + if (!properties.nil? && !properties.empty?) + properties.each { |k, v| telemetryProps[k] = v } + end + if !(@@Tc.nil?) + @@Tc.track_event eventName, :properties => telemetryProps + @@Tc.flush + $log.info("AppInsights Custom Event #{eventName} sent successfully") end + rescue => errorStr + $log.warn("Exception in AppInsightsUtility: sendCustomEvent - error: #{errorStr}") + end + end - def sendExceptionTelemetry(errorStr) - begin - if @@CustomProperties.empty? || @@CustomProperties.nil? - initializeUtility() - elsif @@CustomProperties['DockerVersion'].nil? - getDockerInfo() - end - if !(@@Tc.nil?) - @@Tc.track_exception errorStr , :properties => @@CustomProperties - @@Tc.flush - $log.info("AppInsights Exception Telemetry sent successfully") - end - rescue => errorStr - $log.warn("Exception in AppInsightsUtility: sendExceptionTelemetry - error: #{errorStr}") - end + def sendExceptionTelemetry(errorStr, properties = nil) + begin + if @@CustomProperties.empty? || @@CustomProperties.nil? + initializeUtility() + elsif @@CustomProperties["DockerVersion"].nil? + getDockerInfo() + end + telemetryProps = {} + # add common dimensions + @@CustomProperties.each { |k, v| telemetryProps[k] = v } + # add passed-in dimensions if any + if (!properties.nil? && !properties.empty?) + properties.each { |k, v| telemetryProps[k] = v } + end + if !(@@Tc.nil?) + @@Tc.track_exception errorStr, :properties => telemetryProps + @@Tc.flush + $log.info("AppInsights Exception Telemetry sent successfully") end + rescue => errorStr + $log.warn("Exception in AppInsightsUtility: sendExceptionTelemetry - error: #{errorStr}") + end + end - #Method to send heartbeat and container inventory count - def sendTelemetry(pluginName, properties) - begin - if @@CustomProperties.empty? || @@CustomProperties.nil? 
- initializeUtility() - elsif @@CustomProperties['DockerVersion'].nil? - getDockerInfo() - end - @@CustomProperties['Computer'] = properties['Computer'] - sendHeartBeatEvent(pluginName) - sendLastProcessedContainerInventoryCountMetric(pluginName, properties) - rescue => errorStr - $log.warn("Exception in AppInsightsUtility: sendTelemetry - error: #{errorStr}") - end + #Method to send heartbeat and container inventory count + def sendTelemetry(pluginName, properties) + begin + if @@CustomProperties.empty? || @@CustomProperties.nil? + initializeUtility() + elsif @@CustomProperties["DockerVersion"].nil? + getDockerInfo() end + @@CustomProperties["Computer"] = properties["Computer"] + sendHeartBeatEvent(pluginName) + sendLastProcessedContainerInventoryCountMetric(pluginName, properties) + rescue => errorStr + $log.warn("Exception in AppInsightsUtility: sendTelemetry - error: #{errorStr}") + end + end - #Method to send metric. It will merge passed-in properties with common custom properties - def sendMetricTelemetry(metricName, metricValue, properties) - begin - if (metricName.empty? || metricName.nil?) - $log.warn("SendMetricTelemetry: metricName is missing") - return - end - if @@CustomProperties.empty? || @@CustomProperties.nil? - initializeUtility() - elsif @@CustomProperties['DockerVersion'].nil? - getDockerInfo() - end - telemetryProps = {} - telemetryProps["Computer"] = @@hostName - # add common dimensions - @@CustomProperties.each{ |k,v| telemetryProps[k]=v} - # add passed-in dimensions if any - if (!properties.nil? && !properties.empty?) - properties.each{ |k,v| telemetryProps[k]=v} - end - if !(@@Tc.nil?) 
- @@Tc.track_metric metricName, metricValue, - :kind => ApplicationInsights::Channel::Contracts::DataPointType::MEASUREMENT, - :properties => telemetryProps - @@Tc.flush - $log.info("AppInsights metric Telemetry #{metricName} sent successfully") - end - rescue => errorStr - $log.warn("Exception in AppInsightsUtility: sendMetricTelemetry - error: #{errorStr}") - end + #Method to send metric. It will merge passed-in properties with common custom properties + def sendMetricTelemetry(metricName, metricValue, properties) + begin + if (metricName.empty? || metricName.nil?) + $log.warn("SendMetricTelemetry: metricName is missing") + return end + if @@CustomProperties.empty? || @@CustomProperties.nil? + initializeUtility() + elsif @@CustomProperties["DockerVersion"].nil? + getDockerInfo() + end + telemetryProps = {} + # add common dimensions + @@CustomProperties.each { |k, v| telemetryProps[k] = v } + # add passed-in dimensions if any + if (!properties.nil? && !properties.empty?) + properties.each { |k, v| telemetryProps[k] = v } + end + if !(@@Tc.nil?) 
+ @@Tc.track_metric metricName, metricValue, + :kind => ApplicationInsights::Channel::Contracts::DataPointType::MEASUREMENT, + :properties => telemetryProps + @@Tc.flush + $log.info("AppInsights metric Telemetry #{metricName} sent successfully") + end + rescue => errorStr + $log.warn("Exception in AppInsightsUtility: sendMetricTelemetry - error: #{errorStr}") + end + end - def getWorkspaceId() - begin - adminConf = {} - confFile = File.open(@OmsAdminFilePath, "r") - confFile.each_line do |line| - splitStrings = line.split('=') - adminConf[splitStrings[0]] = splitStrings[1] - end - workspaceId = adminConf['WORKSPACE_ID'] - return workspaceId - rescue => errorStr - $log.warn("Exception in AppInsightsUtility: getWorkspaceId - error: #{errorStr}") - end + def getWorkspaceId() + begin + adminConf = {} + confFile = File.open(@OmsAdminFilePath, "r") + confFile.each_line do |line| + splitStrings = line.split("=") + adminConf[splitStrings[0]] = splitStrings[1] end + workspaceId = adminConf["WORKSPACE_ID"] + return workspaceId + rescue => errorStr + $log.warn("Exception in AppInsightsUtility: getWorkspaceId - error: #{errorStr}") + end end -end \ No newline at end of file + end +end diff --git a/source/code/plugin/CAdvisorMetricsAPIClient.rb b/source/code/plugin/CAdvisorMetricsAPIClient.rb index 3c36775af..35cf727cf 100644 --- a/source/code/plugin/CAdvisorMetricsAPIClient.rb +++ b/source/code/plugin/CAdvisorMetricsAPIClient.rb @@ -2,424 +2,629 @@ # frozen_string_literal: true class CAdvisorMetricsAPIClient - - require 'json' - require 'logger' - require 'net/http' - require 'net/https' - require 'uri' - require 'date' - - require_relative 'oms_common' - require_relative 'KubernetesApiClient' - require_relative 'ApplicationInsightsUtility' - - @LogPath = "/var/opt/microsoft/docker-cimprov/log/kubernetes_perf_log.txt" - @Log = Logger.new(@LogPath, 2, 10*1048576) #keep last 2 files, max log file size = 10M - @@rxBytesLast = nil - @@rxBytesTimeLast = nil - @@txBytesLast = nil - 
@@txBytesTimeLast = nil - @@nodeCpuUsageNanoSecondsLast = nil - @@nodeCpuUsageNanoSecondsTimeLast = nil - @@telemetryCpuMetricTimeTracker = DateTime.now.to_time.to_i - @@telemetryMemoryMetricTimeTracker = DateTime.now.to_time.to_i - - - def initialize + require "json" + require "logger" + require "net/http" + require "net/https" + require "uri" + require "date" + + require_relative "oms_common" + require_relative "KubernetesApiClient" + require_relative "ApplicationInsightsUtility" + + @LogPath = "/var/opt/microsoft/docker-cimprov/log/kubernetes_perf_log.txt" + @Log = Logger.new(@LogPath, 2, 10 * 1048576) #keep last 2 files, max log file size = 10M + # @@rxBytesLast = nil + # @@rxBytesTimeLast = nil + # @@txBytesLast = nil + # @@txBytesTimeLast = nil + @@nodeCpuUsageNanoSecondsLast = nil + @@nodeCpuUsageNanoSecondsTimeLast = nil + @@winNodeCpuUsageNanoSecondsLast = {} + @@winNodeCpuUsageNanoSecondsTimeLast = {} + @@winContainerCpuUsageNanoSecondsLast = {} + @@winContainerCpuUsageNanoSecondsTimeLast = {} + @@winContainerPrevMetricRate = {} + @@linuxNodePrevMetricRate = nil + @@winNodePrevMetricRate = {} + @@telemetryCpuMetricTimeTracker = DateTime.now.to_time.to_i + @@telemetryMemoryMetricTimeTracker = DateTime.now.to_time.to_i + + #Containers a hash of node name and the last time telemetry was sent for this node + @@nodeTelemetryTimeTracker = {} + + # Keeping track of containers so that can delete the container from the container cpu cache when the container is deleted + # as a part of the cleanup routine + @@winContainerIdCache = [] + + def initialize + end + + class << self + def getSummaryStatsFromCAdvisor(winNode) + headers = {} + response = nil + @Log.info "Getting CAdvisor Uri" + begin + cAdvisorUri = getCAdvisorUri(winNode) + if !cAdvisorUri.nil? 
+ uri = URI.parse(cAdvisorUri) + http = Net::HTTP.new(uri.host, uri.port) + http.use_ssl = false + + cAdvisorApiRequest = Net::HTTP::Get.new(uri.request_uri) + response = http.request(cAdvisorApiRequest) + @Log.info "Got response code #{response.code} from #{uri.request_uri}" + end + rescue => error + @Log.warn("CAdvisor api request failed: #{error}") + telemetryProps = {} + telemetryProps["Computer"] = winNode["Hostname"] + ApplicationInsightsUtility.sendExceptionTelemetry(error, telemetryProps) + end + return response + end + + def getCAdvisorUri(winNode) + begin + defaultHost = "http://localhost:10255" + relativeUri = "/stats/summary" + if !winNode.nil? + nodeIP = winNode["InternalIP"] + else + nodeIP = ENV["NODE_IP"] + end + if !nodeIP.nil? + @Log.info("Using #{nodeIP + relativeUri} for CAdvisor Uri") + return "http://#{nodeIP}:10255" + relativeUri + else + @Log.warn ("NODE_IP environment variable not set. Using default as : #{defaultHost + relativeUri} ") + if !winNode.nil? + return nil + else + return defaultHost + relativeUri + end + end + end + end + + def getMetrics(winNode = nil) + metricDataItems = [] + begin + if !winNode.nil? + hostName = winNode["Hostname"] + operatingSystem = "Windows" + else + hostName = (OMS::Common.get_hostname) + operatingSystem = "Linux" + end + cAdvisorStats = getSummaryStatsFromCAdvisor(winNode) + if !cAdvisorStats.nil? + metricInfo = JSON.parse(cAdvisorStats.body) + end + if !metricInfo.nil? 
+ metricDataItems.concat(getContainerMemoryMetricItems(metricInfo, hostName, "workingSetBytes", "memoryWorkingSetBytes")) + metricDataItems.concat(getContainerStartTimeMetricItems(metricInfo, hostName, "restartTimeEpoch")) + + if operatingSystem == "Linux" + metricDataItems.concat(getContainerCpuMetricItems(metricInfo, hostName, "usageNanoCores", "cpuUsageNanoCores")) + metricDataItems.concat(getContainerMemoryMetricItems(metricInfo, hostName, "rssBytes", "memoryRssBytes")) + metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "memory", "rssBytes", "memoryRssBytes")) + elsif operatingSystem == "Windows" + containerCpuUsageNanoSecondsRate = getContainerCpuMetricItemRate(metricInfo, hostName, "usageCoreNanoSeconds", "cpuUsageNanoCores") + if containerCpuUsageNanoSecondsRate && !containerCpuUsageNanoSecondsRate.empty? && !containerCpuUsageNanoSecondsRate.nil? + metricDataItems.concat(containerCpuUsageNanoSecondsRate) end - - class << self - def getSummaryStatsFromCAdvisor() - headers = {} - response = nil - @Log.info 'Getting CAdvisor Uri' - begin - cAdvisorUri = getCAdvisorUri() - if !cAdvisorUri.nil? - uri = URI.parse(cAdvisorUri) - http = Net::HTTP.new(uri.host, uri.port) - http.use_ssl = false - - cAdvisorApiRequest = Net::HTTP::Get.new(uri.request_uri) - response = http.request(cAdvisorApiRequest) - @Log.info "Got response code #{response.code} from #{uri.request_uri}" - end - rescue => error - @Log.warn("CAdvisor api request failed: #{error}") - end - return response - end - - def getCAdvisorUri() - begin - defaultHost = "http://localhost:10255" - relativeUri = "/stats/summary" - nodeIP = ENV['NODE_IP'] - if !nodeIP.nil? - @Log.info("Using #{nodeIP + relativeUri} for CAdvisor Uri") - return "http://#{nodeIP}:10255" + relativeUri - else - @Log.warn ("NODE_IP environment variable not set. 
Using default as : #{defaultHost + relativeUri} ") - return defaultHost + relativeUri - end - end - end - - def getMetrics() - metricDataItems = [] - begin - hostName = (OMS::Common.get_hostname) - metricInfo = JSON.parse(getSummaryStatsFromCAdvisor().body) - metricDataItems.concat(getContainerCpuMetricItems(metricInfo, hostName, "usageNanoCores","cpuUsageNanoCores")) - metricDataItems.concat(getContainerMemoryMetricItems(metricInfo, hostName, "workingSetBytes", "memoryWorkingSetBytes")) - metricDataItems.concat(getContainerMemoryMetricItems(metricInfo, hostName, "rssBytes", "memoryRssBytes")) - metricDataItems.concat(getContainerStartTimeMetricItems(metricInfo, hostName, "restartTimeEpoch")) - - cpuUsageNanoSecondsRate = getNodeMetricItemRate(metricInfo, hostName, "cpu", "usageCoreNanoSeconds", "cpuUsageNanoCores") - if cpuUsageNanoSecondsRate && !cpuUsageNanoSecondsRate.empty? && !cpuUsageNanoSecondsRate.nil? - metricDataItems.push(cpuUsageNanoSecondsRate) - end - metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "memory", "workingSetBytes", "memoryWorkingSetBytes")) - metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "memory", "rssBytes", "memoryRssBytes")) - metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "network", "rxBytes", "networkRxBytes")) - metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "network", "txBytes", "networkTxBytes")) - metricDataItems.push(getNodeLastRebootTimeMetric(metricInfo, hostName, "restartTimeEpoch")) - - networkRxRate = getNodeMetricItemRate(metricInfo, hostName, "network", "rxBytes", "networkRxBytesPerSec") - if networkRxRate && !networkRxRate.empty? && !networkRxRate.nil? - metricDataItems.push(networkRxRate) - end - networkTxRate = getNodeMetricItemRate(metricInfo, hostName, "network", "txBytes", "networkTxBytesPerSec") - if networkTxRate && !networkTxRate.empty? && !networkTxRate.nil? 
- metricDataItems.push(networkTxRate) - end - - - rescue => error - @Log.warn("getContainerMetrics failed: #{error}") - return metricDataItems - end - return metricDataItems - end + end - def getContainerCpuMetricItems(metricJSON, hostName, cpuMetricNameToCollect, metricNametoReturn) - metricItems = [] - clusterId = KubernetesApiClient.getClusterId - timeDifference = (DateTime.now.to_time.to_i - @@telemetryCpuMetricTimeTracker).abs - timeDifferenceInMinutes = timeDifference/60 - begin - metricInfo = metricJSON - metricInfo['pods'].each do |pod| - podUid = pod['podRef']['uid'] - podName = pod['podRef']['name'] - podNamespace = pod['podRef']['namespace'] - - if (!pod['containers'].nil?) - pod['containers'].each do |container| - #cpu metric - containerName = container['name'] - metricValue = container['cpu'][cpuMetricNameToCollect] - metricTime = container['cpu']['time'] - metricItem = {} - metricItem['DataItems'] = [] - - metricProps = {} - metricProps['Timestamp'] = metricTime - metricProps['Host'] = hostName - metricProps['ObjectName'] = "K8SContainer" - metricProps['InstanceName'] = clusterId + "/" + podUid + "/" + containerName - - metricProps['Collections'] = [] - metricCollections = {} - metricCollections['CounterName'] = metricNametoReturn - metricCollections['Value'] = metricValue - - metricProps['Collections'].push(metricCollections) - metricItem['DataItems'].push(metricProps) - metricItems.push(metricItem) - #Telemetry about agent performance - begin - # we can only do this much now. Ideally would like to use the docker image repository to find our pods/containers - # cadvisor does not have pod/container metadata. 
so would need more work to cache as pv & use - if (podName.downcase.start_with?('omsagent-') && podNamespace.eql?("kube-system") && containerName.downcase.start_with?('omsagent') && metricNametoReturn.eql?("cpuUsageNanoCores")) - - if (timeDifferenceInMinutes >= 10) - telemetryProps = {} - telemetryProps['PodName'] = podName - telemetryProps['ContainerName'] = containerName - ApplicationInsightsUtility.sendMetricTelemetry(metricNametoReturn, metricValue, telemetryProps) - end - end - rescue => errorStr - $log.warn("Exception while generating Telemetry from getcontainerCpuMetricItems failed: #{errorStr} for metric #{cpuMetricNameToCollect}") - end - end - end - end - # reset time outside pod iterator as we use one timer per metric for 2 pods (ds & rs) - if (timeDifferenceInMinutes >= 10 && metricNametoReturn.eql?("cpuUsageNanoCores")) - @@telemetryCpuMetricTimeTracker = DateTime.now.to_time.to_i - end - rescue => error - @Log.warn("getcontainerCpuMetricItems failed: #{error} for metric #{cpuMetricNameToCollect}") - return metricItems - end - return metricItems - end + cpuUsageNanoSecondsRate = getNodeMetricItemRate(metricInfo, hostName, "cpu", "usageCoreNanoSeconds", "cpuUsageNanoCores", operatingSystem) + if cpuUsageNanoSecondsRate && !cpuUsageNanoSecondsRate.empty? && !cpuUsageNanoSecondsRate.nil? 
+ metricDataItems.push(cpuUsageNanoSecondsRate) + end + metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "memory", "workingSetBytes", "memoryWorkingSetBytes")) - def getContainerMemoryMetricItems(metricJSON, hostName, memoryMetricNameToCollect, metricNametoReturn) - metricItems = [] - clusterId = KubernetesApiClient.getClusterId - timeDifference = (DateTime.now.to_time.to_i - @@telemetryMemoryMetricTimeTracker).abs - timeDifferenceInMinutes = timeDifference/60 - begin - metricInfo = metricJSON - metricInfo['pods'].each do |pod| - podUid = pod['podRef']['uid'] - podName = pod['podRef']['name'] - podNamespace = pod['podRef']['namespace'] - if (!pod['containers'].nil?) - pod['containers'].each do |container| - containerName = container['name'] - metricValue = container['memory'][memoryMetricNameToCollect] - metricTime = container['memory']['time'] - - metricItem = {} - metricItem['DataItems'] = [] - - metricProps = {} - metricProps['Timestamp'] = metricTime - metricProps['Host'] = hostName - metricProps['ObjectName'] = "K8SContainer" - metricProps['InstanceName'] = clusterId + "/" + podUid + "/" + containerName - - metricProps['Collections'] = [] - metricCollections = {} - metricCollections['CounterName'] = metricNametoReturn - metricCollections['Value'] = metricValue - - metricProps['Collections'].push(metricCollections) - metricItem['DataItems'].push(metricProps) - metricItems.push(metricItem) - #Telemetry about agent performance - begin - # we can only do this much now. Ideally would like to use the docker image repository to find our pods/containers - # cadvisor does not have pod/container metadata. 
so would need more work to cache as pv & use - if (podName.downcase.start_with?('omsagent-') && podNamespace.eql?("kube-system") && containerName.downcase.start_with?('omsagent') && metricNametoReturn.eql?("memoryRssBytes")) - if (timeDifferenceInMinutes >= 10) - telemetryProps = {} - telemetryProps['PodName'] = podName - telemetryProps['ContainerName'] = containerName - ApplicationInsightsUtility.sendMetricTelemetry(metricNametoReturn, metricValue, telemetryProps) - end - end - rescue => errorStr - $log.warn("Exception while generating Telemetry from getcontainerMemoryMetricItems failed: #{errorStr} for metric #{memoryMetricNameToCollect}") - end - end - end - end - # reset time outside pod iterator as we use one timer per metric for 2 pods (ds & rs) - if (timeDifferenceInMinutes >= 10 && metricNametoReturn.eql?("memoryRssBytes")) - @@telemetryMemoryMetricTimeTracker = DateTime.now.to_time.to_i - end - rescue => error - @Log.warn("getcontainerMemoryMetricItems failed: #{error} for metric #{memoryMetricNameToCollect}") - @Log.warn metricJSON - return metricItems - end - return metricItems - end + metricDataItems.push(getNodeLastRebootTimeMetric(metricInfo, hostName, "restartTimeEpoch")) + + # Disabling networkRxRate and networkTxRate since we dont use it as of now. + #metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "network", "rxBytes", "networkRxBytes")) + #metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "network", "txBytes", "networkTxBytes")) + # networkRxRate = getNodeMetricItemRate(metricInfo, hostName, "network", "rxBytes", "networkRxBytesPerSec") + # if networkRxRate && !networkRxRate.empty? && !networkRxRate.nil? + # metricDataItems.push(networkRxRate) + # end + # networkTxRate = getNodeMetricItemRate(metricInfo, hostName, "network", "txBytes", "networkTxBytesPerSec") + # if networkTxRate && !networkTxRate.empty? && !networkTxRate.nil? 
+ # metricDataItems.push(networkTxRate) + # end + else + @Log.warn("Couldn't get metric information for host: #{hostName}") + end + rescue => error + @Log.warn("getContainerMetrics failed: #{error}") + return metricDataItems + end + return metricDataItems + end + + def getContainerCpuMetricItems(metricJSON, hostName, cpuMetricNameToCollect, metricNametoReturn) + metricItems = [] + clusterId = KubernetesApiClient.getClusterId + timeDifference = (DateTime.now.to_time.to_i - @@telemetryCpuMetricTimeTracker).abs + timeDifferenceInMinutes = timeDifference / 60 + begin + metricInfo = metricJSON + metricInfo["pods"].each do |pod| + podUid = pod["podRef"]["uid"] + podName = pod["podRef"]["name"] + podNamespace = pod["podRef"]["namespace"] + + if (!pod["containers"].nil?) + pod["containers"].each do |container| + #cpu metric + containerName = container["name"] + metricValue = container["cpu"][cpuMetricNameToCollect] + metricTime = container["cpu"]["time"] + metricItem = {} + metricItem["DataItems"] = [] - def getNodeMetricItem(metricJSON, hostName, metricCategory, metricNameToCollect, metricNametoReturn) - metricItem = {} - clusterId = KubernetesApiClient.getClusterId - begin - metricInfo = metricJSON - node = metricInfo['node'] - nodeName = node['nodeName'] - - - metricValue = node[metricCategory][metricNameToCollect] - metricTime = node[metricCategory]['time'] - - metricItem['DataItems'] = [] - - metricProps = {} - metricProps['Timestamp'] = metricTime - metricProps['Host'] = hostName - metricProps['ObjectName'] = "K8SNode" - metricProps['InstanceName'] = clusterId + "/" + nodeName - - metricProps['Collections'] = [] - metricCollections = {} - metricCollections['CounterName'] = metricNametoReturn - metricCollections['Value'] = metricValue - - metricProps['Collections'].push(metricCollections) - metricItem['DataItems'].push(metricProps) - - rescue => error - @Log.warn("getNodeMetricItem failed: #{error} for metric #{metricNameToCollect}") - @Log.warn metricJSON - return 
metricItem - end - return metricItem + metricProps = {} + metricProps["Timestamp"] = metricTime + metricProps["Host"] = hostName + metricProps["ObjectName"] = "K8SContainer" + metricProps["InstanceName"] = clusterId + "/" + podUid + "/" + containerName + + metricProps["Collections"] = [] + metricCollections = {} + metricCollections["CounterName"] = metricNametoReturn + metricCollections["Value"] = metricValue + + metricProps["Collections"].push(metricCollections) + metricItem["DataItems"].push(metricProps) + metricItems.push(metricItem) + #Telemetry about agent performance + begin + # we can only do this much now. Ideally would like to use the docker image repository to find our pods/containers + # cadvisor does not have pod/container metadata. so would need more work to cache as pv & use + if (podName.downcase.start_with?("omsagent-") && podNamespace.eql?("kube-system") && containerName.downcase.start_with?("omsagent") && metricNametoReturn.eql?("cpuUsageNanoCores")) + if (timeDifferenceInMinutes >= 10) + telemetryProps = {} + telemetryProps["PodName"] = podName + telemetryProps["ContainerName"] = containerName + telemetryProps["Computer"] = hostName + ApplicationInsightsUtility.sendMetricTelemetry(metricNametoReturn, metricValue, telemetryProps) + end end + rescue => errorStr + $log.warn("Exception while generating Telemetry from getcontainerCpuMetricItems failed: #{errorStr} for metric #{cpuMetricNameToCollect}") + end + end + end + end + # reset time outside pod iterator as we use one timer per metric for 2 pods (ds & rs) + if (timeDifferenceInMinutes >= 10 && metricNametoReturn.eql?("cpuUsageNanoCores")) + @@telemetryCpuMetricTimeTracker = DateTime.now.to_time.to_i + end + rescue => error + @Log.warn("getcontainerCpuMetricItems failed: #{error} for metric #{cpuMetricNameToCollect}") + return metricItems + end + return metricItems + end + + def clearDeletedWinContainersFromCache() + begin + winCpuUsageNanoSecondsKeys = @@winContainerCpuUsageNanoSecondsLast.keys 
+ winCpuUsageNanoSecondsTimeKeys = @@winContainerCpuUsageNanoSecondsTimeLast.keys + + # Find the container ids to be deleted from cache + winContainersToBeCleared = winCpuUsageNanoSecondsKeys - @@winContainerIdCache + if winContainersToBeCleared.length > 0 + @Log.warn "Stale containers found in cache, clearing...: #{winContainersToBeCleared}" + end + winContainersToBeCleared.each do |containerId| + @@winContainerCpuUsageNanoSecondsLast.delete(containerId) + @@winContainerCpuUsageNanoSecondsTimeLast.delete(containerId) + end + rescue => errorStr + @Log.warn("clearDeletedWinContainersFromCache failed: #{errorStr}") + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + end + + def resetWinContainerIdCache + @@winContainerIdCache = [] + end + + # usageNanoCores doesnt exist for windows nodes. Hence need to compute this from usageCoreNanoSeconds + def getContainerCpuMetricItemRate(metricJSON, hostName, cpuMetricNameToCollect, metricNametoReturn) + metricItems = [] + clusterId = KubernetesApiClient.getClusterId + timeDifference = (DateTime.now.to_time.to_i - @@telemetryCpuMetricTimeTracker).abs + timeDifferenceInMinutes = timeDifference / 60 + @Log.warn "in host: #{hostName}" + begin + metricInfo = metricJSON + containerCount = 0 + metricInfo["pods"].each do |pod| + podUid = pod["podRef"]["uid"] + podName = pod["podRef"]["name"] + podNamespace = pod["podRef"]["namespace"] + + if (!pod["containers"].nil?) 
+ pod["containers"].each do |container| + #cpu metric + containerCount += 1 + containerName = container["name"] + metricValue = container["cpu"][cpuMetricNameToCollect] + metricTime = container["cpu"]["time"] + metricItem = {} + metricItem["DataItems"] = [] + + metricProps = {} + metricProps["Timestamp"] = metricTime + metricProps["Host"] = hostName + metricProps["ObjectName"] = "K8SContainer" + metricProps["InstanceName"] = clusterId + "/" + podUid + "/" + containerName + + metricProps["Collections"] = [] + metricCollections = {} + metricCollections["CounterName"] = metricNametoReturn - def getNodeMetricItemRate(metricJSON, hostName, metricCategory, metricNameToCollect, metricNametoReturn) - metricItem = {} - clusterId = KubernetesApiClient.getClusterId - begin - - metricInfo = metricJSON - node = metricInfo['node'] - nodeName = node['nodeName'] - - metricValue = node[metricCategory][metricNameToCollect] - metricTime = node[metricCategory]['time'] - - if !(metricNameToCollect == "rxBytes" || metricNameToCollect == "txBytes" || metricNameToCollect == "usageCoreNanoSeconds" ) - @Log.warn("getNodeMetricItemRate : rateMetric is supported only for rxBytes, txBytes & usageCoreNanoSeconds and not for #{metricNameToCollect}") - return nil - elsif metricNameToCollect == "rxBytes" - if @@rxBytesLast.nil? || @@rxBytesTimeLast.nil? || @@rxBytesLast > metricValue #when kubelet is restarted the last condition will be true - @@rxBytesLast = metricValue - @@rxBytesTimeLast = metricTime - return nil - else - metricRateValue = ((metricValue - @@rxBytesLast) * 1.0)/(DateTime.parse(metricTime).to_time - DateTime.parse(@@rxBytesTimeLast).to_time) - @@rxBytesLast = metricValue - @@rxBytesTimeLast = metricTime - metricValue = metricRateValue - end - elsif metricNameToCollect == "txBytes" - if @@txBytesLast.nil? || @@txBytesTimeLast.nil? 
|| @@txBytesLast > metricValue #when kubelet is restarted the last condition will be true - @@txBytesLast = metricValue - @@txBytesTimeLast = metricTime - return nil - else - metricRateValue = ((metricValue - @@txBytesLast) * 1.0)/(DateTime.parse(metricTime).to_time - DateTime.parse(@@txBytesTimeLast).to_time) - @@txBytesLast = metricValue - @@txBytesTimeLast = metricTime - metricValue = metricRateValue - end - else - if @@nodeCpuUsageNanoSecondsLast.nil? || @@nodeCpuUsageNanoSecondsTimeLast.nil? || @@nodeCpuUsageNanoSecondsLast > metricValue #when kubelet is restarted the last condition will be true - @@nodeCpuUsageNanoSecondsLast = metricValue - @@nodeCpuUsageNanoSecondsTimeLast = metricTime - return nil - else - metricRateValue = ((metricValue - @@nodeCpuUsageNanoSecondsLast) * 1.0)/(DateTime.parse(metricTime).to_time - DateTime.parse(@@nodeCpuUsageNanoSecondsTimeLast).to_time) - @@nodeCpuUsageNanoSecondsLast = metricValue - @@nodeCpuUsageNanoSecondsTimeLast = metricTime - metricValue = metricRateValue - end - end - - metricItem['DataItems'] = [] - - metricProps = {} - metricProps['Timestamp'] = metricTime - metricProps['Host'] = hostName - metricProps['ObjectName'] = "K8SNode" - metricProps['InstanceName'] = clusterId + "/" + nodeName - - metricProps['Collections'] = [] - metricCollections = {} - metricCollections['CounterName'] = metricNametoReturn - metricCollections['Value'] = metricValue - - metricProps['Collections'].push(metricCollections) - metricItem['DataItems'].push(metricProps) - - rescue => error - @Log.warn("getNodeMetricItemRate failed: #{error} for metric #{metricNameToCollect}") - @Log.warn metricJSON - return nil - end - return metricItem + containerId = podUid + "/" + containerName + # Adding the containers to the winContainerIdCache so that it can be used by the cleanup routine + # to clear the delted containers every 5 minutes + @@winContainerIdCache.push(containerId) + if @@winContainerCpuUsageNanoSecondsLast[containerId].nil? 
|| @@winContainerCpuUsageNanoSecondsTimeLast[containerId].nil? || @@winContainerCpuUsageNanoSecondsLast[containerId] > metricValue #when kubelet is restarted the last condition will be true + @@winContainerCpuUsageNanoSecondsLast[containerId] = metricValue + @@winContainerCpuUsageNanoSecondsTimeLast[containerId] = metricTime + next + else + timeDifference = DateTime.parse(metricTime).to_time - DateTime.parse(@@winContainerCpuUsageNanoSecondsTimeLast[containerId]).to_time + containerCpuUsageDifference = metricValue - @@winContainerCpuUsageNanoSecondsLast[containerId] + # containerCpuUsageDifference check is added to make sure we report non zero values when cadvisor returns same values for subsequent calls + if timeDifference != 0 && containerCpuUsageDifference != 0 + metricRateValue = (containerCpuUsageDifference * 1.0) / timeDifference + else + @Log.info "container - cpu usage difference / time difference is 0, hence using previous cached value" + if !@@winContainerPrevMetricRate[containerId].nil? 
+ metricRateValue = @@winContainerPrevMetricRate[containerId] + else + # This can happen when the metric value returns same values for subsequent calls when the plugin first starts + metricRateValue = 0 + end end + @@winContainerCpuUsageNanoSecondsLast[containerId] = metricValue + @@winContainerCpuUsageNanoSecondsTimeLast[containerId] = metricTime + metricValue = metricRateValue + @@winContainerPrevMetricRate[containerId] = metricRateValue + end - def getNodeLastRebootTimeMetric(metricJSON, hostName, metricNametoReturn) - metricItem = {} - clusterId = KubernetesApiClient.getClusterId - - begin - metricInfo = metricJSON - node = metricInfo['node'] - nodeName = node['nodeName'] - - - metricValue = node['startTime'] - metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z - - metricItem['DataItems'] = [] - - metricProps = {} - metricProps['Timestamp'] = metricTime - metricProps['Host'] = hostName - metricProps['ObjectName'] = "K8SNode" - metricProps['InstanceName'] = clusterId + "/" + nodeName - - metricProps['Collections'] = [] - metricCollections = {} - metricCollections['CounterName'] = metricNametoReturn - #Read it from /proc/uptime - metricCollections['Value'] = DateTime.parse(metricTime).to_time.to_i - IO.read("/proc/uptime").split[0].to_f - - metricProps['Collections'].push(metricCollections) - metricItem['DataItems'].push(metricProps) - - rescue => error - @Log.warn("getNodeLastRebootTimeMetric failed: #{error} ") - @Log.warn metricJSON - return metricItem - end - return metricItem + metricCollections["Value"] = metricValue + metricProps["Collections"].push(metricCollections) + metricItem["DataItems"].push(metricProps) + metricItems.push(metricItem) + end + end + end + #Sending ContainerInventoryTelemetry from replicaset for telemetry purposes + if @@nodeTelemetryTimeTracker[hostName].nil? 
+ @@nodeTelemetryTimeTracker[hostName] = DateTime.now.to_time.to_i + else + timeDifference = (DateTime.now.to_time.to_i - @@nodeTelemetryTimeTracker[hostName]).abs + timeDifferenceInMinutes = timeDifference / 60 + if (timeDifferenceInMinutes >= 5) + @@nodeTelemetryTimeTracker[hostName] = DateTime.now.to_time.to_i + telemetryProperties = {} + telemetryProperties["Computer"] = hostName + telemetryProperties["ContainerCount"] = containerCount + telemetryProperties["OS"] = "Windows" + # Hardcoding the event to ContainerInventory hearbeat event since the telemetry is pivoted off of this event. + @Log.info "sending container inventory heartbeat telemetry" + ApplicationInsightsUtility.sendCustomEvent("ContainerInventoryHeartBeatEvent", telemetryProperties) + end + end + rescue => error + @Log.warn("getcontainerCpuMetricItemRate failed: #{error} for metric #{cpuMetricNameToCollect}") + return metricItems + end + return metricItems + end + + def getContainerMemoryMetricItems(metricJSON, hostName, memoryMetricNameToCollect, metricNametoReturn) + metricItems = [] + clusterId = KubernetesApiClient.getClusterId + timeDifference = (DateTime.now.to_time.to_i - @@telemetryMemoryMetricTimeTracker).abs + timeDifferenceInMinutes = timeDifference / 60 + begin + metricInfo = metricJSON + metricInfo["pods"].each do |pod| + podUid = pod["podRef"]["uid"] + podName = pod["podRef"]["name"] + podNamespace = pod["podRef"]["namespace"] + if (!pod["containers"].nil?) 
+ pod["containers"].each do |container| + containerName = container["name"] + metricValue = container["memory"][memoryMetricNameToCollect] + metricTime = container["memory"]["time"] + + metricItem = {} + metricItem["DataItems"] = [] + + metricProps = {} + metricProps["Timestamp"] = metricTime + metricProps["Host"] = hostName + metricProps["ObjectName"] = "K8SContainer" + metricProps["InstanceName"] = clusterId + "/" + podUid + "/" + containerName + + metricProps["Collections"] = [] + metricCollections = {} + metricCollections["CounterName"] = metricNametoReturn + metricCollections["Value"] = metricValue + + metricProps["Collections"].push(metricCollections) + metricItem["DataItems"].push(metricProps) + metricItems.push(metricItem) + #Telemetry about agent performance + begin + # we can only do this much now. Ideally would like to use the docker image repository to find our pods/containers + # cadvisor does not have pod/container metadata. so would need more work to cache as pv & use + if (podName.downcase.start_with?("omsagent-") && podNamespace.eql?("kube-system") && containerName.downcase.start_with?("omsagent") && metricNametoReturn.eql?("memoryRssBytes")) + if (timeDifferenceInMinutes >= 10) + telemetryProps = {} + telemetryProps["PodName"] = podName + telemetryProps["ContainerName"] = containerName + telemetryProps["Computer"] = hostName + ApplicationInsightsUtility.sendMetricTelemetry(metricNametoReturn, metricValue, telemetryProps) + end end + rescue => errorStr + $log.warn("Exception while generating Telemetry from getcontainerMemoryMetricItems failed: #{errorStr} for metric #{memoryMetricNameToCollect}") + end + end + end + end + # reset time outside pod iterator as we use one timer per metric for 2 pods (ds & rs) + if (timeDifferenceInMinutes >= 10 && metricNametoReturn.eql?("memoryRssBytes")) + @@telemetryMemoryMetricTimeTracker = DateTime.now.to_time.to_i + end + rescue => error + @Log.warn("getcontainerMemoryMetricItems failed: #{error} for metric 
#{memoryMetricNameToCollect}") + @Log.warn metricJSON + return metricItems + end + return metricItems + end + + def getNodeMetricItem(metricJSON, hostName, metricCategory, metricNameToCollect, metricNametoReturn) + metricItem = {} + clusterId = KubernetesApiClient.getClusterId + begin + metricInfo = metricJSON + node = metricInfo["node"] + nodeName = node["nodeName"] + + if !node[metricCategory].nil? + metricValue = node[metricCategory][metricNameToCollect] + metricTime = node[metricCategory]["time"] + + metricItem["DataItems"] = [] + + metricProps = {} + metricProps["Timestamp"] = metricTime + metricProps["Host"] = hostName + metricProps["ObjectName"] = "K8SNode" + metricProps["InstanceName"] = clusterId + "/" + nodeName + + metricProps["Collections"] = [] + metricCollections = {} + metricCollections["CounterName"] = metricNametoReturn + metricCollections["Value"] = metricValue + + metricProps["Collections"].push(metricCollections) + metricItem["DataItems"].push(metricProps) + end + rescue => error + @Log.warn("getNodeMetricItem failed: #{error} for metric #{metricNameToCollect}") + @Log.warn metricJSON + return metricItem + end + return metricItem + end + + def getNodeMetricItemRate(metricJSON, hostName, metricCategory, metricNameToCollect, metricNametoReturn, operatingSystem) + metricItem = {} + clusterId = KubernetesApiClient.getClusterId + begin + metricInfo = metricJSON + node = metricInfo["node"] + nodeName = node["nodeName"] - def getContainerStartTimeMetricItems(metricJSON, hostName, metricNametoReturn) - metricItems = [] - clusterId = KubernetesApiClient.getClusterId - currentTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z - begin - metricInfo = metricJSON - metricInfo['pods'].each do |pod| - podUid = pod['podRef']['uid'] - if (!pod['containers'].nil?) 
- pod['containers'].each do |container| - containerName = container['name'] - metricValue = container['startTime'] - metricTime = currentTime - - metricItem = {} - metricItem['DataItems'] = [] - - metricProps = {} - metricProps['Timestamp'] = metricTime - metricProps['Host'] = hostName - metricProps['ObjectName'] = "K8SContainer" - metricProps['InstanceName'] = clusterId + "/" + podUid + "/" + containerName - - metricProps['Collections'] = [] - metricCollections = {} - metricCollections['CounterName'] = metricNametoReturn - metricCollections['Value'] = DateTime.parse(metricValue).to_time.to_i - - metricProps['Collections'].push(metricCollections) - metricItem['DataItems'].push(metricProps) - metricItems.push(metricItem) - end - end - end - rescue => error - @Log.warn("getContainerStartTimeMetric failed: #{error} for metric #{metricNametoReturn}") - @Log.warn metricJSON - return metricItems - end - return metricItems + if !node[metricCategory].nil? + metricValue = node[metricCategory][metricNameToCollect] + metricTime = node[metricCategory]["time"] + + # if !(metricNameToCollect == "rxBytes" || metricNameToCollect == "txBytes" || metricNameToCollect == "usageCoreNanoSeconds") + # @Log.warn("getNodeMetricItemRate : rateMetric is supported only for rxBytes, txBytes & usageCoreNanoSeconds and not for #{metricNameToCollect}") + if !(metricNameToCollect == "usageCoreNanoSeconds") + @Log.warn("getNodeMetricItemRate : rateMetric is supported only for usageCoreNanoSeconds and not for #{metricNameToCollect}") + return nil + # elsif metricNameToCollect == "rxBytes" + # if @@rxBytesLast.nil? || @@rxBytesTimeLast.nil? 
|| @@rxBytesLast > metricValue #when kubelet is restarted the last condition will be true + # @@rxBytesLast = metricValue + # @@rxBytesTimeLast = metricTime + # return nil + # else + # metricRateValue = ((metricValue - @@rxBytesLast) * 1.0) / (DateTime.parse(metricTime).to_time - DateTime.parse(@@rxBytesTimeLast).to_time) + # @@rxBytesLast = metricValue + # @@rxBytesTimeLast = metricTime + # metricValue = metricRateValue + # end + # elsif metricNameToCollect == "txBytes" + # if @@txBytesLast.nil? || @@txBytesTimeLast.nil? || @@txBytesLast > metricValue #when kubelet is restarted the last condition will be true + # @@txBytesLast = metricValue + # @@txBytesTimeLast = metricTime + # return nil + # else + # metricRateValue = ((metricValue - @@txBytesLast) * 1.0) / (DateTime.parse(metricTime).to_time - DateTime.parse(@@txBytesTimeLast).to_time) + # @@txBytesLast = metricValue + # @@txBytesTimeLast = metricTime + # metricValue = metricRateValue + # end + else + if operatingSystem == "Linux" + if @@nodeCpuUsageNanoSecondsLast.nil? || @@nodeCpuUsageNanoSecondsTimeLast.nil? || @@nodeCpuUsageNanoSecondsLast > metricValue #when kubelet is restarted the last condition will be true + @@nodeCpuUsageNanoSecondsLast = metricValue + @@nodeCpuUsageNanoSecondsTimeLast = metricTime + return nil + else + timeDifference = DateTime.parse(metricTime).to_time - DateTime.parse(@@nodeCpuUsageNanoSecondsTimeLast).to_time + nodeCpuUsageDifference = metricValue - @@nodeCpuUsageNanoSecondsLast + # nodeCpuUsageDifference check is added to make sure we report non zero values when cadvisor returns same values for subsequent calls + if timeDifference != 0 && nodeCpuUsageDifference != 0 + metricRateValue = (nodeCpuUsageDifference * 1.0) / timeDifference + else + @Log.info "linux node - cpu usage difference / time difference is 0, hence using previous cached value" + if !@@linuxNodePrevMetricRate.nil? 
+ metricRateValue = @@linuxNodePrevMetricRate + else + # This can happen when the metric value returns same values for subsequent calls when the plugin first starts + metricRateValue = 0 + end + end + @@nodeCpuUsageNanoSecondsLast = metricValue + @@nodeCpuUsageNanoSecondsTimeLast = metricTime + @@linuxNodePrevMetricRate = metricRateValue + metricValue = metricRateValue + end + elsif operatingSystem == "Windows" + # Using the hash for windows nodes since this is running in replica set and there can be multiple nodes + if @@winNodeCpuUsageNanoSecondsLast[hostName].nil? || @@winNodeCpuUsageNanoSecondsTimeLast[hostName].nil? || @@winNodeCpuUsageNanoSecondsLast[hostName] > metricValue #when kubelet is restarted the last condition will be true + @@winNodeCpuUsageNanoSecondsLast[hostName] = metricValue + @@winNodeCpuUsageNanoSecondsTimeLast[hostName] = metricTime + return nil + else + timeDifference = DateTime.parse(metricTime).to_time - DateTime.parse(@@winNodeCpuUsageNanoSecondsTimeLast[hostName]).to_time + nodeCpuUsageDifference = metricValue - @@winNodeCpuUsageNanoSecondsLast[hostName] + # nodeCpuUsageDifference check is added to make sure we report non zero values when cadvisor returns same values for subsequent calls + if timeDifference != 0 && nodeCpuUsageDifference != 0 + metricRateValue = (nodeCpuUsageDifference * 1.0) / timeDifference + else + @Log.info "windows node - cpu usage difference / time difference is 0, hence using previous cached value" + if !@@winNodePrevMetricRate[hostName].nil? 
+ metricRateValue = @@winNodePrevMetricRate[hostName] + else + # This can happen when the metric value returns same values for subsequent calls when the plugin first starts + metricRateValue = 0 + end end + @@winNodeCpuUsageNanoSecondsLast[hostName] = metricValue + @@winNodeCpuUsageNanoSecondsTimeLast[hostName] = metricTime + @@winNodePrevMetricRate[hostName] = metricRateValue + metricValue = metricRateValue + end + end + end + metricItem["DataItems"] = [] + + metricProps = {} + metricProps["Timestamp"] = metricTime + metricProps["Host"] = hostName + metricProps["ObjectName"] = "K8SNode" + metricProps["InstanceName"] = clusterId + "/" + nodeName + + metricProps["Collections"] = [] + metricCollections = {} + metricCollections["CounterName"] = metricNametoReturn + metricCollections["Value"] = metricValue + + metricProps["Collections"].push(metricCollections) + metricItem["DataItems"].push(metricProps) + end + rescue => error + @Log.warn("getNodeMetricItemRate failed: #{error} for metric #{metricNameToCollect}") + @Log.warn metricJSON + return nil + end + return metricItem + end + + def getNodeLastRebootTimeMetric(metricJSON, hostName, metricNametoReturn) + metricItem = {} + clusterId = KubernetesApiClient.getClusterId + + begin + metricInfo = metricJSON + node = metricInfo["node"] + nodeName = node["nodeName"] + + metricValue = node["startTime"] + metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z + + metricItem["DataItems"] = [] + + metricProps = {} + metricProps["Timestamp"] = metricTime + metricProps["Host"] = hostName + metricProps["ObjectName"] = "K8SNode" + metricProps["InstanceName"] = clusterId + "/" + nodeName + + metricProps["Collections"] = [] + metricCollections = {} + metricCollections["CounterName"] = metricNametoReturn + #Read it from /proc/uptime + metricCollections["Value"] = DateTime.parse(metricTime).to_time.to_i - IO.read("/proc/uptime").split[0].to_f + + metricProps["Collections"].push(metricCollections) + 
metricItem["DataItems"].push(metricProps) + rescue => error + @Log.warn("getNodeLastRebootTimeMetric failed: #{error} ") + @Log.warn metricJSON + return metricItem + end + return metricItem + end + + def getContainerStartTimeMetricItems(metricJSON, hostName, metricNametoReturn) + metricItems = [] + clusterId = KubernetesApiClient.getClusterId + currentTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z + begin + metricInfo = metricJSON + metricInfo["pods"].each do |pod| + podUid = pod["podRef"]["uid"] + if (!pod["containers"].nil?) + pod["containers"].each do |container| + containerName = container["name"] + metricValue = container["startTime"] + metricTime = currentTime + + metricItem = {} + metricItem["DataItems"] = [] + + metricProps = {} + metricProps["Timestamp"] = metricTime + metricProps["Host"] = hostName + metricProps["ObjectName"] = "K8SContainer" + metricProps["InstanceName"] = clusterId + "/" + podUid + "/" + containerName + + metricProps["Collections"] = [] + metricCollections = {} + metricCollections["CounterName"] = metricNametoReturn + metricCollections["Value"] = DateTime.parse(metricValue).to_time.to_i + + metricProps["Collections"].push(metricCollections) + metricItem["DataItems"].push(metricProps) + metricItems.push(metricItem) end + end end + rescue => error + @Log.warn("getContainerStartTimeMetric failed: #{error} for metric #{metricNametoReturn}") + @Log.warn metricJSON + return metricItems + end + return metricItems + end + end +end diff --git a/source/code/plugin/KubernetesApiClient.rb b/source/code/plugin/KubernetesApiClient.rb index a1e143b15..3c6b4f203 100644 --- a/source/code/plugin/KubernetesApiClient.rb +++ b/source/code/plugin/KubernetesApiClient.rb @@ -2,474 +2,516 @@ # frozen_string_literal: true class KubernetesApiClient + require "json" + require "logger" + require "net/http" + require "net/https" + require "uri" + require "time" - require 'json' - require 'logger' - require 'net/http' - require 'net/https' - require 'uri' - 
require 'time' - - require_relative 'oms_common' - - @@ApiVersion = "v1" - @@CaFile = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" - @@ClusterName = nil - @@ClusterId = nil - @@IsNodeMaster = nil - #@@IsValidRunningNode = nil - #@@IsLinuxCluster = nil - @@KubeSystemNamespace = "kube-system" - @LogPath = "/var/opt/microsoft/docker-cimprov/log/kubernetes_client_log.txt" - @Log = Logger.new(@LogPath, 2, 10*1048576) #keep last 2 files, max log file size = 10M - @@TokenFileName = "/var/run/secrets/kubernetes.io/serviceaccount/token" - @@TokenStr = nil - @@NodeMetrics = Hash.new - - def initialize + require_relative "oms_common" + + @@ApiVersion = "v1" + @@CaFile = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + @@ClusterName = nil + @@ClusterId = nil + @@IsNodeMaster = nil + #@@IsValidRunningNode = nil + #@@IsLinuxCluster = nil + @@KubeSystemNamespace = "kube-system" + @LogPath = "/var/opt/microsoft/docker-cimprov/log/kubernetes_client_log.txt" + @Log = Logger.new(@LogPath, 2, 10 * 1048576) #keep last 2 files, max log file size = 10M + @@TokenFileName = "/var/run/secrets/kubernetes.io/serviceaccount/token" + @@TokenStr = nil + @@NodeMetrics = Hash.new + @@WinNodeArray = [] + + def initialize + end + + class << self + def getKubeResourceInfo(resource) + headers = {} + response = nil + @Log.info "Getting Kube resource" + @Log.info resource + begin + resourceUri = getResourceUri(resource) + if !resourceUri.nil? 
+ uri = URI.parse(resourceUri) + http = Net::HTTP.new(uri.host, uri.port) + http.use_ssl = true + if !File.exist?(@@CaFile) + raise "#{@@CaFile} doesnt exist" + else + http.ca_file = @@CaFile if File.exist?(@@CaFile) + end + http.verify_mode = OpenSSL::SSL::VERIFY_PEER + + kubeApiRequest = Net::HTTP::Get.new(uri.request_uri) + kubeApiRequest["Authorization"] = "Bearer " + getTokenStr + @Log.info "KubernetesAPIClient::getKubeResourceInfo : Making request to #{uri.request_uri} @ #{Time.now.utc.iso8601}" + response = http.request(kubeApiRequest) + @Log.info "KubernetesAPIClient::getKubeResourceInfo : Got response of #{response.code} for #{uri.request_uri} @ #{Time.now.utc.iso8601}" end + rescue => error + @Log.warn("kubernetes api request failed: #{error} for #{resource} @ #{Time.now.utc.iso8601}") + end + if (response.body.empty?) + @Log.warn("KubernetesAPIClient::getKubeResourceInfo : Got empty response from Kube API for #{resource} @ #{Time.now.utc.iso8601}") + end + return response + end - class << self - def getKubeResourceInfo(resource) - headers = {} - response = nil - @Log.info 'Getting Kube resource' - @Log.info resource - begin - resourceUri = getResourceUri(resource) - if !resourceUri.nil? 
- uri = URI.parse(resourceUri) - http = Net::HTTP.new(uri.host, uri.port) - http.use_ssl = true - if !File.exist?(@@CaFile) - raise "#{@@CaFile} doesnt exist" - else - http.ca_file = @@CaFile if File.exist?(@@CaFile) - end - http.verify_mode = OpenSSL::SSL::VERIFY_PEER - - kubeApiRequest = Net::HTTP::Get.new(uri.request_uri) - kubeApiRequest['Authorization'] = "Bearer " + getTokenStr - @Log.info "KubernetesAPIClient::getKubeResourceInfo : Making request to #{uri.request_uri} @ #{Time.now.utc.iso8601}" - response = http.request(kubeApiRequest) - @Log.info "KubernetesAPIClient::getKubeResourceInfo : Got response of #{response.code} for #{uri.request_uri} @ #{Time.now.utc.iso8601}" - end - rescue => error - @Log.warn("kubernetes api request failed: #{error} for #{resource} @ #{Time.now.utc.iso8601}") - end - if (response.body.empty?) - @Log.warn("KubernetesAPIClient::getKubeResourceInfo : Got empty response from Kube API for #{resource} @ #{Time.now.utc.iso8601}") - end - return response - end + def getTokenStr + return @@TokenStr if !@@TokenStr.nil? + begin + if File.exist?(@@TokenFileName) && File.readable?(@@TokenFileName) + @@TokenStr = File.read(@@TokenFileName).strip + return @@TokenStr + else + @Log.warn("Unable to read token string from #{@@TokenFileName}: #{error}") + return nil + end + end + end - def getTokenStr - return @@TokenStr if !@@TokenStr.nil? 
- begin - if File.exist?(@@TokenFileName) && File.readable?(@@TokenFileName) - @@TokenStr = File.read(@@TokenFileName).strip - return @@TokenStr - else - @Log.warn("Unable to read token string from #{@@TokenFileName}: #{error}") - return nil - end - end - end + def getResourceUri(resource) + begin + if ENV["KUBERNETES_SERVICE_HOST"] && ENV["KUBERNETES_PORT_443_TCP_PORT"] + return "https://#{ENV["KUBERNETES_SERVICE_HOST"]}:#{ENV["KUBERNETES_PORT_443_TCP_PORT"]}/api/" + @@ApiVersion + "/" + resource + else + @Log.warn ("Kubernetes environment variable not set KUBERNETES_SERVICE_HOST: #{ENV["KUBERNETES_SERVICE_HOST"]} KUBERNETES_PORT_443_TCP_PORT: #{ENV["KUBERNETES_PORT_443_TCP_PORT"]}. Unable to form resourceUri") + return nil + end + end + end - def getResourceUri(resource) - begin - if ENV['KUBERNETES_SERVICE_HOST'] && ENV['KUBERNETES_PORT_443_TCP_PORT'] - return "https://#{ENV['KUBERNETES_SERVICE_HOST']}:#{ENV['KUBERNETES_PORT_443_TCP_PORT']}/api/" + @@ApiVersion + "/" + resource - else - @Log.warn ("Kubernetes environment variable not set KUBERNETES_SERVICE_HOST: #{ENV['KUBERNETES_SERVICE_HOST']} KUBERNETES_PORT_443_TCP_PORT: #{ENV['KUBERNETES_PORT_443_TCP_PORT']}. Unable to form resourceUri") - return nil - end + def getClusterName + return @@ClusterName if !@@ClusterName.nil? + @@ClusterName = "None" + begin + #try getting resource ID for aks + cluster = ENV["AKS_RESOURCE_ID"] + if cluster && !cluster.nil? && !cluster.empty? + @@ClusterName = cluster.split("/").last + else + cluster = ENV["ACS_RESOURCE_NAME"] + if cluster && !cluster.nil? && !cluster.empty? 
+ @@ClusterName = cluster + else + kubesystemResourceUri = "namespaces/" + @@KubeSystemNamespace + "/pods" + @Log.info("KubernetesApiClient::getClusterName : Getting pods from Kube API @ #{Time.now.utc.iso8601}") + podInfo = JSON.parse(getKubeResourceInfo(kubesystemResourceUri).body) + @Log.info("KubernetesApiClient::getClusterName : Done getting pods from Kube API @ #{Time.now.utc.iso8601}") + podInfo["items"].each do |items| + if items["metadata"]["name"].include? "kube-controller-manager" + items["spec"]["containers"][0]["command"].each do |command| + if command.include? "--cluster-name" + @@ClusterName = command.split("=")[1] + end end + end end + end + end + rescue => error + @Log.warn("getClusterName failed: #{error}") + end + return @@ClusterName + end - def getClusterName - return @@ClusterName if !@@ClusterName.nil? - @@ClusterName = "None" - begin - #try getting resource ID for aks - cluster = ENV['AKS_RESOURCE_ID'] - if cluster && !cluster.nil? && !cluster.empty? - @@ClusterName = cluster.split("/").last - else - cluster = ENV['ACS_RESOURCE_NAME'] - if cluster && !cluster.nil? && !cluster.empty? - @@ClusterName = cluster - else - kubesystemResourceUri = "namespaces/" + @@KubeSystemNamespace + "/pods" - @Log.info("KubernetesApiClient::getClusterName : Getting pods from Kube API @ #{Time.now.utc.iso8601}") - podInfo = JSON.parse(getKubeResourceInfo(kubesystemResourceUri).body) - @Log.info("KubernetesApiClient::getClusterName : Done getting pods from Kube API @ #{Time.now.utc.iso8601}") - podInfo['items'].each do |items| - if items['metadata']['name'].include? "kube-controller-manager" - items['spec']['containers'][0]['command'].each do |command| - if command.include? "--cluster-name" - @@ClusterName = command.split('=')[1] - end - end - end - end - end - end - rescue => error - @Log.warn("getClusterName failed: #{error}") - end - return @@ClusterName - end + def getClusterId + return @@ClusterId if !@@ClusterId.nil? 
+ #By default initialize ClusterId to ClusterName. + # In ACS/On-prem, we need to figure out how we can generate ClusterId + @@ClusterId = getClusterName + begin + cluster = ENV["AKS_RESOURCE_ID"] + if cluster && !cluster.nil? && !cluster.empty? + @@ClusterId = cluster + end + rescue => error + @Log.warn("getClusterId failed: #{error}") + end + return @@ClusterId + end - def getClusterId - return @@ClusterId if !@@ClusterId.nil? - #By default initialize ClusterId to ClusterName. - # In ACS/On-prem, we need to figure out how we can generate ClusterId - @@ClusterId = getClusterName - begin - cluster = ENV['AKS_RESOURCE_ID'] - if cluster && !cluster.nil? && !cluster.empty? - @@ClusterId = cluster - end - rescue => error - @Log.warn("getClusterId failed: #{error}") - end - return @@ClusterId + def isNodeMaster + return @@IsNodeMaster if !@@IsNodeMaster.nil? + @@IsNodeMaster = false + begin + @Log.info("KubernetesApiClient::isNodeMaster : Getting nodes from Kube API @ #{Time.now.utc.iso8601}") + allNodesInfo = JSON.parse(getKubeResourceInfo("nodes").body) + @Log.info("KubernetesApiClient::isNodeMaster : Done getting nodes from Kube API @ #{Time.now.utc.iso8601}") + if !allNodesInfo.nil? && !allNodesInfo.empty? + thisNodeName = OMS::Common.get_hostname + allNodesInfo["items"].each do |item| + if item["metadata"]["name"].casecmp(thisNodeName) == 0 + if item["metadata"]["labels"]["kubernetes.io/role"].to_s.include?("master") || item["metadata"]["labels"]["role"].to_s.include?("master") + @@IsNodeMaster = true + end + break end + end + end + rescue => error + @Log.warn("KubernetesApiClient::isNodeMaster : node role request failed: #{error}") + end - def isNodeMaster - return @@IsNodeMaster if !@@IsNodeMaster.nil? 
- @@IsNodeMaster = false - begin - @Log.info("KubernetesApiClient::isNodeMaster : Getting nodes from Kube API @ #{Time.now.utc.iso8601}") - allNodesInfo = JSON.parse(getKubeResourceInfo('nodes').body) - @Log.info("KubernetesApiClient::isNodeMaster : Done getting nodes from Kube API @ #{Time.now.utc.iso8601}") - if !allNodesInfo.nil? && !allNodesInfo.empty? - thisNodeName = OMS::Common.get_hostname - allNodesInfo['items'].each do |item| - if item['metadata']['name'].casecmp(thisNodeName) == 0 - if item['metadata']['labels']["kubernetes.io/role"].to_s.include?("master") || item['metadata']['labels']["role"].to_s.include?("master") - @@IsNodeMaster = true - end - break - end - end - end - rescue => error - @Log.warn("KubernetesApiClient::isNodeMaster : node role request failed: #{error}") - end - - return @@IsNodeMaster - end + return @@IsNodeMaster + end - #def isValidRunningNode - # return @@IsValidRunningNode if !@@IsValidRunningNode.nil? - # @@IsValidRunningNode = false - # begin - # thisNodeName = OMS::Common.get_hostname - # if isLinuxCluster - # # Run on agent node [0] - # @@IsValidRunningNode = !isNodeMaster && thisNodeName.to_s.split('-').last == '0' - # else - # # Run on master node [0] - # @@IsValidRunningNode = isNodeMaster && thisNodeName.to_s.split('-').last == '0' - # end - # rescue => error - # @Log.warn("Checking Node Type failed: #{error}") - # end - # if(@@IsValidRunningNode == true) - # @Log.info("Electing current node to talk to k8 api") - # else - # @Log.info("Not Electing current node to talk to k8 api") - # end - # return @@IsValidRunningNode - #end - - #def isLinuxCluster - # return @@IsLinuxCluster if !@@IsLinuxCluster.nil? 
- # @@IsLinuxCluster = true - # begin - # @Log.info("KubernetesApiClient::isLinuxCluster : Getting nodes from Kube API @ #{Time.now.utc.iso8601}") - # allNodesInfo = JSON.parse(getKubeResourceInfo('nodes').body) - # @Log.info("KubernetesApiClient::isLinuxCluster : Done getting nodes from Kube API @ #{Time.now.utc.iso8601}") - # if !allNodesInfo.nil? && !allNodesInfo.empty? - # allNodesInfo['items'].each do |item| - # if !(item['status']['nodeInfo']['operatingSystem'].casecmp('linux') == 0) - # @@IsLinuxCluster = false - # break - # end - # end - # end - # rescue => error - # @Log.warn("KubernetesApiClient::isLinuxCluster : node role request failed: #{error}") - # end - # return @@IsLinuxCluster - #end - - # returns an arry of pods (json) - def getPods(namespace) - pods = [] - begin - kubesystemResourceUri = "namespaces/" + namespace + "/pods" - podInfo = JSON.parse(getKubeResourceInfo(kubesystemResourceUri).body) - podInfo['items'].each do |items| - pods.push items - end - rescue => error - @Log.warn("List pods request failed: #{error}") - end - return pods - end + #def isValidRunningNode + # return @@IsValidRunningNode if !@@IsValidRunningNode.nil? + # @@IsValidRunningNode = false + # begin + # thisNodeName = OMS::Common.get_hostname + # if isLinuxCluster + # # Run on agent node [0] + # @@IsValidRunningNode = !isNodeMaster && thisNodeName.to_s.split('-').last == '0' + # else + # # Run on master node [0] + # @@IsValidRunningNode = isNodeMaster && thisNodeName.to_s.split('-').last == '0' + # end + # rescue => error + # @Log.warn("Checking Node Type failed: #{error}") + # end + # if(@@IsValidRunningNode == true) + # @Log.info("Electing current node to talk to k8 api") + # else + # @Log.info("Not Electing current node to talk to k8 api") + # end + # return @@IsValidRunningNode + #end + + #def isLinuxCluster + # return @@IsLinuxCluster if !@@IsLinuxCluster.nil? 
+ # @@IsLinuxCluster = true + # begin + # @Log.info("KubernetesApiClient::isLinuxCluster : Getting nodes from Kube API @ #{Time.now.utc.iso8601}") + # allNodesInfo = JSON.parse(getKubeResourceInfo('nodes').body) + # @Log.info("KubernetesApiClient::isLinuxCluster : Done getting nodes from Kube API @ #{Time.now.utc.iso8601}") + # if !allNodesInfo.nil? && !allNodesInfo.empty? + # allNodesInfo['items'].each do |item| + # if !(item['status']['nodeInfo']['operatingSystem'].casecmp('linux') == 0) + # @@IsLinuxCluster = false + # break + # end + # end + # end + # rescue => error + # @Log.warn("KubernetesApiClient::isLinuxCluster : node role request failed: #{error}") + # end + # return @@IsLinuxCluster + #end + + # returns an arry of pods (json) + def getPods(namespace) + pods = [] + begin + kubesystemResourceUri = "namespaces/" + namespace + "/pods" + podInfo = JSON.parse(getKubeResourceInfo(kubesystemResourceUri).body) + podInfo["items"].each do |items| + pods.push items + end + rescue => error + @Log.warn("List pods request failed: #{error}") + end + return pods + end - def getContainerIDs(namespace) - containers = Hash.new - begin - kubesystemResourceUri = "namespaces/" + namespace + "/pods" - @Log.info("KubernetesApiClient::getContainerIDs : Getting pods from Kube API @ #{Time.now.utc.iso8601}") - podInfo = JSON.parse(getKubeResourceInfo(kubesystemResourceUri).body) - @Log.info("KubernetesApiClient::getContainerIDs : Done getting pods from Kube API @ #{Time.now.utc.iso8601}") - podInfo['items'].each do |item| - if (!item['status'].nil? && !item['status'].empty? && !item['status']['containerStatuses'].nil? && !item['status']['containerStatuses'].empty?) 
- item['status']['containerStatuses'].each do |cntr| - containers[cntr['containerID']] = "kube-system" - end - end - end - rescue => error - @Log.warn("KubernetesApiClient::getContainerIDs : List ContainerIDs request failed: #{error}") + # returns a hash of windows node names and their internal IPs + def getWindowsNodes + winNodes = [] + begin + nodeInventory = JSON.parse(getKubeResourceInfo("nodes").body) + @Log.info "KubernetesAPIClient::getWindowsNodes : Got nodes from kube api" + # Resetting the windows node cache + @@WinNodeArray.clear + if (!nodeInventory.empty?) + nodeInventory["items"].each do |item| + # check for windows operating system in node metadata + winNode = {} + nodeStatus = item["status"] + nodeMetadata = item["metadata"] + if !nodeStatus.nil? && !nodeStatus["nodeInfo"].nil? && !nodeStatus["nodeInfo"]["operatingSystem"].nil? + operatingSystem = nodeStatus["nodeInfo"]["operatingSystem"] + if (operatingSystem.is_a?(String) && operatingSystem.casecmp("windows") == 0) + # Adding windows nodes to winNodeArray so that it can be used in kubepodinventory to send ContainerInventory data + # to get images and image tags for containers in windows nodes + if !nodeMetadata.nil? && !nodeMetadata["name"].nil? + @@WinNodeArray.push(nodeMetadata["name"]) end - return containers + nodeStatusAddresses = nodeStatus["addresses"] + if !nodeStatusAddresses.nil? 
+ nodeStatusAddresses.each do |address| + winNode[address["type"]] = address["address"] + end + winNodes.push(winNode) + end + end end + end + end + return winNodes + rescue => error + @Log.warn("Error in get windows nodes: #{error}") + return nil + end + end - def getContainerLogs(namespace, pod, container, showTimeStamp) - containerLogs = "" - begin - kubesystemResourceUri = "namespaces/" + namespace + "/pods/" + pod + "/log" + "?container=" + container - if showTimeStamp - kubesystemResourceUri += "×tamps=true" - end - @Log.info("KubernetesApiClient::getContainerLogs : Getting logs from Kube API @ #{Time.now.utc.iso8601}") - containerLogs = getKubeResourceInfo(kubesystemResourceUri).body - @Log.info("KubernetesApiClient::getContainerLogs : Done getting logs from Kube API @ #{Time.now.utc.iso8601}") - rescue => error - @Log.warn("Pod logs request failed: #{error}") - end - return containerLogs + def getWindowsNodesArray + return @@WinNodeArray + end + + def getContainerIDs(namespace) + containers = Hash.new + begin + kubesystemResourceUri = "namespaces/" + namespace + "/pods" + @Log.info("KubernetesApiClient::getContainerIDs : Getting pods from Kube API @ #{Time.now.utc.iso8601}") + podInfo = JSON.parse(getKubeResourceInfo(kubesystemResourceUri).body) + @Log.info("KubernetesApiClient::getContainerIDs : Done getting pods from Kube API @ #{Time.now.utc.iso8601}") + podInfo["items"].each do |item| + if (!item["status"].nil? && !item["status"].empty? && !item["status"]["containerStatuses"].nil? && !item["status"]["containerStatuses"].empty?) 
+ item["status"]["containerStatuses"].each do |cntr| + containers[cntr["containerID"]] = "kube-system" end + end + end + rescue => error + @Log.warn("KubernetesApiClient::getContainerIDs : List ContainerIDs request failed: #{error}") + end + return containers + end + + def getContainerLogs(namespace, pod, container, showTimeStamp) + containerLogs = "" + begin + kubesystemResourceUri = "namespaces/" + namespace + "/pods/" + pod + "/log" + "?container=" + container + if showTimeStamp + kubesystemResourceUri += "×tamps=true" + end + @Log.info("KubernetesApiClient::getContainerLogs : Getting logs from Kube API @ #{Time.now.utc.iso8601}") + containerLogs = getKubeResourceInfo(kubesystemResourceUri).body + @Log.info("KubernetesApiClient::getContainerLogs : Done getting logs from Kube API @ #{Time.now.utc.iso8601}") + rescue => error + @Log.warn("Pod logs request failed: #{error}") + end + return containerLogs + end + + def getContainerLogsSinceTime(namespace, pod, container, since, showTimeStamp) + containerLogs = "" + begin + kubesystemResourceUri = "namespaces/" + namespace + "/pods/" + pod + "/log" + "?container=" + container + "&sinceTime=" + since + kubesystemResourceUri = URI.escape(kubesystemResourceUri, ":.+") # HTML URL Encoding for date + + if showTimeStamp + kubesystemResourceUri += "×tamps=true" + end + @Log.info("calling #{kubesystemResourceUri}") + @Log.info("KubernetesApiClient::getContainerLogsSinceTime : Getting logs from Kube API @ #{Time.now.utc.iso8601}") + containerLogs = getKubeResourceInfo(kubesystemResourceUri).body + @Log.info("KubernetesApiClient::getContainerLogsSinceTime : Done getting logs from Kube API @ #{Time.now.utc.iso8601}") + rescue => error + @Log.warn("Pod logs request failed: #{error}") + end + return containerLogs + end - def getContainerLogsSinceTime(namespace, pod, container, since, showTimeStamp) - containerLogs = "" - begin - kubesystemResourceUri = "namespaces/" + namespace + "/pods/" + pod + "/log" + "?container=" + container 
+ "&sinceTime=" + since - kubesystemResourceUri = URI.escape(kubesystemResourceUri, ":.+") # HTML URL Encoding for date - - if showTimeStamp - kubesystemResourceUri += "×tamps=true" - end - @Log.info("calling #{kubesystemResourceUri}") - @Log.info("KubernetesApiClient::getContainerLogsSinceTime : Getting logs from Kube API @ #{Time.now.utc.iso8601}") - containerLogs = getKubeResourceInfo(kubesystemResourceUri).body - @Log.info("KubernetesApiClient::getContainerLogsSinceTime : Done getting logs from Kube API @ #{Time.now.utc.iso8601}") - rescue => error - @Log.warn("Pod logs request failed: #{error}") + def getContainerResourceRequestsAndLimits(metricJSON, metricCategory, metricNameToCollect, metricNametoReturn) + metricItems = [] + begin + clusterId = getClusterId + metricInfo = metricJSON + metricInfo["items"].each do |pod| + podNameSpace = pod["metadata"]["namespace"] + if podNameSpace.eql?("kube-system") && !pod["metadata"].key?("ownerReferences") + # The above case seems to be the only case where you have horizontal scaling of pods + # but no controller, in which case cAdvisor picks up kubernetes.io/config.hash + # instead of the actual poduid. Since this uid is not being surface into the UX + # its ok to use this. + # Use kubernetes.io/config.hash to be able to correlate with cadvisor data + podUid = pod["metadata"]["annotations"]["kubernetes.io/config.hash"] + else + podUid = pod["metadata"]["uid"] + end + if (!pod["spec"]["containers"].nil? && !pod["spec"]["nodeName"].nil?) + nodeName = pod["spec"]["nodeName"] + pod["spec"]["containers"].each do |container| + containerName = container["name"] + metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z + if (!container["resources"].nil? && !container["resources"].empty? && !container["resources"][metricCategory].nil? && !container["resources"][metricCategory][metricNameToCollect].nil?) 
+ metricValue = getMetricNumericValue(metricNameToCollect, container["resources"][metricCategory][metricNameToCollect]) + + metricItem = {} + metricItem["DataItems"] = [] + + metricProps = {} + metricProps["Timestamp"] = metricTime + metricProps["Host"] = nodeName + metricProps["ObjectName"] = "K8SContainer" + metricProps["InstanceName"] = clusterId + "/" + podUid + "/" + containerName + + metricProps["Collections"] = [] + metricCollections = {} + metricCollections["CounterName"] = metricNametoReturn + metricCollections["Value"] = metricValue + + metricProps["Collections"].push(metricCollections) + metricItem["DataItems"].push(metricProps) + metricItems.push(metricItem) + #No container level limit for the given metric, so default to node level limit + else + nodeMetricsHashKey = clusterId + "/" + nodeName + "_" + "allocatable" + "_" + metricNameToCollect + if (metricCategory == "limits" && @@NodeMetrics.has_key?(nodeMetricsHashKey)) + metricValue = @@NodeMetrics[nodeMetricsHashKey] + #@Log.info("Limits not set for container #{clusterId + "/" + podUid + "/" + containerName} using node level limits: #{nodeMetricsHashKey}=#{metricValue} ") + metricItem = {} + metricItem["DataItems"] = [] + + metricProps = {} + metricProps["Timestamp"] = metricTime + metricProps["Host"] = nodeName + metricProps["ObjectName"] = "K8SContainer" + metricProps["InstanceName"] = clusterId + "/" + podUid + "/" + containerName + + metricProps["Collections"] = [] + metricCollections = {} + metricCollections["CounterName"] = metricNametoReturn + metricCollections["Value"] = metricValue + + metricProps["Collections"].push(metricCollections) + metricItem["DataItems"].push(metricProps) + metricItems.push(metricItem) end - return containerLogs + end end + end + end + rescue => error + @Log.warn("getcontainerResourceRequestsAndLimits failed: #{error} for metric #{metricCategory} #{metricNameToCollect}") + return metricItems + end + return metricItems + end #getContainerResourceRequestAndLimits - def 
getContainerResourceRequestsAndLimits(metricJSON, metricCategory, metricNameToCollect, metricNametoReturn) - metricItems = [] - begin - clusterId = getClusterId - metricInfo = metricJSON - metricInfo['items'].each do |pod| - podNameSpace = pod['metadata']['namespace'] - if podNameSpace.eql?("kube-system") && !pod['metadata'].key?("ownerReferences") - # The above case seems to be the only case where you have horizontal scaling of pods - # but no controller, in which case cAdvisor picks up kubernetes.io/config.hash - # instead of the actual poduid. Since this uid is not being surface into the UX - # its ok to use this. - # Use kubernetes.io/config.hash to be able to correlate with cadvisor data - podUid = pod['metadata']['annotations']['kubernetes.io/config.hash'] - else - podUid = pod['metadata']['uid'] - end - if (!pod['spec']['containers'].nil? && !pod['spec']['nodeName'].nil?) - nodeName = pod['spec']['nodeName'] - pod['spec']['containers'].each do |container| - containerName = container['name'] - metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z - if (!container['resources'].nil? && !container['resources'].empty? && !container['resources'][metricCategory].nil? && !container['resources'][metricCategory][metricNameToCollect].nil?) 
- metricValue = getMetricNumericValue(metricNameToCollect, container['resources'][metricCategory][metricNameToCollect]) - - metricItem = {} - metricItem['DataItems'] = [] - - metricProps = {} - metricProps['Timestamp'] = metricTime - metricProps['Host'] = nodeName - metricProps['ObjectName'] = "K8SContainer" - metricProps['InstanceName'] = clusterId + "/" + podUid + "/" + containerName - - metricProps['Collections'] = [] - metricCollections = {} - metricCollections['CounterName'] = metricNametoReturn - metricCollections['Value'] = metricValue - - metricProps['Collections'].push(metricCollections) - metricItem['DataItems'].push(metricProps) - metricItems.push(metricItem) - #No container level limit for the given metric, so default to node level limit - else - nodeMetricsHashKey = clusterId + "/" + nodeName + "_" + "allocatable" + "_" + metricNameToCollect - if (metricCategory == "limits" && @@NodeMetrics.has_key?(nodeMetricsHashKey)) - - metricValue = @@NodeMetrics[nodeMetricsHashKey] - #@Log.info("Limits not set for container #{clusterId + "/" + podUid + "/" + containerName} using node level limits: #{nodeMetricsHashKey}=#{metricValue} ") - metricItem = {} - metricItem['DataItems'] = [] - - metricProps = {} - metricProps['Timestamp'] = metricTime - metricProps['Host'] = nodeName - metricProps['ObjectName'] = "K8SContainer" - metricProps['InstanceName'] = clusterId + "/" + podUid + "/" + containerName - - metricProps['Collections'] = [] - metricCollections = {} - metricCollections['CounterName'] = metricNametoReturn - metricCollections['Value'] = metricValue - - metricProps['Collections'].push(metricCollections) - metricItem['DataItems'].push(metricProps) - metricItems.push(metricItem) - end - end - end - end - end - rescue => error - @Log.warn("getcontainerResourceRequestsAndLimits failed: #{error} for metric #{metricCategory} #{metricNameToCollect}") - return metricItems - end - return metricItems - end #getContainerResourceRequestAndLimits - - def 
parseNodeLimits(metricJSON, metricCategory, metricNameToCollect, metricNametoReturn) - metricItems = [] - begin - metricInfo = metricJSON - clusterId = getClusterId - #Since we are getting all node data at the same time and kubernetes doesnt specify a timestamp for the capacity and allocation metrics, - #if we are coming up with the time it should be same for all nodes - metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z - metricInfo['items'].each do |node| - if (!node['status'][metricCategory].nil?) - - # metricCategory can be "capacity" or "allocatable" and metricNameToCollect can be "cpu" or "memory" - metricValue = getMetricNumericValue(metricNameToCollect, node['status'][metricCategory][metricNameToCollect]) - - metricItem = {} - metricItem['DataItems'] = [] - metricProps = {} - metricProps['Timestamp'] = metricTime - metricProps['Host'] = node['metadata']['name'] - metricProps['ObjectName'] = "K8SNode" - metricProps['InstanceName'] = clusterId + "/" + node['metadata']['name'] - metricProps['Collections'] = [] - metricCollections = {} - metricCollections['CounterName'] = metricNametoReturn - metricCollections['Value'] = metricValue - - metricProps['Collections'].push(metricCollections) - metricItem['DataItems'].push(metricProps) - metricItems.push(metricItem) - #push node level metrics to a inmem hash so that we can use it looking up at container level. 
- #Currently if container level cpu & memory limits are not defined we default to node level limits - @@NodeMetrics[clusterId + "/" + node['metadata']['name'] + "_" + metricCategory + "_" + metricNameToCollect] = metricValue - #@Log.info ("Node metric hash: #{@@NodeMetrics}") - end - end - rescue => error - @Log.warn("parseNodeLimits failed: #{error} for metric #{metricCategory} #{metricNameToCollect}") - end - return metricItems - end #parseNodeLimits - - def getMetricNumericValue(metricName, metricVal) - metricValue = metricVal - begin - case metricName - when "memory" #convert to bytes for memory - #https://kubernetes.io/docs/tasks/configure-pod-container/assign-memory-resource/ - if (metricValue.end_with?("Ki")) - metricValue.chomp!("Ki") - metricValue = Float(metricValue) * 1024.0 ** 1 - elsif (metricValue.end_with?("Mi")) - metricValue.chomp!("Mi") - metricValue = Float(metricValue) * 1024.0 ** 2 - elsif (metricValue.end_with?("Gi")) - metricValue.chomp!("Gi") - metricValue = Float(metricValue) * 1024.0 ** 3 - elsif (metricValue.end_with?("Ti")) - metricValue.chomp!("Ti") - metricValue = Float(metricValue) * 1024.0 ** 4 - elsif (metricValue.end_with?("Pi")) - metricValue.chomp!("Pi") - metricValue = Float(metricValue) * 1024.0 ** 5 - elsif (metricValue.end_with?("Ei")) - metricValue.chomp!("Ei") - metricValue = Float(metricValue) * 1024.0 ** 6 - elsif (metricValue.end_with?("Zi")) - metricValue.chomp!("Zi") - metricValue = Float(metricValue) * 1024.0 ** 7 - elsif (metricValue.end_with?("Yi")) - metricValue.chomp!("Yi") - metricValue = Float(metricValue) * 1024.0 ** 8 - elsif (metricValue.end_with?("K")) - metricValue.chomp!("K") - metricValue = Float(metricValue) * 1000.0 ** 1 - elsif (metricValue.end_with?("M")) - metricValue.chomp!("M") - metricValue = Float(metricValue) * 1000.0 ** 2 - elsif (metricValue.end_with?("G")) - metricValue.chomp!("G") - metricValue = Float(metricValue) * 1000.0 ** 3 - elsif (metricValue.end_with?("T")) - metricValue.chomp!("T") 
- metricValue = Float(metricValue) * 1000.0 ** 4 - elsif (metricValue.end_with?("P")) - metricValue.chomp!("P") - metricValue = Float(metricValue) * 1000.0 ** 5 - elsif (metricValue.end_with?("E")) - metricValue.chomp!("E") - metricValue = Float(metricValue) * 1000.0 ** 6 - elsif (metricValue.end_with?("Z")) - metricValue.chomp!("Z") - metricValue = Float(metricValue) * 1000.0 ** 7 - elsif (metricValue.end_with?("Y")) - metricValue.chomp!("Y") - metricValue = Float(metricValue) * 1000.0 ** 8 - else #assuming there are no units specified, it is bytes (the below conversion will fail for other unsupported 'units') - metricValue = Float(metricValue) - end - when "cpu" #convert to nanocores for cpu - #https://kubernetes.io/docs/tasks/configure-pod-container/assign-cpu-resource/ - if (metricValue.end_with?("m")) - metricValue.chomp!("m") - metricValue = Float(metricValue) * 1000.0 ** 2 - else #assuming no units specified, it is cores that we are converting to nanocores (the below conversion will fail for other unsupported 'units') - metricValue = Float(metricValue) * 1000.0 ** 3 - end - else - @Log.warn("getMetricNumericValue: Unsupported metric #{metricName}. Returning 0 for metric value") - metricValue = 0 - end #case statement - rescue => error - @Log.warn("getMetricNumericValue failed: #{error} for metric #{metricName} with value #{metricVal}. Returning 0 formetric value") - return 0 - end - return metricValue - end # getMetricNumericValue + def parseNodeLimits(metricJSON, metricCategory, metricNameToCollect, metricNametoReturn) + metricItems = [] + begin + metricInfo = metricJSON + clusterId = getClusterId + #Since we are getting all node data at the same time and kubernetes doesnt specify a timestamp for the capacity and allocation metrics, + #if we are coming up with the time it should be same for all nodes + metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z + metricInfo["items"].each do |node| + if (!node["status"][metricCategory].nil?) 
+ + # metricCategory can be "capacity" or "allocatable" and metricNameToCollect can be "cpu" or "memory" + metricValue = getMetricNumericValue(metricNameToCollect, node["status"][metricCategory][metricNameToCollect]) + + metricItem = {} + metricItem["DataItems"] = [] + metricProps = {} + metricProps["Timestamp"] = metricTime + metricProps["Host"] = node["metadata"]["name"] + metricProps["ObjectName"] = "K8SNode" + metricProps["InstanceName"] = clusterId + "/" + node["metadata"]["name"] + metricProps["Collections"] = [] + metricCollections = {} + metricCollections["CounterName"] = metricNametoReturn + metricCollections["Value"] = metricValue + + metricProps["Collections"].push(metricCollections) + metricItem["DataItems"].push(metricProps) + metricItems.push(metricItem) + #push node level metrics to a inmem hash so that we can use it looking up at container level. + #Currently if container level cpu & memory limits are not defined we default to node level limits + @@NodeMetrics[clusterId + "/" + node["metadata"]["name"] + "_" + metricCategory + "_" + metricNameToCollect] = metricValue + #@Log.info ("Node metric hash: #{@@NodeMetrics}") + end end - end + rescue => error + @Log.warn("parseNodeLimits failed: #{error} for metric #{metricCategory} #{metricNameToCollect}") + end + return metricItems + end #parseNodeLimits + def getMetricNumericValue(metricName, metricVal) + metricValue = metricVal.downcase + begin + case metricName + when "memory" #convert to bytes for memory + #https://kubernetes.io/docs/tasks/configure-pod-container/assign-memory-resource/ + if (metricValue.end_with?("ki")) + metricValue.chomp!("ki") + metricValue = Float(metricValue) * 1024.0 ** 1 + elsif (metricValue.end_with?("mi")) + metricValue.chomp!("mi") + metricValue = Float(metricValue) * 1024.0 ** 2 + elsif (metricValue.end_with?("gi")) + metricValue.chomp!("gi") + metricValue = Float(metricValue) * 1024.0 ** 3 + elsif (metricValue.end_with?("ti")) + metricValue.chomp!("ti") + metricValue = 
Float(metricValue) * 1024.0 ** 4 + elsif (metricValue.end_with?("pi")) + metricValue.chomp!("pi") + metricValue = Float(metricValue) * 1024.0 ** 5 + elsif (metricValue.end_with?("ei")) + metricValue.chomp!("ei") + metricValue = Float(metricValue) * 1024.0 ** 6 + elsif (metricValue.end_with?("zi")) + metricValue.chomp!("zi") + metricValue = Float(metricValue) * 1024.0 ** 7 + elsif (metricValue.end_with?("yi")) + metricValue.chomp!("yi") + metricValue = Float(metricValue) * 1024.0 ** 8 + elsif (metricValue.end_with?("k")) + metricValue.chomp!("k") + metricValue = Float(metricValue) * 1000.0 ** 1 + elsif (metricValue.end_with?("m")) + metricValue.chomp!("m") + metricValue = Float(metricValue) * 1000.0 ** 2 + elsif (metricValue.end_with?("g")) + metricValue.chomp!("g") + metricValue = Float(metricValue) * 1000.0 ** 3 + elsif (metricValue.end_with?("t")) + metricValue.chomp!("t") + metricValue = Float(metricValue) * 1000.0 ** 4 + elsif (metricValue.end_with?("p")) + metricValue.chomp!("p") + metricValue = Float(metricValue) * 1000.0 ** 5 + elsif (metricValue.end_with?("e")) + metricValue.chomp!("e") + metricValue = Float(metricValue) * 1000.0 ** 6 + elsif (metricValue.end_with?("z")) + metricValue.chomp!("z") + metricValue = Float(metricValue) * 1000.0 ** 7 + elsif (metricValue.end_with?("y")) + metricValue.chomp!("y") + metricValue = Float(metricValue) * 1000.0 ** 8 + else #assuming there are no units specified, it is bytes (the below conversion will fail for other unsupported 'units') + metricValue = Float(metricValue) + end + when "cpu" #convert to nanocores for cpu + #https://kubernetes.io/docs/tasks/configure-pod-container/assign-cpu-resource/ + if (metricValue.end_with?("m")) + metricValue.chomp!("m") + metricValue = Float(metricValue) * 1000.0 ** 2 + else #assuming no units specified, it is cores that we are converting to nanocores (the below conversion will fail for other unsupported 'units') + metricValue = Float(metricValue) * 1000.0 ** 3 + end + else + 
@Log.warn("getMetricNumericValue: Unsupported metric #{metricName}. Returning 0 for metric value") + metricValue = 0 + end #case statement + rescue => error + @Log.warn("getMetricNumericValue failed: #{error} for metric #{metricName} with value #{metricVal}. Returning 0 formetric value") + return 0 + end + return metricValue + end # getMetricNumericValue + end +end diff --git a/source/code/plugin/filter_cadvisor2mdm.rb b/source/code/plugin/filter_cadvisor2mdm.rb index 94f2107cc..a6e643e45 100644 --- a/source/code/plugin/filter_cadvisor2mdm.rb +++ b/source/code/plugin/filter_cadvisor2mdm.rb @@ -10,45 +10,45 @@ module Fluent class CAdvisor2MdmFilter < Filter Fluent::Plugin.register_filter('filter_cadvisor2mdm', self) - + config_param :enable_log, :integer, :default => 0 config_param :log_path, :string, :default => '/var/opt/microsoft/docker-cimprov/log/filter_cadvisor2mdm.log' config_param :custom_metrics_azure_regions, :string config_param :metrics_to_collect, :string, :default => 'cpuUsageNanoCores,memoryWorkingSetBytes,memoryRssBytes' - + @@cpu_usage_milli_cores = 'cpuUsageMillicores' @@cpu_usage_nano_cores = 'cpuusagenanocores' @@object_name_k8s_node = 'K8SNode' @@hostName = (OMS::Common.get_hostname) @@custom_metrics_template = ' - { - "time": "%{timestamp}", - "data": { - "baseData": { - "metric": "%{metricName}", - "namespace": "Insights.Container/nodes", - "dimNames": [ + { + "time": "%{timestamp}", + "data": { + "baseData": { + "metric": "%{metricName}", + "namespace": "Insights.Container/nodes", + "dimNames": [ "host" - ], - "series": [ - { - "dimValues": [ + ], + "series": [ + { + "dimValues": [ "%{hostvalue}" - ], + ], "min": %{metricminvalue}, - "max": %{metricmaxvalue}, - "sum": %{metricsumvalue}, - "count": 1 - } - ] - } - } + "max": %{metricmaxvalue}, + "sum": %{metricsumvalue}, + "count": 1 + } + ] + } + } }' - + @@metric_name_metric_percentage_name_hash = { - @@cpu_usage_milli_cores => "cpuUsagePercentage", + @@cpu_usage_milli_cores => 
"cpuUsagePercentage", "memoryRssBytes" => "memoryRssPercentage", - "memoryWorkingSetBytes" => "memoryWorkingSetPercentage" + "memoryWorkingSetBytes" => "memoryWorkingSetPercentage" } @process_incoming_stream = true @@ -61,7 +61,7 @@ def initialize def configure(conf) super @log = nil - + if @enable_log @log = Logger.new(@log_path, 1, 5000000) @log.debug {'Starting filter_cadvisor2mdm plugin'} @@ -70,15 +70,19 @@ def configure(conf) def start super - @process_incoming_stream = CustomMetricsUtils.check_custom_metrics_availability(@custom_metrics_azure_regions) - @metrics_to_collect_hash = build_metrics_hash - @log.debug "After check_custom_metrics_availability process_incoming_stream #{@process_incoming_stream}" - - # initialize cpu and memory limit - if @process_incoming_stream - @cpu_capacity = 0.0 - @memory_capacity = 0.0 - ensure_cpu_memory_capacity_set + begin + @process_incoming_stream = CustomMetricsUtils.check_custom_metrics_availability(@custom_metrics_azure_regions) + @metrics_to_collect_hash = build_metrics_hash + @log.debug "After check_custom_metrics_availability process_incoming_stream #{@process_incoming_stream}" + + # initialize cpu and memory limit + if @process_incoming_stream + @cpu_capacity = 0.0 + @memory_capacity = 0.0 + ensure_cpu_memory_capacity_set + end + rescue => e + @log.info "Error initializing plugin #{e}" end end @@ -117,9 +121,9 @@ def filter(tag, time, record) if @memory_capacity != 0.0 percentage_metric_value = metric_value*100/@memory_capacity end - end + end return get_metric_records(record, metric_name, metric_value, percentage_metric_value) - else + else return [] end else @@ -140,13 +144,13 @@ def ensure_cpu_memory_capacity_set return end - begin + begin nodeInventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo("nodes?fieldSelector=metadata.name%3D#{@@hostName}").body) rescue Exception => e @log.info "Error when getting nodeInventory from kube API. 
Exception: #{e.class} Message: #{e.message} " ApplicationInsightsUtility.sendExceptionTelemetry(e.backtrace) end - if !nodeInventory.nil? + if !nodeInventory.nil? cpu_capacity_json = KubernetesApiClient.parseNodeLimits(nodeInventory, "capacity", "cpu", "cpuCapacityNanoCores") if !cpu_capacity_json.nil? && !cpu_capacity_json[0]['DataItems'][0]['Collections'][0]['Value'].to_s.nil? @cpu_capacity = cpu_capacity_json[0]['DataItems'][0]['Collections'][0]['Value'] @@ -163,7 +167,7 @@ def ensure_cpu_memory_capacity_set end end end - + def get_metric_records(record, metric_name, metric_value, percentage_metric_value) records = [] custommetricrecord = @@custom_metrics_template % { @@ -194,20 +198,20 @@ def get_metric_records(record, metric_name, metric_value, percentage_metric_valu return records end - + def filter_stream(tag, es) new_es = MultiEventStream.new - ensure_cpu_memory_capacity_set - es.each { |time, record| - begin + begin + ensure_cpu_memory_capacity_set + es.each { |time, record| filtered_records = filter(tag, time, record) - filtered_records.each {|filtered_record| + filtered_records.each {|filtered_record| new_es.add(time, filtered_record) if filtered_record - } if filtered_records - rescue => e - router.emit_error_event(tag, time, record, e) - end - } + } if filtered_records + } + rescue => e + @log.info "Error in filter_stream #{e.message}" + end new_es end end diff --git a/source/code/plugin/in_cadvisor_perf.rb b/source/code/plugin/in_cadvisor_perf.rb index a857aa6b9..f5f65f01b 100644 --- a/source/code/plugin/in_cadvisor_perf.rb +++ b/source/code/plugin/in_cadvisor_perf.rb @@ -2,90 +2,88 @@ # frozen_string_literal: true module Fluent - - class CAdvisor_Perf_Input < Input - Plugin.register_input('cadvisorperf', self) - - def initialize - super - require 'yaml' - require 'json' - - require_relative 'CAdvisorMetricsAPIClient' - require_relative 'oms_common' - require_relative 'omslog' - end - - config_param :run_interval, :time, :default => '1m' - 
config_param :tag, :string, :default => "oms.api.cadvisorperf" - config_param :mdmtag, :string, :default => "mdm.cadvisorperf" - - def configure (conf) - super + class CAdvisor_Perf_Input < Input + Plugin.register_input("cadvisorperf", self) + + def initialize + super + require "yaml" + require "json" + + require_relative "CAdvisorMetricsAPIClient" + require_relative "oms_common" + require_relative "omslog" + end + + config_param :run_interval, :time, :default => "1m" + config_param :tag, :string, :default => "oms.api.cadvisorperf" + config_param :mdmtag, :string, :default => "mdm.cadvisorperf" + + def configure(conf) + super + end + + def start + if @run_interval + @finished = false + @condition = ConditionVariable.new + @mutex = Mutex.new + @thread = Thread.new(&method(:run_periodic)) end - - def start - if @run_interval - @finished = false - @condition = ConditionVariable.new - @mutex = Mutex.new - @thread = Thread.new(&method(:run_periodic)) - end + end + + def shutdown + if @run_interval + @mutex.synchronize { + @finished = true + @condition.signal + } + @thread.join end - - def shutdown - if @run_interval - @mutex.synchronize { - @finished = true - @condition.signal - } - @thread.join + end + + def enumerate() + time = Time.now.to_f + begin + eventStream = MultiEventStream.new + metricData = CAdvisorMetricsAPIClient.getMetrics() + metricData.each do |record| + record["DataType"] = "LINUX_PERF_BLOB" + record["IPName"] = "LogManagement" + eventStream.add(time, record) if record + #router.emit(@tag, time, record) if record end - end - - def enumerate() - time = Time.now.to_f - begin - eventStream = MultiEventStream.new - metricData = CAdvisorMetricsAPIClient.getMetrics() - metricData.each do |record| - record['DataType'] = "LINUX_PERF_BLOB" - record['IPName'] = "LogManagement" - eventStream.add(time, record) if record - #router.emit(@tag, time, record) if record - end - - router.emit_stream(@tag, eventStream) if eventStream - router.emit_stream(@mdmtag, 
eventStream) if eventStream - @@istestvar = ENV['ISTEST'] - if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp('true') == 0 && eventStream.count > 0) - $log.info("cAdvisorPerfEmitStreamSuccess @ #{Time.now.utc.iso8601}") - end - rescue => errorStr - $log.warn "Failed to retrieve cadvisor metric data: #{errorStr}" - $log.debug_backtrace(errorStr.backtrace) + + router.emit_stream(@tag, eventStream) if eventStream + router.emit_stream(@mdmtag, eventStream) if eventStream + @@istestvar = ENV["ISTEST"] + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && eventStream.count > 0) + $log.info("cAdvisorPerfEmitStreamSuccess @ #{Time.now.utc.iso8601}") end + rescue => errorStr + $log.warn "Failed to retrieve cadvisor metric data: #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) end - - def run_periodic - @mutex.lock + end + + def run_periodic + @mutex.lock + done = @finished + until done + @condition.wait(@mutex, @run_interval) done = @finished - until done - @condition.wait(@mutex, @run_interval) - done = @finished - @mutex.unlock - if !done - begin - $log.info("in_cadvisor_perf::run_periodic @ #{Time.now.utc.iso8601}") - enumerate - rescue => errorStr - $log.warn "in_cadvisor_perf::run_periodic: enumerate Failed to retrieve cadvisor perf metrics: #{errorStr}" - end + @mutex.unlock + if !done + begin + $log.info("in_cadvisor_perf::run_periodic @ #{Time.now.utc.iso8601}") + enumerate + rescue => errorStr + $log.warn "in_cadvisor_perf::run_periodic: enumerate Failed to retrieve cadvisor perf metrics: #{errorStr}" end - @mutex.lock end - @mutex.unlock + @mutex.lock end - end # CAdvisor_Perf_Input + @mutex.unlock + end + end # CAdvisor_Perf_Input end # module - diff --git a/source/code/plugin/in_containerinventory.rb b/source/code/plugin/in_containerinventory.rb index f501421a2..4d83278a9 100644 --- a/source/code/plugin/in_containerinventory.rb +++ b/source/code/plugin/in_containerinventory.rb @@ -2,29 +2,28 @@ # 
frozen_string_literal: true module Fluent - class Container_Inventory_Input < Input - Plugin.register_input('containerinventory', self) + Plugin.register_input("containerinventory", self) - @@PluginName = 'ContainerInventory' - @@RunningState = 'Running' - @@FailedState = 'Failed' - @@StoppedState = 'Stopped' - @@PausedState = 'Paused' + @@PluginName = "ContainerInventory" + @@RunningState = "Running" + @@FailedState = "Failed" + @@StoppedState = "Stopped" + @@PausedState = "Paused" def initialize super - require 'json' - require_relative 'DockerApiClient' - require_relative 'ContainerInventoryState' - require_relative 'ApplicationInsightsUtility' - require_relative 'omslog' + require "json" + require_relative "DockerApiClient" + require_relative "ContainerInventoryState" + require_relative "ApplicationInsightsUtility" + require_relative "omslog" end - config_param :run_interval, :time, :default => '1m' + config_param :run_interval, :time, :default => "1m" config_param :tag, :string, :default => "oms.containerinsights.containerinventory" - - def configure (conf) + + def configure(conf) super end @@ -50,16 +49,16 @@ def shutdown def obtainContainerConfig(instance, container) begin - configValue = container['Config'] + configValue = container["Config"] if !configValue.nil? - instance['ContainerHostname'] = configValue['Hostname'] + instance["ContainerHostname"] = configValue["Hostname"] - envValue = configValue['Env'] + envValue = configValue["Env"] envValueString = (envValue.nil?) ? 
"" : envValue.to_s # Skip environment variable processing if it contains the flag AZMON_COLLECT_ENV=FALSE if /AZMON_COLLECT_ENV=FALSE/i.match(envValueString) envValueString = ["AZMON_COLLECT_ENV=FALSE"] - $log.warn("Environment Variable collection for container: #{container['Id']} skipped because AZMON_COLLECT_ENV is set to false") + $log.warn("Environment Variable collection for container: #{container["Id"]} skipped because AZMON_COLLECT_ENV is set to false") end # Restricting the ENV string value to 200kb since the size of this string can go very high if envValueString.length > 200000 @@ -68,88 +67,88 @@ def obtainContainerConfig(instance, container) if !lastIndex.nil? envValueStringTruncated = envValueStringTruncated.slice(0..lastIndex) + "]" end - instance['EnvironmentVar'] = envValueStringTruncated + instance["EnvironmentVar"] = envValueStringTruncated else - instance['EnvironmentVar'] = envValueString + instance["EnvironmentVar"] = envValueString end - cmdValue = configValue['Cmd'] + cmdValue = configValue["Cmd"] cmdValueString = (cmdValue.nil?) ? "" : cmdValue.to_s - instance['Command'] = cmdValueString + instance["Command"] = cmdValueString - instance['ComposeGroup'] = "" - labelsValue = configValue['Labels'] + instance["ComposeGroup"] = "" + labelsValue = configValue["Labels"] if !labelsValue.nil? && !labelsValue.empty? 
- instance['ComposeGroup'] = labelsValue['com.docker.compose.project'] + instance["ComposeGroup"] = labelsValue["com.docker.compose.project"] end else - $log.warn("Attempt in ObtainContainerConfig to get container: #{container['Id']} config information returned null") - end - rescue => errorStr - $log.warn("Exception in obtainContainerConfig: #{errorStr}") + $log.warn("Attempt in ObtainContainerConfig to get container: #{container["Id"]} config information returned null") end + rescue => errorStr + $log.warn("Exception in obtainContainerConfig: #{errorStr}") + end end def obtainContainerState(instance, container) begin - stateValue = container['State'] + stateValue = container["State"] if !stateValue.nil? - exitCodeValue = stateValue['ExitCode'] + exitCodeValue = stateValue["ExitCode"] # Exit codes less than 0 are not supported by the engine if exitCodeValue < 0 - exitCodeValue = 128 - $log.info("obtainContainerState::Container: #{container['Id']} returned negative exit code") + exitCodeValue = 128 + $log.info("obtainContainerState::Container: #{container["Id"]} returned negative exit code") end - instance['ExitCode'] = exitCodeValue + instance["ExitCode"] = exitCodeValue if exitCodeValue > 0 - instance['State'] = @@FailedState + instance["State"] = @@FailedState else # Set the Container status : Running/Paused/Stopped - runningValue = stateValue['Running'] + runningValue = stateValue["Running"] if runningValue - pausedValue = stateValue['Paused'] + pausedValue = stateValue["Paused"] # Checking for paused within running is true state because docker returns true for both Running and Paused fields when the container is paused if pausedValue - instance['State'] = @@PausedState + instance["State"] = @@PausedState else - instance['State'] = @@RunningState + instance["State"] = @@RunningState end else - instance['State'] = @@StoppedState + instance["State"] = @@StoppedState end end - instance['StartedTime'] = stateValue['StartedAt'] - instance['FinishedTime'] = 
stateValue['FinishedAt'] + instance["StartedTime"] = stateValue["StartedAt"] + instance["FinishedTime"] = stateValue["FinishedAt"] else - $log.info("Attempt in ObtainContainerState to get container: #{container['Id']} state information returned null") + $log.info("Attempt in ObtainContainerState to get container: #{container["Id"]} state information returned null") end - rescue => errorStr - $log.warn("Exception in obtainContainerState: #{errorStr}") + rescue => errorStr + $log.warn("Exception in obtainContainerState: #{errorStr}") end end def obtainContainerHostConfig(instance, container) begin - hostConfig = container['HostConfig'] + hostConfig = container["HostConfig"] if !hostConfig.nil? - links = hostConfig['Links'] - instance['Links'] = "" + links = hostConfig["Links"] + instance["Links"] = "" if !links.nil? linksString = links.to_s - instance['Links'] = (linksString == "null")? "" : linksString + instance["Links"] = (linksString == "null") ? "" : linksString end - portBindings = hostConfig['PortBindings'] - instance['Ports'] = "" + portBindings = hostConfig["PortBindings"] + instance["Ports"] = "" if !portBindings.nil? portBindingsString = portBindings.to_s - instance['Ports'] = (portBindingsString == "null")? "" : portBindingsString + instance["Ports"] = (portBindingsString == "null") ? "" : portBindingsString end else - $log.info("Attempt in ObtainContainerHostConfig to get container: #{container['Id']} host config information returned null") - end - rescue => errorStr - $log.warn("Exception in obtainContainerHostConfig: #{errorStr}") + $log.info("Attempt in ObtainContainerHostConfig to get container: #{container["Id"]} host config information returned null") end + rescue => errorStr + $log.warn("Exception in obtainContainerHostConfig: #{errorStr}") + end end def inspectContainer(id, nameMap) @@ -157,29 +156,29 @@ def inspectContainer(id, nameMap) begin container = DockerApiClient.dockerInspectContainer(id) if !container.nil? && !container.empty? 
- containerInstance['InstanceID'] = container['Id'] - containerInstance['CreatedTime'] = container['Created'] - containerName = container['Name'] + containerInstance["InstanceID"] = container["Id"] + containerInstance["CreatedTime"] = container["Created"] + containerName = container["Name"] if !containerName.nil? && !containerName.empty? # Remove the leading / from the name if it exists (this is an API issue) - containerInstance['ElementName'] = (containerName[0] == '/') ? containerName[1..-1] : containerName + containerInstance["ElementName"] = (containerName[0] == "/") ? containerName[1..-1] : containerName end - imageValue = container['Image'] + imageValue = container["Image"] if !imageValue.nil? && !imageValue.empty? - containerInstance['ImageId'] = imageValue + containerInstance["ImageId"] = imageValue repoImageTagArray = nameMap[imageValue] if nameMap.has_key? imageValue - containerInstance['Repository'] = repoImageTagArray[0] - containerInstance['Image'] = repoImageTagArray[1] - containerInstance['ImageTag'] = repoImageTagArray[2] + containerInstance["Repository"] = repoImageTagArray[0] + containerInstance["Image"] = repoImageTagArray[1] + containerInstance["ImageTag"] = repoImageTagArray[2] end end - obtainContainerConfig(containerInstance, container); - obtainContainerState(containerInstance, container); - obtainContainerHostConfig(containerInstance, container); + obtainContainerConfig(containerInstance, container) + obtainContainerState(containerInstance, container) + obtainContainerHostConfig(containerInstance, container) end rescue => errorStr - $log.warn("Exception in inspectContainer: #{errorStr} for container: #{id}") + $log.warn("Exception in inspectContainer: #{errorStr} for container: #{id}") end return containerInstance end @@ -199,8 +198,8 @@ def enumerate containerIds.each do |containerId| inspectedContainer = {} inspectedContainer = inspectContainer(containerId, nameMap) - inspectedContainer['Computer'] = hostname - 
inspectedContainer['CollectionTime'] = batchTime #This is the time that is mapped to become TimeGenerated + inspectedContainer["Computer"] = hostname + inspectedContainer["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated containerInventory.push inspectedContainer ContainerInventoryState.writeContainerState(inspectedContainer) end @@ -210,8 +209,8 @@ def enumerate deletedContainers.each do |deletedContainer| container = ContainerInventoryState.readContainerState(deletedContainer) if !container.nil? - container.each{|k,v| container[k]=v} - container['State'] = "Deleted" + container.each { |k, v| container[k] = v } + container["State"] = "Deleted" containerInventory.push container end end @@ -219,28 +218,28 @@ def enumerate containerInventory.each do |record| wrapper = { - "DataType"=>"CONTAINER_INVENTORY_BLOB", - "IPName"=>"ContainerInsights", - "DataItems"=>[record.each{|k,v| record[k]=v}] + "DataType" => "CONTAINER_INVENTORY_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [record.each { |k, v| record[k] = v }], } eventStream.add(emitTime, wrapper) if wrapper end router.emit_stream(@tag, eventStream) if eventStream - @@istestvar = ENV['ISTEST'] - if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp('true') == 0 && eventStream.count > 0) + @@istestvar = ENV["ISTEST"] + if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0 && eventStream.count > 0) $log.info("containerInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") end - timeDifference = (DateTime.now.to_time.to_i - @@telemetryTimeTracker).abs - timeDifferenceInMinutes = timeDifference/60 - if (timeDifferenceInMinutes >= 5) - @@telemetryTimeTracker = DateTime.now.to_time.to_i - telemetryProperties = {} - telemetryProperties['Computer'] = hostname - telemetryProperties['ContainerCount'] = containerInventory.length - ApplicationInsightsUtility.sendTelemetry(@@PluginName, telemetryProperties) - end $log.info("in_container_inventory::enumerate : Processing complete - emitted stream @ #{Time.now.utc.iso8601}") end + timeDifference = (DateTime.now.to_time.to_i - @@telemetryTimeTracker).abs + timeDifferenceInMinutes = timeDifference / 60 + if (timeDifferenceInMinutes >= 5) + @@telemetryTimeTracker = DateTime.now.to_time.to_i + telemetryProperties = {} + telemetryProperties["Computer"] = hostname + telemetryProperties["ContainerCount"] = containerInventory.length + ApplicationInsightsUtility.sendTelemetry(@@PluginName, telemetryProperties) + end rescue => errorStr $log.warn("Exception in enumerate container inventory: #{errorStr}") end @@ -265,7 +264,5 @@ def run_periodic end @mutex.unlock end - end # Container_Inventory_Input - -end # module \ No newline at end of file +end # module diff --git a/source/code/plugin/in_kube_nodes.rb b/source/code/plugin/in_kube_nodes.rb index ba1dacbe0..aabda441e 100644 --- a/source/code/plugin/in_kube_nodes.rb +++ b/source/code/plugin/in_kube_nodes.rb @@ -2,181 +2,176 @@ # frozen_string_literal: true module Fluent + class Kube_nodeInventory_Input < Input + Plugin.register_input("kubenodeinventory", self) - class Kube_nodeInventory_Input < Input - Plugin.register_input('kubenodeinventory', self) - - @@ContainerNodeInventoryTag = 'oms.containerinsights.ContainerNodeInventory' - @@MDMKubeNodeInventoryTag = 'mdm.kubenodeinventory' + @@ContainerNodeInventoryTag = 
"oms.containerinsights.ContainerNodeInventory" + @@MDMKubeNodeInventoryTag = "mdm.kubenodeinventory" - def initialize - super - require 'yaml' - require 'json' - - require_relative 'KubernetesApiClient' - require_relative 'ApplicationInsightsUtility' - require_relative 'oms_common' - require_relative 'omslog' + def initialize + super + require "yaml" + require "json" + require_relative "KubernetesApiClient" + require_relative "ApplicationInsightsUtility" + require_relative "oms_common" + require_relative "omslog" + end + + config_param :run_interval, :time, :default => "1m" + config_param :tag, :string, :default => "oms.containerinsights.KubeNodeInventory" + + def configure(conf) + super + end + + def start + if @run_interval + @finished = false + @condition = ConditionVariable.new + @mutex = Mutex.new + @thread = Thread.new(&method(:run_periodic)) + @@nodeTelemetryTimeTracker = DateTime.now.to_time.to_i end - - config_param :run_interval, :time, :default => '1m' - config_param :tag, :string, :default => "oms.containerinsights.KubeNodeInventory" - - def configure (conf) - super - end - - def start - if @run_interval - @finished = false - @condition = ConditionVariable.new - @mutex = Mutex.new - @thread = Thread.new(&method(:run_periodic)) - @@nodeTelemetryTimeTracker = DateTime.now.to_time.to_i - end - end - - def shutdown - if @run_interval - @mutex.synchronize { - @finished = true - @condition.signal - } - @thread.join - end - end - - def enumerate - currentTime = Time.now - emitTime = currentTime.to_f - batchTime = currentTime.utc.iso8601 - telemetrySent = false - $log.info("in_kube_nodes::enumerate : Getting nodes from Kube API @ #{Time.now.utc.iso8601}") - nodeInventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo('nodes').body) - $log.info("in_kube_nodes::enumerate : Done getting nodes from Kube API @ #{Time.now.utc.iso8601}") - begin - if(!nodeInventory.empty?) 
- eventStream = MultiEventStream.new - containerNodeInventoryEventStream = MultiEventStream.new - #get node inventory - nodeInventory['items'].each do |items| - record = {} - # Sending records for ContainerNodeInventory - containerNodeInventoryRecord = {} - containerNodeInventoryRecord['CollectionTime'] = batchTime #This is the time that is mapped to become TimeGenerated - containerNodeInventoryRecord['Computer'] = items['metadata']['name'] + end - record['CollectionTime'] = batchTime #This is the time that is mapped to become TimeGenerated - record['Computer'] = items['metadata']['name'] - record['ClusterName'] = KubernetesApiClient.getClusterName - record['ClusterId'] = KubernetesApiClient.getClusterId - record['CreationTimeStamp'] = items['metadata']['creationTimestamp'] - record['Labels'] = [items['metadata']['labels']] - record['Status'] = "" + def shutdown + if @run_interval + @mutex.synchronize { + @finished = true + @condition.signal + } + @thread.join + end + end - # Refer to https://kubernetes.io/docs/concepts/architecture/nodes/#condition for possible node conditions. - # We check the status of each condition e.g. {"type": "OutOfDisk","status": "False"} . Based on this we - # populate the KubeNodeInventory Status field. A possible value for this field could be "Ready OutofDisk" - # implying that the node is ready for hosting pods, however its out of disk. - - if items['status'].key?("conditions") && !items['status']['conditions'].empty? - allNodeConditions="" - items['status']['conditions'].each do |condition| - if condition['status'] == "True" - if !allNodeConditions.empty? - allNodeConditions = allNodeConditions + "," + condition['type'] - else - allNodeConditions = condition['type'] - end - end - #collect last transition to/from ready (no matter ready is true/false) - if condition['type'] == "Ready" && !condition['lastTransitionTime'].nil? - record['LastTransitionTimeReady'] = condition['lastTransitionTime'] - end - end - if !allNodeConditions.empty? 
- record['Status'] = allNodeConditions - end + def enumerate + currentTime = Time.now + emitTime = currentTime.to_f + batchTime = currentTime.utc.iso8601 + telemetrySent = false + $log.info("in_kube_nodes::enumerate : Getting nodes from Kube API @ #{Time.now.utc.iso8601}") + nodeInventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo("nodes").body) + $log.info("in_kube_nodes::enumerate : Done getting nodes from Kube API @ #{Time.now.utc.iso8601}") + begin + if (!nodeInventory.empty?) + eventStream = MultiEventStream.new + containerNodeInventoryEventStream = MultiEventStream.new + #get node inventory + nodeInventory["items"].each do |items| + record = {} + # Sending records for ContainerNodeInventory + containerNodeInventoryRecord = {} + containerNodeInventoryRecord["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated + containerNodeInventoryRecord["Computer"] = items["metadata"]["name"] - end + record["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated + record["Computer"] = items["metadata"]["name"] + record["ClusterName"] = KubernetesApiClient.getClusterName + record["ClusterId"] = KubernetesApiClient.getClusterId + record["CreationTimeStamp"] = items["metadata"]["creationTimestamp"] + record["Labels"] = [items["metadata"]["labels"]] + record["Status"] = "" - nodeInfo = items['status']['nodeInfo'] - record['KubeletVersion'] = nodeInfo['kubeletVersion'] - record['KubeProxyVersion'] = nodeInfo['kubeProxyVersion'] - containerNodeInventoryRecord['OperatingSystem'] = nodeInfo['osImage'] - dockerVersion = nodeInfo['containerRuntimeVersion'] - dockerVersion.slice! "docker://" - containerNodeInventoryRecord['DockerVersion'] = dockerVersion - # ContainerNodeInventory data for docker version and operating system. 
- containerNodeInventoryWrapper = { - "DataType"=>"CONTAINER_NODE_INVENTORY_BLOB", - "IPName"=>"ContainerInsights", - "DataItems"=>[containerNodeInventoryRecord.each{|k,v| containerNodeInventoryRecord[k]=v}] - } - containerNodeInventoryEventStream.add(emitTime, containerNodeInventoryWrapper) if containerNodeInventoryWrapper + # Refer to https://kubernetes.io/docs/concepts/architecture/nodes/#condition for possible node conditions. + # We check the status of each condition e.g. {"type": "OutOfDisk","status": "False"} . Based on this we + # populate the KubeNodeInventory Status field. A possible value for this field could be "Ready OutofDisk" + # implying that the node is ready for hosting pods, however its out of disk. - wrapper = { - "DataType"=>"KUBE_NODE_INVENTORY_BLOB", - "IPName"=>"ContainerInsights", - "DataItems"=>[record.each{|k,v| record[k]=v}] - } - eventStream.add(emitTime, wrapper) if wrapper - # Adding telemetry to send node telemetry every 5 minutes - timeDifference = (DateTime.now.to_time.to_i - @@nodeTelemetryTimeTracker).abs - timeDifferenceInMinutes = timeDifference/60 - if (timeDifferenceInMinutes >= 5) - properties = {} - properties["Computer"] = record["Computer"] - properties["KubeletVersion"] = record["KubeletVersion"] - capacityInfo = items['status']['capacity'] - ApplicationInsightsUtility.sendMetricTelemetry("NodeCoreCapacity", capacityInfo["cpu"] , properties) - ApplicationInsightsUtility.sendMetricTelemetry("NodeMemory", capacityInfo["memory"] , properties) - telemetrySent = true - end - end - router.emit_stream(@tag, eventStream) if eventStream - router.emit_stream(@@MDMKubeNodeInventoryTag, eventStream) if eventStream - router.emit_stream(@@ContainerNodeInventoryTag, containerNodeInventoryEventStream) if containerNodeInventoryEventStream - if telemetrySent == true - @@nodeTelemetryTimeTracker = DateTime.now.to_time.to_i + if items["status"].key?("conditions") && !items["status"]["conditions"].empty? 
+ allNodeConditions = "" + items["status"]["conditions"].each do |condition| + if condition["status"] == "True" + if !allNodeConditions.empty? + allNodeConditions = allNodeConditions + "," + condition["type"] + else + allNodeConditions = condition["type"] + end end - @@istestvar = ENV['ISTEST'] - if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp('true') == 0 && eventStream.count > 0) - $log.info("kubeNodeInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") + #collect last transition to/from ready (no matter ready is true/false) + if condition["type"] == "Ready" && !condition["lastTransitionTime"].nil? + record["LastTransitionTimeReady"] = condition["lastTransitionTime"] end - end - rescue => errorStr - $log.warn "Failed to retrieve node inventory: #{errorStr}" - $log.debug_backtrace(errorStr.backtrace) - ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) - end - end - - def run_periodic - @mutex.lock - done = @finished - until done - @condition.wait(@mutex, @run_interval) - done = @finished - @mutex.unlock - if !done - begin - $log.info("in_kube_nodes::run_periodic @ #{Time.now.utc.iso8601}") - enumerate - rescue => errorStr - $log.warn "in_kube_nodes::run_periodic: enumerate Failed to retrieve node inventory: #{errorStr}" - ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + if !allNodeConditions.empty? + record["Status"] = allNodeConditions + end end + + nodeInfo = items["status"]["nodeInfo"] + record["KubeletVersion"] = nodeInfo["kubeletVersion"] + record["KubeProxyVersion"] = nodeInfo["kubeProxyVersion"] + containerNodeInventoryRecord["OperatingSystem"] = nodeInfo["osImage"] + dockerVersion = nodeInfo["containerRuntimeVersion"] + dockerVersion.slice! "docker://" + containerNodeInventoryRecord["DockerVersion"] = dockerVersion + # ContainerNodeInventory data for docker version and operating system. 
+ containerNodeInventoryWrapper = { + "DataType" => "CONTAINER_NODE_INVENTORY_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [containerNodeInventoryRecord.each { |k, v| containerNodeInventoryRecord[k] = v }], + } + containerNodeInventoryEventStream.add(emitTime, containerNodeInventoryWrapper) if containerNodeInventoryWrapper + + wrapper = { + "DataType" => "KUBE_NODE_INVENTORY_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [record.each { |k, v| record[k] = v }], + } + eventStream.add(emitTime, wrapper) if wrapper + # Adding telemetry to send node telemetry every 5 minutes + timeDifference = (DateTime.now.to_time.to_i - @@nodeTelemetryTimeTracker).abs + timeDifferenceInMinutes = timeDifference / 60 + if (timeDifferenceInMinutes >= 5) + properties = {} + properties["Computer"] = record["Computer"] + properties["KubeletVersion"] = record["KubeletVersion"] + properties["OperatingSystem"] = nodeInfo["operatingSystem"] + properties["DockerVersion"] = dockerVersion + capacityInfo = items["status"]["capacity"] + ApplicationInsightsUtility.sendMetricTelemetry("NodeCoreCapacity", capacityInfo["cpu"], properties) + ApplicationInsightsUtility.sendMetricTelemetry("NodeMemory", capacityInfo["memory"], properties) + telemetrySent = true + end + end + router.emit_stream(@tag, eventStream) if eventStream + router.emit_stream(@@MDMKubeNodeInventoryTag, eventStream) if eventStream + router.emit_stream(@@ContainerNodeInventoryTag, containerNodeInventoryEventStream) if containerNodeInventoryEventStream + if telemetrySent == true + @@nodeTelemetryTimeTracker = DateTime.now.to_time.to_i + end + @@istestvar = ENV["ISTEST"] + if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0 && eventStream.count > 0) + $log.info("kubeNodeInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") end - @mutex.lock end + rescue => errorStr + $log.warn "Failed to retrieve node inventory: #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + end + + def run_periodic + @mutex.lock + done = @finished + until done + @condition.wait(@mutex, @run_interval) + done = @finished @mutex.unlock + if !done + begin + $log.info("in_kube_nodes::run_periodic @ #{Time.now.utc.iso8601}") + enumerate + rescue => errorStr + $log.warn "in_kube_nodes::run_periodic: enumerate Failed to retrieve node inventory: #{errorStr}" + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + end + @mutex.lock end - - end # Kube_Node_Input - - end # module - - \ No newline at end of file + @mutex.unlock + end + end # Kube_Node_Input +end # module diff --git a/source/code/plugin/in_kube_podinventory.rb b/source/code/plugin/in_kube_podinventory.rb index 3d026b05f..65573673c 100644 --- a/source/code/plugin/in_kube_podinventory.rb +++ b/source/code/plugin/in_kube_podinventory.rb @@ -2,29 +2,28 @@ # frozen_string_literal: true module Fluent - class Kube_PodInventory_Input < Input - Plugin.register_input('kubepodinventory', self) + Plugin.register_input("kubepodinventory", self) - @@MDMKubePodInventoryTag = 'mdm.kubepodinventory' + @@MDMKubePodInventoryTag = "mdm.kubepodinventory" + @@hostName = (OMS::Common.get_hostname) def initialize super - require 'yaml' - require 'json' - require 'set' - - require_relative 'KubernetesApiClient' - require_relative 'ApplicationInsightsUtility' - require_relative 'oms_common' - require_relative 'omslog' + require "yaml" + require "json" + require "set" + require_relative "KubernetesApiClient" + require_relative "ApplicationInsightsUtility" + require_relative "oms_common" + require_relative "omslog" end - config_param :run_interval, :time, :default 
=> '1m' + config_param :run_interval, :time, :default => "1m" config_param :tag, :string, :default => "oms.containerinsights.KubePodInventory" - def configure (conf) + def configure(conf) super end @@ -48,29 +47,126 @@ def shutdown end end - def enumerate(podList = nil) - if podList.nil? - $log.info("in_kube_podinventory::enumerate : Getting pods from Kube API @ #{Time.now.utc.iso8601}") - podInventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo('pods').body) - $log.info("in_kube_podinventory::enumerate : Done getting pods from Kube API @ #{Time.now.utc.iso8601}") + def enumerate(podList = nil) + if podList.nil? + $log.info("in_kube_podinventory::enumerate : Getting pods from Kube API @ #{Time.now.utc.iso8601}") + podInventory = JSON.parse(KubernetesApiClient.getKubeResourceInfo("pods").body) + $log.info("in_kube_podinventory::enumerate : Done getting pods from Kube API @ #{Time.now.utc.iso8601}") + else + podInventory = podList + end + begin + if (!podInventory.empty? && podInventory.key?("items") && !podInventory["items"].empty?) 
+ #get pod inventory & services + $log.info("in_kube_podinventory::enumerate : Getting services from Kube API @ #{Time.now.utc.iso8601}") + serviceList = JSON.parse(KubernetesApiClient.getKubeResourceInfo("services").body) + $log.info("in_kube_podinventory::enumerate : Done getting services from Kube API @ #{Time.now.utc.iso8601}") + parse_and_emit_records(podInventory, serviceList) else - podInventory = podList + $log.warn "Received empty podInventory" + end + rescue => errorStr + $log.warn "Failed in enumerate pod inventory: #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + end + + def populateWindowsContainerInventoryRecord(container, record, containerEnvVariableHash, batchTime) + begin + containerInventoryRecord = {} + containerName = container["name"] + containerInventoryRecord["InstanceID"] = record["ContainerID"] + containerInventoryRecord["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated + containerInventoryRecord["Computer"] = record["Computer"] + containerInventoryRecord["ContainerHostname"] = record["Computer"] + containerInventoryRecord["ElementName"] = containerName + image = container["image"] + repoInfo = image.split("/") + if !repoInfo.nil? + containerInventoryRecord["Repository"] = repoInfo[0] + if !repoInfo[1].nil? + imageInfo = repoInfo[1].split(":") + if !imageInfo.nil? + containerInventoryRecord["Image"] = imageInfo[0] + containerInventoryRecord["ImageTag"] = imageInfo[1] + end + end + end + imageIdInfo = container["imageID"] + imageIdSplitInfo = imageIdInfo.split("@") + if !imageIdSplitInfo.nil? 
+ containerInventoryRecord["ImageId"] = imageIdSplitInfo[1] + end + # Get container state + containerStatus = container["state"] + if containerStatus.keys[0] == "running" + containerInventoryRecord["State"] = "Running" + containerInventoryRecord["StartedTime"] = container["state"]["running"]["startedAt"] + elsif containerStatus.keys[0] == "terminated" + containerExitCode = container["state"]["terminated"]["exitCode"] + containerStartTime = container["state"]["terminated"]["startedAt"] + containerFinishTime = container["state"]["terminated"]["finishedAt"] + if containerExitCode < 0 + # Exit codes less than 0 are not supported by the engine + containerExitCode = 128 + end + if containerExitCode > 0 + containerInventoryRecord["State"] = "Failed" + else + containerInventoryRecord["State"] = "Stopped" + end + containerInventoryRecord["ExitCode"] = containerExitCode + containerInventoryRecord["StartedTime"] = containerStartTime + containerInventoryRecord["FinishedTime"] = containerFinishTime + elsif containerStatus.keys[0] == "waiting" + containerInventoryRecord["State"] = "Waiting" + end + if !containerEnvVariableHash.nil? && !containerEnvVariableHash.empty? + containerInventoryRecord["EnvironmentVar"] = containerEnvVariableHash[containerName] end - begin - if(!podInventory.empty? && podInventory.key?("items") && !podInventory['items'].empty?) 
- #get pod inventory & services - $log.info("in_kube_podinventory::enumerate : Getting services from Kube API @ #{Time.now.utc.iso8601}") - serviceList = JSON.parse(KubernetesApiClient.getKubeResourceInfo('services').body) - $log.info("in_kube_podinventory::enumerate : Done getting services from Kube API @ #{Time.now.utc.iso8601}") - parse_and_emit_records(podInventory, serviceList) - else - $log.warn "Received empty podInventory" - end - rescue => errorStr - $log.warn "Failed in enumerate pod inventory: #{errorStr}" - $log.debug_backtrace(errorStr.backtrace) - ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) - end + return containerInventoryRecord + rescue => errorStr + $log.warn "Failed in populateWindowsContainerInventoryRecord: #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + end + + def getContainerEnvironmentVariables(pod) + begin + podSpec = pod["spec"] + containerEnvHash = {} + if !podSpec.nil? && !podSpec["containers"].nil? + podSpec["containers"].each do |container| + envVarsArray = [] + containerEnvArray = container["env"] + # Parsing the environment variable array of hashes to a string value + # since that is format being sent by container inventory workflow in daemonset + # Keeping it in the same format because the workflow expects it in this format + # and the UX expects an array of string for environment variables + if !containerEnvArray.nil? && !containerEnvArray.empty? 
+ containerEnvArray.each do |envVarHash| + envName = envVarHash["name"] + envValue = envVarHash["value"] + envArrayElement = envName + "=" + envValue + envVarsArray.push(envArrayElement) + end + end + # Skip environment variable processing if it contains the flag AZMON_COLLECT_ENV=FALSE + envValueString = envVarsArray.to_s + if /AZMON_COLLECT_ENV=FALSE/i.match(envValueString) + envValueString = ["AZMON_COLLECT_ENV=FALSE"] + end + containerEnvHash[container["name"]] = envValueString + end + end + return containerEnvHash + rescue => errorStr + $log.warn "Failed in getContainerEnvironmentVariables: #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end end def parse_and_emit_records(podInventory, serviceList) @@ -80,100 +176,116 @@ def parse_and_emit_records(podInventory, serviceList) eventStream = MultiEventStream.new controllerSet = Set.new [] telemetryFlush = false + winContainerCount = 0 begin #begin block start - podInventory['items'].each do |items| #podInventory block start + # Getting windows nodes from kubeapi + winNodes = KubernetesApiClient.getWindowsNodesArray + + podInventory["items"].each do |items| #podInventory block start + sendWindowsContainerInventoryRecord = false + containerInventoryRecords = [] records = [] record = {} - record['CollectionTime'] = batchTime #This is the time that is mapped to become TimeGenerated - record['Name'] = items['metadata']['name'] - podNameSpace = items['metadata']['namespace'] - - if podNameSpace.eql?("kube-system") && !items['metadata'].key?("ownerReferences") + record["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated + record["Name"] = items["metadata"]["name"] + podNameSpace = items["metadata"]["namespace"] + + if podNameSpace.eql?("kube-system") && !items["metadata"].key?("ownerReferences") # The above case seems to be the only case where you have horizontal scaling of pods # but no controller, in which case 
cAdvisor picks up kubernetes.io/config.hash # instead of the actual poduid. Since this uid is not being surface into the UX # its ok to use this. # Use kubernetes.io/config.hash to be able to correlate with cadvisor data - podUid = items['metadata']['annotations']['kubernetes.io/config.hash'] + podUid = items["metadata"]["annotations"]["kubernetes.io/config.hash"] else - podUid = items['metadata']['uid'] + podUid = items["metadata"]["uid"] end - record['PodUid'] = podUid - record['PodLabel'] = [items['metadata']['labels']] - record['Namespace'] = podNameSpace - record['PodCreationTimeStamp'] = items['metadata']['creationTimestamp'] + record["PodUid"] = podUid + record["PodLabel"] = [items["metadata"]["labels"]] + record["Namespace"] = podNameSpace + record["PodCreationTimeStamp"] = items["metadata"]["creationTimestamp"] #for unscheduled (non-started) pods startTime does NOT exist - if !items['status']['startTime'].nil? - record['PodStartTime'] = items['status']['startTime'] + if !items["status"]["startTime"].nil? + record["PodStartTime"] = items["status"]["startTime"] else - record['PodStartTime'] = "" + record["PodStartTime"] = "" end #podStatus # the below is for accounting 'NodeLost' scenario, where-in the pod(s) in the lost node is still being reported as running podReadyCondition = true - if !items['status']['reason'].nil? && items['status']['reason'] == "NodeLost" && !items['status']['conditions'].nil? - items['status']['conditions'].each do |condition| - if condition['type'] == "Ready" && condition['status'] == "False" + if !items["status"]["reason"].nil? && items["status"]["reason"] == "NodeLost" && !items["status"]["conditions"].nil? 
+ items["status"]["conditions"].each do |condition| + if condition["type"] == "Ready" && condition["status"] == "False" podReadyCondition = false break end end end if podReadyCondition == false - record['PodStatus'] = "Unknown" + record["PodStatus"] = "Unknown" else - record['PodStatus'] = items['status']['phase'] + record["PodStatus"] = items["status"]["phase"] end #for unscheduled (non-started) pods podIP does NOT exist - if !items['status']['podIP'].nil? - record['PodIp'] =items['status']['podIP'] + if !items["status"]["podIP"].nil? + record["PodIp"] = items["status"]["podIP"] else - record['PodIp'] = "" + record["PodIp"] = "" end #for unscheduled (non-started) pods nodeName does NOT exist - if !items['spec']['nodeName'].nil? - record['Computer'] = items['spec']['nodeName'] + if !items["spec"]["nodeName"].nil? + record["Computer"] = items["spec"]["nodeName"] else - record['Computer'] = "" - end - record['ClusterId'] = KubernetesApiClient.getClusterId - record['ClusterName'] = KubernetesApiClient.getClusterName - record['ServiceName'] = getServiceNameFromLabels(items['metadata']['namespace'], items['metadata']['labels'], serviceList) - # Adding telemetry to send pod telemetry every 5 minutes - timeDifference = (DateTime.now.to_time.to_i - @@podTelemetryTimeTracker).abs - timeDifferenceInMinutes = timeDifference/60 - if (timeDifferenceInMinutes >= 5) - telemetryFlush = true - end - if !items['metadata']['ownerReferences'].nil? - record['ControllerKind'] = items['metadata']['ownerReferences'][0]['kind'] - record['ControllerName'] = items['metadata']['ownerReferences'][0]['name'] + record["Computer"] = "" + end + + # Setting this flag to true so that we can send ContainerInventory records for containers + # on windows nodes and parse environment variables for these containers + if winNodes.length > 0 + if (!record["Computer"].empty? && (winNodes.include? 
record["Computer"])) + sendWindowsContainerInventoryRecord = true + containerEnvVariableHash = getContainerEnvironmentVariables(items) + end + end + + record["ClusterId"] = KubernetesApiClient.getClusterId + record["ClusterName"] = KubernetesApiClient.getClusterName + record["ServiceName"] = getServiceNameFromLabels(items["metadata"]["namespace"], items["metadata"]["labels"], serviceList) + # Adding telemetry to send pod telemetry every 5 minutes + timeDifference = (DateTime.now.to_time.to_i - @@podTelemetryTimeTracker).abs + timeDifferenceInMinutes = timeDifference / 60 + if (timeDifferenceInMinutes >= 5) + telemetryFlush = true + end + if !items["metadata"]["ownerReferences"].nil? + record["ControllerKind"] = items["metadata"]["ownerReferences"][0]["kind"] + record["ControllerName"] = items["metadata"]["ownerReferences"][0]["name"] if telemetryFlush == true - controllerSet.add(record['ControllerKind'] + record['ControllerName']) + controllerSet.add(record["ControllerKind"] + record["ControllerName"]) end end podRestartCount = 0 - record['PodRestartCount'] = 0 - if items['status'].key?("containerStatuses") && !items['status']['containerStatuses'].empty? #container status block start - items['status']['containerStatuses'].each do |container| - containerRestartCount = 0 - #container Id is of the form - #docker://dfd9da983f1fd27432fb2c1fe3049c0a1d25b1c697b2dc1a530c986e58b16527 - if !container['containerID'].nil? - record['ContainerID'] = container['containerID'].split("//")[1] - else + record["PodRestartCount"] = 0 + if items["status"].key?("containerStatuses") && !items["status"]["containerStatuses"].empty? #container status block start + items["status"]["containerStatuses"].each do |container| + containerRestartCount = 0 + #container Id is of the form + #docker://dfd9da983f1fd27432fb2c1fe3049c0a1d25b1c697b2dc1a530c986e58b16527 + if !container["containerID"].nil? 
+ record["ContainerID"] = container["containerID"].split("//")[1] + else # for containers that have image issues (like invalid image/tag etc..) this will be empty. do not make it all 0 - record['ContainerID'] = "" + record["ContainerID"] = "" end - #keeping this as which is same as InstanceName in perf table - record['ContainerName'] = podUid + "/" +container['name'] - #Pod restart count is a sumtotal of restart counts of individual containers - #within the pod. The restart count of a container is maintained by kubernetes - #itself in the form of a container label. - containerRestartCount = container['restartCount'] - record['ContainerRestartCount'] = containerRestartCount - containerStatus = container['state'] - record['ContainerStatusReason'] = '' + #keeping this as which is same as InstanceName in perf table + record["ContainerName"] = podUid + "/" + container["name"] + #Pod restart count is a sumtotal of restart counts of individual containers + #within the pod. The restart count of a container is maintained by kubernetes + #itself in the form of a container label. 
+ containerRestartCount = container["restartCount"] + record["ContainerRestartCount"] = containerRestartCount + containerStatus = container["state"] + record["ContainerStatusReason"] = "" # state is of the following form , so just picking up the first key name # "state": { # "waiting": { @@ -183,55 +295,80 @@ def parse_and_emit_records(podInventory, serviceList) # }, # the below is for accounting 'NodeLost' scenario, where-in the containers in the lost node/pod(s) is still being reported as running if podReadyCondition == false - record['ContainerStatus'] = "Unknown" + record["ContainerStatus"] = "Unknown" else - record['ContainerStatus'] = containerStatus.keys[0] + record["ContainerStatus"] = containerStatus.keys[0] end #TODO : Remove ContainerCreationTimeStamp from here since we are sending it as a metric #Picking up both container and node start time from cAdvisor to be consistent if containerStatus.keys[0] == "running" - record['ContainerCreationTimeStamp'] = container['state']['running']['startedAt'] + record["ContainerCreationTimeStamp"] = container["state"]["running"]["startedAt"] else - if !containerStatus[containerStatus.keys[0]]['reason'].nil? && !containerStatus[containerStatus.keys[0]]['reason'].empty? - record['ContainerStatusReason'] = containerStatus[containerStatus.keys[0]]['reason'] + if !containerStatus[containerStatus.keys[0]]["reason"].nil? && !containerStatus[containerStatus.keys[0]]["reason"].empty? 
+ record["ContainerStatusReason"] = containerStatus[containerStatus.keys[0]]["reason"] end end - podRestartCount += containerRestartCount - records.push(record.dup) - end + podRestartCount += containerRestartCount + records.push(record.dup) + + #Generate ContainerInventory records for windows nodes so that we can get image and image tag in property panel + if sendWindowsContainerInventoryRecord == true + containerInventoryRecord = populateWindowsContainerInventoryRecord(container, record, containerEnvVariableHash, batchTime) + containerInventoryRecords.push(containerInventoryRecord) + end + end else # for unscheduled pods there are no status.containerStatuses, in this case we still want the pod - records.push(record) + records.push(record) end #container status block end records.each do |record| if !record.nil? - record['PodRestartCount'] = podRestartCount + record["PodRestartCount"] = podRestartCount wrapper = { - "DataType"=>"KUBE_POD_INVENTORY_BLOB", - "IPName"=>"ContainerInsights", - "DataItems"=>[record.each{|k,v| record[k]=v}] + "DataType" => "KUBE_POD_INVENTORY_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [record.each { |k, v| record[k] = v }], } eventStream.add(emitTime, wrapper) if wrapper - end - end + end + end + # Send container inventory records for containers on windows nodes + winContainerCount += containerInventoryRecords.length + containerInventoryRecords.each do |cirecord| + if !cirecord.nil? 
+ ciwrapper = { + "DataType" => "CONTAINER_INVENTORY_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [cirecord.each { |k, v| cirecord[k] = v }], + } + eventStream.add(emitTime, ciwrapper) if ciwrapper + end + end end #podInventory block end + router.emit_stream(@tag, eventStream) if eventStream router.emit_stream(@@MDMKubePodInventoryTag, eventStream) if eventStream if telemetryFlush == true - ApplicationInsightsUtility.sendHeartBeatEvent("KubePodInventory") - ApplicationInsightsUtility.sendMetricTelemetry("PodCount", podInventory['items'].length , {}) - ApplicationInsightsUtility.sendMetricTelemetry("ControllerCount", controllerSet.length , {}) + telemetryProperties = {} + telemetryProperties["Computer"] = @@hostName + ApplicationInsightsUtility.sendCustomEvent("KubePodInventoryHeartBeatEvent", telemetryProperties) + ApplicationInsightsUtility.sendMetricTelemetry("PodCount", podInventory["items"].length, {}) + ApplicationInsightsUtility.sendMetricTelemetry("ControllerCount", controllerSet.length, {}) + if winContainerCount > 0 + telemetryProperties["ClusterWideWindowsContainersCount"] = winContainerCount + ApplicationInsightsUtility.sendCustomEvent("WindowsContainerInventoryEvent", telemetryProperties) + end @@podTelemetryTimeTracker = DateTime.now.to_time.to_i end - @@istestvar = ENV['ISTEST'] - if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp('true') == 0 && eventStream.count > 0) + @@istestvar = ENV["ISTEST"] + if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0 && eventStream.count > 0) $log.info("kubePodInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") end - rescue => errorStr + rescue => errorStr $log.warn "Failed in parse_and_emit_record pod inventory: #{errorStr}" $log.debug_backtrace(errorStr.backtrace) ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) - end #begin block end - end + end #begin block end + end def run_periodic @mutex.lock @@ -257,37 +394,33 @@ def run_periodic def getServiceNameFromLabels(namespace, labels, serviceList) serviceName = "" begin - if !labels.nil? && !labels.empty? - if( !serviceList.nil? && !serviceList.empty? && serviceList.key?("items") && !serviceList['items'].empty?) - serviceList['items'].each do |item| + if !labels.nil? && !labels.empty? + if (!serviceList.nil? && !serviceList.empty? && serviceList.key?("items") && !serviceList["items"].empty?) + serviceList["items"].each do |item| found = 0 - if !item['spec'].nil? && !item['spec']['selector'].nil? && item['metadata']['namespace'] == namespace - selectorLabels = item['spec']['selector'] + if !item["spec"].nil? && !item["spec"]["selector"].nil? && item["metadata"]["namespace"] == namespace + selectorLabels = item["spec"]["selector"] if !selectorLabels.empty? 
- selectorLabels.each do |key,value| - if !(labels.select {|k,v| k==key && v==value}.length > 0) + selectorLabels.each do |key, value| + if !(labels.select { |k, v| k == key && v == value }.length > 0) break end found = found + 1 end - end + end if found == selectorLabels.length - return item['metadata']['name'] + return item["metadata"]["name"] end - end + end end - end + end end - rescue => errorStr + rescue => errorStr $log.warn "Failed to retrieve service name from labels: #{errorStr}" $log.debug_backtrace(errorStr.backtrace) ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) end return serviceName end - end # Kube_Pod_Input - end # module - - diff --git a/source/code/plugin/in_win_cadvisor_perf.rb b/source/code/plugin/in_win_cadvisor_perf.rb new file mode 100644 index 000000000..2e5f839e6 --- /dev/null +++ b/source/code/plugin/in_win_cadvisor_perf.rb @@ -0,0 +1,120 @@ +#!/usr/local/bin/ruby +# frozen_string_literal: true + +module Fluent + class Win_CAdvisor_Perf_Input < Input + Plugin.register_input("wincadvisorperf", self) + + @@winNodes = [] + + def initialize + super + require "yaml" + require "json" + + require_relative "CAdvisorMetricsAPIClient" + require_relative "KubernetesApiClient" + require_relative "oms_common" + require_relative "omslog" + end + + config_param :run_interval, :time, :default => "1m" + config_param :tag, :string, :default => "oms.api.wincadvisorperf" + config_param :mdmtag, :string, :default => "mdm.cadvisorperf" + + def configure(conf) + super + end + + def start + if @run_interval + @finished = false + @condition = ConditionVariable.new + @mutex = Mutex.new + @thread = Thread.new(&method(:run_periodic)) + @@winNodeQueryTimeTracker = DateTime.now.to_time.to_i + @@cleanupRoutineTimeTracker = DateTime.now.to_time.to_i + end + end + + def shutdown + if @run_interval + @mutex.synchronize { + @finished = true + @condition.signal + } + @thread.join + end + end + + def enumerate() + time = Time.now.to_f + begin + eventStream = 
MultiEventStream.new + timeDifference = (DateTime.now.to_time.to_i - @@winNodeQueryTimeTracker).abs + timeDifferenceInMinutes = timeDifference / 60 + + #Resetting this cache so that it is populated with the current set of containers with every call + CAdvisorMetricsAPIClient.resetWinContainerIdCache() + if (timeDifferenceInMinutes >= 5) + $log.info "in_win_cadvisor_perf: Getting windows nodes" + nodes = KubernetesApiClient.getWindowsNodes() + if !nodes.nil? + @@winNodes = KubernetesApiClient.getWindowsNodes() + end + $log.info "in_win_cadvisor_perf : Successuly got windows nodes after 5 minute interval" + @@winNodeQueryTimeTracker = DateTime.now.to_time.to_i + end + @@winNodes.each do |winNode| + metricData = CAdvisorMetricsAPIClient.getMetrics(winNode) + metricData.each do |record| + if !record.empty? + record["DataType"] = "LINUX_PERF_BLOB" + record["IPName"] = "LogManagement" + eventStream.add(time, record) if record + end + end + router.emit_stream(@tag, eventStream) if eventStream + router.emit_stream(@mdmtag, eventStream) if eventStream + + @@istestvar = ENV["ISTEST"] + if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0 && eventStream.count > 0) + $log.info("winCAdvisorPerfEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end + end + + # Cleanup routine to clear deleted containers from cache + cleanupTimeDifference = (DateTime.now.to_time.to_i - @@cleanupRoutineTimeTracker).abs + cleanupTimeDifferenceInMinutes = cleanupTimeDifference / 60 + if (cleanupTimeDifferenceInMinutes >= 5) + $log.info "in_win_cadvisor_perf : Cleanup routine kicking in to clear deleted containers from cache" + CAdvisorMetricsAPIClient.clearDeletedWinContainersFromCache() + @@cleanupRoutineTimeTracker = DateTime.now.to_time.to_i + end + rescue => errorStr + $log.warn "Failed to retrieve cadvisor metric data for windows nodes: #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + end + end + + def run_periodic + @mutex.lock + done = @finished + until done + @condition.wait(@mutex, @run_interval) + done = @finished + @mutex.unlock + if !done + begin + $log.info("in_win_cadvisor_perf::run_periodic @ #{Time.now.utc.iso8601}") + enumerate + rescue => errorStr + $log.warn "in_win_cadvisor_perf::run_periodic: enumerate Failed to retrieve cadvisor perf metrics for windows nodes: #{errorStr}" + end + end + @mutex.lock + end + @mutex.unlock + end + end # Win_CAdvisor_Perf_Input +end # module diff --git a/source/code/plugin/out_mdm.rb b/source/code/plugin/out_mdm.rb index 93b32ef50..351198afe 100644 --- a/source/code/plugin/out_mdm.rb +++ b/source/code/plugin/out_mdm.rb @@ -2,29 +2,27 @@ # frozen_string_literal: true module Fluent - class OutputMDM < BufferedOutput - config_param :retry_mdm_post_wait_minutes, :integer - Plugin.register_output('out_mdm', self) + Plugin.register_output("out_mdm", self) def initialize super - require 'net/http' - require 'net/https' - require 'uri' - require 'json' - require_relative 'KubernetesApiClient' - require_relative 'ApplicationInsightsUtility' + require "net/http" + require "net/https" + require "uri" + require "json" + require_relative 
"KubernetesApiClient" + require_relative "ApplicationInsightsUtility" - @@token_resource_url = 'https://monitoring.azure.com/' - @@grant_type = 'client_credentials' - @@azure_json_path = '/etc/kubernetes/host/azure.json' + @@token_resource_url = "https://monitoring.azure.com/" + @@grant_type = "client_credentials" + @@azure_json_path = "/etc/kubernetes/host/azure.json" @@post_request_url_template = "https://%{aks_region}.monitoring.azure.com%{aks_resource_id}/metrics" @@token_url_template = "https://login.microsoftonline.com/%{tenant_id}/oauth2/token" @@plugin_name = "AKSCustomMetricsMDM" - + @data_hash = {} @token_url = nil @http_client = nil @@ -50,12 +48,13 @@ def start @can_send_data_to_mdm = false return end - # Handle the case where the file read fails. Send Telemetry and exit the plugin? + # Handle the case where the file read fails. Send Telemetry and exit the plugin? @data_hash = JSON.parse(file) - @token_url = @@token_url_template % {tenant_id: @data_hash['tenantId']} + @token_url = @@token_url_template % {tenant_id: @data_hash["tenantId"]} @cached_access_token = get_access_token - aks_resource_id = ENV['AKS_RESOURCE_ID'] - aks_region = ENV['AKS_REGION'] + aks_resource_id = ENV["AKS_RESOURCE_ID"] + aks_region = ENV["AKS_REGION"] + if aks_resource_id.to_s.empty? @log.info "Environment Variable AKS_RESOURCE_ID is not set.. " @can_send_data_to_mdm = false @@ -77,7 +76,7 @@ def start # get the access token only if the time to expiry is less than 5 minutes def get_access_token - if @cached_access_token.to_s.empty? || (Time.now + 5*60 > @token_expiry_time) # token is valid for 60 minutes. Refresh token 5 minutes from expiration + if @cached_access_token.to_s.empty? || (Time.now + 5 * 60 > @token_expiry_time) # token is valid for 60 minutes. Refresh token 5 minutes from expiration @log.info "Refreshing access token for out_mdm plugin.." 
token_uri = URI.parse(@token_url) http_access_token = Net::HTTP.new(token_uri.host, token_uri.port) @@ -85,27 +84,27 @@ def get_access_token token_request = Net::HTTP::Post.new(token_uri.request_uri) token_request.set_form_data( { - 'grant_type' => @@grant_type, - 'client_id' => @data_hash['aadClientId'], - 'client_secret' => @data_hash['aadClientSecret'], - 'resource' => @@token_resource_url - } + "grant_type" => @@grant_type, + "client_id" => @data_hash["aadClientId"], + "client_secret" => @data_hash["aadClientSecret"], + "resource" => @@token_resource_url, + } ) - + token_response = http_access_token.request(token_request) - # Handle the case where the response is not 200 + # Handle the case where the response is not 200 parsed_json = JSON.parse(token_response.body) - @token_expiry_time = Time.now + 59*60 # set the expiry time to be ~one hour from current time - @cached_access_token = parsed_json['access_token'] + @token_expiry_time = Time.now + 59 * 60 # set the expiry time to be ~one hour from current time + @cached_access_token = parsed_json["access_token"] end @cached_access_token - end + end def write_status_file(success, message) - fn = '/var/opt/microsoft/omsagent/log/MDMIngestion.status' + fn = "/var/opt/microsoft/omsagent/log/MDMIngestion.status" status = '{ "operation": "MDMIngestion", "success": "%s", "message": "%s" }' % [success, message] begin - File.open(fn,'w') { |file| file.write(status) } + File.open(fn, "w") { |file| file.write(status) } rescue => e @log.debug "Error:'#{e}'" ApplicationInsightsUtility.sendExceptionTelemetry(e.backtrace) @@ -123,13 +122,13 @@ def format(tag, time, record) end end - # This method is called every flush interval. Send the buffer chunk to MDM. + # This method is called every flush interval. Send the buffer chunk to MDM. 
# 'chunk' is a buffer chunk that includes multiple formatted records def write(chunk) begin - if (!@first_post_attempt_made || (Time.now > @last_post_attempt_time + retry_mdm_post_wait_minutes*60)) && @can_send_data_to_mdm + if (!@first_post_attempt_made || (Time.now > @last_post_attempt_time + retry_mdm_post_wait_minutes * 60)) && @can_send_data_to_mdm post_body = [] - chunk.msgpack_each {|(tag, record)| + chunk.msgpack_each { |(tag, record)| post_body.push(record.to_json) } send_to_mdm post_body @@ -137,21 +136,22 @@ def write(chunk) if !@can_send_data_to_mdm @log.info "Cannot send data to MDM since all required conditions were not met" else - @log.info "Last Failed POST attempt to MDM was made #{((Time.now - @last_post_attempt_time)/60).round(1)} min ago. This is less than the current retry threshold of #{@retry_mdm_post_wait_minutes} min. NO-OP" + @log.info "Last Failed POST attempt to MDM was made #{((Time.now - @last_post_attempt_time) / 60).round(1)} min ago. This is less than the current retry threshold of #{@retry_mdm_post_wait_minutes} min. 
NO-OP" end end rescue Exception => e + ApplicationInsightsUtility.sendExceptionTelemetry(e.backtrace) @log.info "Exception when writing to MDM: #{e}" raise e end end - def send_to_mdm(post_body) + def send_to_mdm(post_body) begin access_token = get_access_token request = Net::HTTP::Post.new(@post_request_uri.request_uri) - request['Content-Type'] = "application/x-ndjson" - request['Authorization'] = "Bearer #{access_token}" + request["Content-Type"] = "application/x-ndjson" + request["Authorization"] = "Bearer #{access_token}" request.body = post_body.join("\n") response = @http_client.request(request) response.value # this throws for non 200 HTTP response code @@ -164,12 +164,11 @@ def send_to_mdm(post_body) @log.info "Response Code #{response.code} Updating @last_post_attempt_time" @last_post_attempt_time = Time.now @first_post_attempt_made = true - ApplicationInsightsUtility.sendExceptionTelemetry(e.backtrace) # Not raising exception, as that will cause retries to happen - elsif !response.code.empty? && response.code.start_with?('4') + elsif !response.code.empty? && response.code.start_with?("4") # Log 400 errors and continue @log.info "Non-retryable HTTPServerException when POSTing Metrics to MDM #{e} Response: #{response}" - else + else # raise if the response code is non-400 @log.info "HTTPServerException when POSTing Metrics to MDM #{e} Response: #{response}" raise e @@ -186,7 +185,8 @@ def send_to_mdm(post_body) raise e end end - private + + private class ChunkErrorHandler include Configurable @@ -218,20 +218,20 @@ def router=(r) end def write(chunk) - chunk.msgpack_each {|(tag, record)| + chunk.msgpack_each { |(tag, record)| @error_handlers[tag].emit(record) } end - - private + + private def create_error_handlers(router) nop_handler = NopErrorHandler.new Hash.new() { |hash, tag| etag = OMS::Common.create_error_tag tag hash[tag] = router.match?(etag) ? 
- ErrorHandler.new(router, etag) : - nop_handler + ErrorHandler.new(router, etag) : + nop_handler } end @@ -251,10 +251,6 @@ def emit(record) # NOP end end - end - end # class OutputMDM - end # module Fluent -