diff --git a/installer/conf/td-agent-bit.conf b/installer/conf/td-agent-bit.conf index ab79710c7..4e3de6c46 100644 --- a/installer/conf/td-agent-bit.conf +++ b/installer/conf/td-agent-bit.conf @@ -1,5 +1,6 @@ [SERVICE] - Flush 15 + #Default service flush interval is 15 seconds + ${SERVICE_FLUSH_INTERVAL} Log_Level info Parsers_File /etc/td-agent-bit/parsers.conf Log_File /var/opt/microsoft/docker-cimprov/log/fluent-bit.log @@ -12,6 +13,8 @@ DB.Sync Off Parser docker Mem_Buf_Limit 10m + ${TAIL_BUFFER_CHUNK_SIZE} + ${TAIL_BUFFER_MAX_SIZE} Rotate_Wait 20 Refresh_Interval 30 Path_Key filepath diff --git a/installer/datafiles/base_container.data b/installer/datafiles/base_container.data index fe1635335..62a6f6885 100644 --- a/installer/datafiles/base_container.data +++ b/installer/datafiles/base_container.data @@ -114,6 +114,7 @@ MAINTAINER: 'Microsoft Corporation' /opt/livenessprobe.sh; installer/scripts/livenessprobe.sh; 755; root; root /opt/tomlparser.rb; installer/scripts/tomlparser.rb; 755; root; root /opt/tomlparser-prom-customconfig.rb; installer/scripts/tomlparser-prom-customconfig.rb; 755; root; root +/opt/td-agent-bit-conf-customizer.rb; installer/scripts/td-agent-bit-conf-customizer.rb; 755; root; root %Links /opt/omi/lib/libcontainer.${{SHLIB_EXT}}; /opt/microsoft/docker-cimprov/lib/libcontainer.${{SHLIB_EXT}}; 644; root; root diff --git a/installer/scripts/td-agent-bit-conf-customizer.rb b/installer/scripts/td-agent-bit-conf-customizer.rb new file mode 100644 index 000000000..1e62e3cc2 --- /dev/null +++ b/installer/scripts/td-agent-bit-conf-customizer.rb @@ -0,0 +1,47 @@ +#!/usr/local/bin/ruby + +@td_agent_bit_conf_path = "/etc/opt/microsoft/docker-cimprov/td-agent-bit.conf" + +@default_service_interval = "15" + +def is_number?(value) + true if Integer(value) rescue false +end + +def substituteFluentBitPlaceHolders + begin + # Replace the fluentbit config file with custom values if present + puts "config::Starting to substitute the placeholders in 
td-agent-bit.conf file for log collection" + + interval = ENV["FBIT_SERVICE_FLUSH_INTERVAL"] + bufferChunkSize = ENV["FBIT_TAIL_BUFFER_CHUNK_SIZE"] + bufferMaxSize = ENV["FBIT_TAIL_BUFFER_MAX_SIZE"] + + serviceInterval = (!interval.nil? && is_number?(interval)) ? interval : @default_service_interval + serviceIntervalSetting = "Flush " + serviceInterval + + tailBufferChunkSize = (!bufferChunkSize.nil? && is_number?(bufferChunkSize)) ? bufferChunkSize : nil + + tailBufferMaxSize = (!bufferMaxSize.nil? && is_number?(bufferMaxSize)) ? bufferMaxSize : nil + + text = File.read(@td_agent_bit_conf_path) + new_contents = text.gsub("${SERVICE_FLUSH_INTERVAL}", serviceIntervalSetting) + if !tailBufferChunkSize.nil? + new_contents = new_contents.gsub("${TAIL_BUFFER_CHUNK_SIZE}", "Buffer_Chunk_Size " + tailBufferChunkSize + "m") + else + new_contents = new_contents.gsub("\n ${TAIL_BUFFER_CHUNK_SIZE}\n", "\n") + end + if !tailBufferMaxSize.nil? + new_contents = new_contents.gsub("${TAIL_BUFFER_MAX_SIZE}", "Buffer_Max_Size " + tailBufferMaxSize + "m") + else + new_contents = new_contents.gsub("\n ${TAIL_BUFFER_MAX_SIZE}\n", "\n") + end + + File.open(@td_agent_bit_conf_path, "w") { |file| file.puts new_contents } + puts "config::Successfully substituted the placeholders in td-agent-bit.conf file" + rescue => errorStr + puts "td-agent-bit-config-customizer: error while substituting values: #{errorStr}" + end +end + +substituteFluentBitPlaceHolders diff --git a/source/code/plugin/CAdvisorMetricsAPIClient.rb b/source/code/plugin/CAdvisorMetricsAPIClient.rb index ec38bcbb5..09499b4cf 100644 --- a/source/code/plugin/CAdvisorMetricsAPIClient.rb +++ b/source/code/plugin/CAdvisorMetricsAPIClient.rb @@ -22,23 +22,11 @@ class CAdvisorMetricsAPIClient @clusterLogTailPath = ENV["AZMON_LOG_TAIL_PATH"] @clusterAgentSchemaVersion = ENV["AZMON_AGENT_CFG_SCHEMA_VERSION"] - @rsPromInterval = ENV["TELEMETRY_RS_PROM_INTERVAL"] @dsPromInterval = ENV["TELEMETRY_DS_PROM_INTERVAL"] - - 
@rsPromFieldPassCount = ENV["TELEMETRY_RS_PROM_FIELDPASS_LENGTH"] @dsPromFieldPassCount = ENV["TELEMETRY_DS_PROM_FIELDPASS_LENGTH"] - - @rsPromFieldDropCount = ENV["TELEMETRY_RS_PROM_FIELDDROP_LENGTH"] @dsPromFieldDropCount = ENV["TELEMETRY_DS_PROM_FIELDDROP_LENGTH"] - - @rsPromK8sServiceCount = ENV["TELEMETRY_RS_PROM_K8S_SERVICES_LENGTH"] - - @rsPromUrlCount = ENV["TELEMETRY_RS_PROM_URLS_LENGTH"] @dsPromUrlCount = ENV["TELEMETRY_DS_PROM_URLS_LENGTH"] - @rsPromMonitorPods = ENV["TELEMETRY_RS_PROM_MONITOR_PODS"] - - @LogPath = "/var/opt/microsoft/docker-cimprov/log/kubernetes_perf_log.txt" @Log = Logger.new(@LogPath, 2, 10 * 1048576) #keep last 2 files, max log file size = 10M # @@rxBytesLast = nil @@ -118,17 +106,21 @@ def getCAdvisorUri(winNode) def getMetrics(winNode = nil) metricDataItems = [] begin + cAdvisorStats = getSummaryStatsFromCAdvisor(winNode) + if !cAdvisorStats.nil? + metricInfo = JSON.parse(cAdvisorStats.body) + end if !winNode.nil? hostName = winNode["Hostname"] operatingSystem = "Windows" else - hostName = (OMS::Common.get_hostname) + if !metricInfo.nil? && !metricInfo["node"].nil? && !metricInfo["node"]["nodeName"].nil? + hostName = metricInfo["node"]["nodeName"] + else + hostName = (OMS::Common.get_hostname) + end operatingSystem = "Linux" end - cAdvisorStats = getSummaryStatsFromCAdvisor(winNode) - if !cAdvisorStats.nil? - metricInfo = JSON.parse(cAdvisorStats.body) - end if !metricInfo.nil? 
metricDataItems.concat(getContainerMemoryMetricItems(metricInfo, hostName, "workingSetBytes", "memoryWorkingSetBytes")) metricDataItems.concat(getContainerStartTimeMetricItems(metricInfo, hostName, "restartTimeEpoch")) @@ -228,18 +220,12 @@ def getContainerCpuMetricItems(metricJSON, hostName, cpuMetricNameToCollect, met telemetryProps["clusterLogTailPath"] = @clusterLogTailPath telemetryProps["clusterAgentSchemaVersion"] = @clusterAgentSchemaVersion end - #telemetry about prometheus metric collections settings + #telemetry about prometheus metric collections settings for daemonset if (File.file?(@promConfigMountPath)) - telemetryProps["rsPromInt"] = @rsPromInterval telemetryProps["dsPromInt"] = @dsPromInterval - telemetryProps["rsPromFPC"] = @rsPromFieldPassCount telemetryProps["dsPromFPC"] = @dsPromFieldPassCount - telemetryProps["rsPromFDC"] = @rsPromFieldDropCount telemetryProps["dsPromFDC"] = @dsPromFieldDropCount - telemetryProps["rsPromServ"] = @rsPromK8sServiceCount - telemetryProps["rsPromUrl"] = @rsPromUrlCount telemetryProps["dsPromUrl"] = @dsPromUrlCount - telemetryProps["rsPromMonPods"] = @rsPromMonitorPods end ApplicationInsightsUtility.sendMetricTelemetry(metricNametoReturn, metricValue, telemetryProps) end diff --git a/source/code/plugin/KubernetesApiClient.rb b/source/code/plugin/KubernetesApiClient.rb index 4cbf8bb40..61cbaea00 100644 --- a/source/code/plugin/KubernetesApiClient.rb +++ b/source/code/plugin/KubernetesApiClient.rb @@ -355,6 +355,8 @@ def getContainerResourceRequestsAndLimits(metricJSON, metricCategory, metricName metricProps = {} metricProps["Timestamp"] = metricTime metricProps["Host"] = nodeName + # Adding this so that it is not set by base omsagent since it was not set earlier and being set by base omsagent + metricProps["Computer"] = nodeName metricProps["ObjectName"] = "K8SContainer" metricProps["InstanceName"] = clusterId + "/" + podUid + "/" + containerName @@ -378,6 +380,8 @@ def 
getContainerResourceRequestsAndLimits(metricJSON, metricCategory, metricName metricProps = {} metricProps["Timestamp"] = metricTime metricProps["Host"] = nodeName + # Adding this so that it is not set by base omsagent since it was not set earlier and being set by base omsagent + metricProps["Computer"] = nodeName metricProps["ObjectName"] = "K8SContainer" metricProps["InstanceName"] = clusterId + "/" + podUid + "/" + containerName @@ -420,6 +424,8 @@ def parseNodeLimits(metricJSON, metricCategory, metricNameToCollect, metricNamet metricProps = {} metricProps["Timestamp"] = metricTime metricProps["Host"] = node["metadata"]["name"] + # Adding this so that it is not set by base omsagent since it was not set earlier and being set by base omsagent + metricProps["Computer"] = node["metadata"]["name"] metricProps["ObjectName"] = "K8SNode" metricProps["InstanceName"] = clusterId + "/" + node["metadata"]["name"] metricProps["Collections"] = [] diff --git a/source/code/plugin/in_kube_nodes.rb b/source/code/plugin/in_kube_nodes.rb index 0310fa419..24ab51d4c 100644 --- a/source/code/plugin/in_kube_nodes.rb +++ b/source/code/plugin/in_kube_nodes.rb @@ -7,6 +7,14 @@ class Kube_nodeInventory_Input < Input @@ContainerNodeInventoryTag = "oms.containerinsights.ContainerNodeInventory" @@MDMKubeNodeInventoryTag = "mdm.kubenodeinventory" + @@promConfigMountPath = "/etc/config/settings/prometheus-data-collection-settings" + + @@rsPromInterval = ENV["TELEMETRY_RS_PROM_INTERVAL"] + @@rsPromFieldPassCount = ENV["TELEMETRY_RS_PROM_FIELDPASS_LENGTH"] + @@rsPromFieldDropCount = ENV["TELEMETRY_RS_PROM_FIELDDROP_LENGTH"] + @@rsPromK8sServiceCount = ENV["TELEMETRY_RS_PROM_K8S_SERVICES_LENGTH"] + @@rsPromUrlCount = ENV["TELEMETRY_RS_PROM_URLS_LENGTH"] + @@rsPromMonitorPods = ENV["TELEMETRY_RS_PROM_MONITOR_PODS"] def initialize super @@ -124,15 +132,26 @@ def enumerate # Adding telemetry to send node telemetry every 5 minutes timeDifference = (DateTime.now.to_time.to_i - 
@@nodeTelemetryTimeTracker).abs timeDifferenceInMinutes = timeDifference / 60 - if (timeDifferenceInMinutes >= 5) + if (timeDifferenceInMinutes >= 10) properties = {} properties["Computer"] = record["Computer"] properties["KubeletVersion"] = record["KubeletVersion"] properties["OperatingSystem"] = nodeInfo["operatingSystem"] properties["DockerVersion"] = dockerVersion + capacityInfo = items["status"]["capacity"] - ApplicationInsightsUtility.sendMetricTelemetry("NodeCoreCapacity", capacityInfo["cpu"], properties) ApplicationInsightsUtility.sendMetricTelemetry("NodeMemory", capacityInfo["memory"], properties) + + #telemetry about prometheus metric collections settings for replicaset + if (File.file?(@@promConfigMountPath)) + properties["rsPromInt"] = @@rsPromInterval + properties["rsPromFPC"] = @@rsPromFieldPassCount + properties["rsPromFDC"] = @@rsPromFieldDropCount + properties["rsPromServ"] = @@rsPromK8sServiceCount + properties["rsPromUrl"] = @@rsPromUrlCount + properties["rsPromMonPods"] = @@rsPromMonitorPods + end + ApplicationInsightsUtility.sendMetricTelemetry("NodeCoreCapacity", capacityInfo["cpu"], properties) telemetrySent = true end end diff --git a/source/code/plugin/in_kube_podinventory.rb b/source/code/plugin/in_kube_podinventory.rb index 9991c13e3..f41ce9095 100644 --- a/source/code/plugin/in_kube_podinventory.rb +++ b/source/code/plugin/in_kube_podinventory.rb @@ -182,6 +182,7 @@ def parse_and_emit_records(podInventory, serviceList) batchTime = currentTime.utc.iso8601 eventStream = MultiEventStream.new controllerSet = Set.new [] + controllerData = {} telemetryFlush = false winContainerCount = 0 begin #begin block start @@ -277,6 +278,13 @@ def parse_and_emit_records(podInventory, serviceList) record["ControllerName"] = items["metadata"]["ownerReferences"][0]["name"] if telemetryFlush == true controllerSet.add(record["ControllerKind"] + record["ControllerName"]) +#Adding controller kind to telemetry to give information about customer workload +if 
(controllerData[record["ControllerKind"]].nil?) + controllerData[record["ControllerKind"]] = 1 + else + controllerValue = controllerData[record["ControllerKind"]] + controllerData[record["ControllerKind"]] += 1 + end end end podRestartCount = 0 @@ -329,7 +337,7 @@ def parse_and_emit_records(podInventory, serviceList) end # Record the last state of the container. This may have information on why a container was killed. - begin + begin if !container["lastState"].nil? && container["lastState"].keys.length == 1 lastStateName = container["lastState"].keys[0] lastStateObject = container["lastState"][lastStateName] @@ -338,7 +346,7 @@ def parse_and_emit_records(podInventory, serviceList) end if lastStateObject.key?("reason") && lastStateObject.key?("startedAt") && lastStateObject.key?("finishedAt") - newRecord = Hash.new + newRecord = Hash.new newRecord["lastState"] = lastStateName # get the name of the last state (ex: terminated) newRecord["reason"] = lastStateObject["reason"] # (ex: OOMKilled) newRecord["startedAt"] = lastStateObject["startedAt"] # (ex: 2019-07-02T14:58:51Z) @@ -403,7 +411,8 @@ def parse_and_emit_records(podInventory, serviceList) telemetryProperties["Computer"] = @@hostName ApplicationInsightsUtility.sendCustomEvent("KubePodInventoryHeartBeatEvent", telemetryProperties) ApplicationInsightsUtility.sendMetricTelemetry("PodCount", podInventory["items"].length, {}) - ApplicationInsightsUtility.sendMetricTelemetry("ControllerCount", controllerSet.length, {}) + telemetryProperties["ControllerData"] = controllerData.to_json + ApplicationInsightsUtility.sendMetricTelemetry("ControllerCount", controllerSet.length, telemetryProperties) if winContainerCount > 0 telemetryProperties["ClusterWideWindowsContainersCount"] = winContainerCount ApplicationInsightsUtility.sendCustomEvent("WindowsContainerInventoryEvent", telemetryProperties)