Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion installer/conf/td-agent-bit.conf
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
[SERVICE]
Flush 15
#Default service flush interval is 15 seconds
${SERVICE_FLUSH_INTERVAL}
Log_Level info
Parsers_File /etc/td-agent-bit/parsers.conf
Log_File /var/opt/microsoft/docker-cimprov/log/fluent-bit.log
Expand All @@ -12,6 +13,8 @@
DB.Sync Off
Parser docker
Mem_Buf_Limit 10m
${TAIL_BUFFER_CHUNK_SIZE}
${TAIL_BUFFER_MAX_SIZE}
Rotate_Wait 20
Refresh_Interval 30
Path_Key filepath
Expand Down
1 change: 1 addition & 0 deletions installer/datafiles/base_container.data
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ MAINTAINER: 'Microsoft Corporation'
/opt/livenessprobe.sh; installer/scripts/livenessprobe.sh; 755; root; root
/opt/tomlparser.rb; installer/scripts/tomlparser.rb; 755; root; root
/opt/tomlparser-prom-customconfig.rb; installer/scripts/tomlparser-prom-customconfig.rb; 755; root; root
/opt/td-agent-bit-conf-customizer.rb; installer/scripts/td-agent-bit-conf-customizer.rb; 755; root; root

%Links
/opt/omi/lib/libcontainer.${{SHLIB_EXT}}; /opt/microsoft/docker-cimprov/lib/libcontainer.${{SHLIB_EXT}}; 644; root; root
Expand Down
47 changes: 47 additions & 0 deletions installer/scripts/td-agent-bit-conf-customizer.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#!/usr/local/bin/ruby

# Path of the td-agent-bit (Fluent Bit) configuration file that this script
# reads and then overwrites with the placeholders substituted.
@td_agent_bit_conf_path = "/etc/opt/microsoft/docker-cimprov/td-agent-bit.conf"

# Flush interval (kept as a string because it is concatenated into the config
# text) used when FBIT_SERVICE_FLUSH_INTERVAL is unset or not an integer.
@default_service_interval = "15"

# Reports whether +value+ is a plain, non-negative decimal integer.
#
# The accepted values are pasted verbatim into the Fluent Bit configuration,
# so only strings made up entirely of ASCII digits qualify. Kernel#Integer is
# deliberately not used: it also accepts radix prefixes ("0x1A"), underscores
# ("1_000"), signs ("-5") and surrounding whitespace (" 12 "), none of which
# yield a usable setting once written into the config file as-is.
#
# @param value [Object] candidate value, typically a String read from ENV
# @return [Boolean] true when +value.to_s+ consists solely of one or more digits
def is_number?(value)
  !(/\A[0-9]+\z/ =~ value.to_s).nil?
end

# Rewrites the td-agent-bit configuration file at @td_agent_bit_conf_path in
# place, replacing its placeholders with values taken from the environment.
#
# * ${SERVICE_FLUSH_INTERVAL} becomes "Flush <n>", where <n> is
#   FBIT_SERVICE_FLUSH_INTERVAL when it passes is_number?, otherwise
#   @default_service_interval.
# * ${TAIL_BUFFER_CHUNK_SIZE} becomes "Buffer_Chunk_Size <n>m" when
#   FBIT_TAIL_BUFFER_CHUNK_SIZE passes is_number?.
# * ${TAIL_BUFFER_MAX_SIZE} becomes "Buffer_Max_Size <n>m" when
#   FBIT_TAIL_BUFFER_MAX_SIZE passes is_number?.
#
# When a tail buffer variable is unset or invalid, the line holding its
# placeholder is removed entirely, whatever its indentation or line ending,
# so that no raw "${...}" token is left behind in the written file.
#
# Any error (unreadable or unwritable file, etc.) is reported on stdout and
# swallowed; the method never raises.
def substituteFluentBitPlaceHolders
  begin
    # Replace the fluentbit config file with custom values if present
    puts "config::Starting to substitute the placeholders in td-agent-bit.conf file for log collection"

    interval = ENV["FBIT_SERVICE_FLUSH_INTERVAL"]
    serviceInterval = (!interval.nil? && is_number?(interval)) ? interval : @default_service_interval

    new_contents = File.read(@td_agent_bit_conf_path)
    new_contents = new_contents.gsub("${SERVICE_FLUSH_INTERVAL}") { "Flush #{serviceInterval}" }

    # [placeholder token, Fluent Bit setting name, raw value from the environment]
    tailSettings = [
      ["${TAIL_BUFFER_CHUNK_SIZE}", "Buffer_Chunk_Size", ENV["FBIT_TAIL_BUFFER_CHUNK_SIZE"]],
      ["${TAIL_BUFFER_MAX_SIZE}", "Buffer_Max_Size", ENV["FBIT_TAIL_BUFFER_MAX_SIZE"]],
    ]

    tailSettings.each do |placeholder, settingName, rawValue|
      if !rawValue.nil? && is_number?(rawValue)
        new_contents = new_contents.gsub(placeholder) { "#{settingName} #{rawValue}m" }
      else
        # Drop the whole placeholder line. Matching on any leading/trailing
        # blanks and either line ending (or end of file) avoids depending on
        # the exact whitespace used in the config template.
        placeholderLine = /^[ \t]*#{Regexp.escape(placeholder)}[ \t]*(?:\r?\n|\z)/
        new_contents = new_contents.gsub(placeholderLine, "")
      end
    end

    File.open(@td_agent_bit_conf_path, "w") { |file| file.puts new_contents }
    puts "config::Successfully substituted the placeholders in td-agent-bit.conf file"
  rescue => errorStr
    # Best effort: report the failure and leave the decision to the caller's logs.
    puts "td-agent-bit-config-customizer: error while substituting values: #{errorStr}"
  end
end

# Script entry point: perform the placeholder substitution when this file is run.
substituteFluentBitPlaceHolders
34 changes: 10 additions & 24 deletions source/code/plugin/CAdvisorMetricsAPIClient.rb
Original file line number Diff line number Diff line change
Expand Up @@ -22,23 +22,11 @@ class CAdvisorMetricsAPIClient
@clusterLogTailPath = ENV["AZMON_LOG_TAIL_PATH"]
@clusterAgentSchemaVersion = ENV["AZMON_AGENT_CFG_SCHEMA_VERSION"]

@rsPromInterval = ENV["TELEMETRY_RS_PROM_INTERVAL"]
@dsPromInterval = ENV["TELEMETRY_DS_PROM_INTERVAL"]

@rsPromFieldPassCount = ENV["TELEMETRY_RS_PROM_FIELDPASS_LENGTH"]
@dsPromFieldPassCount = ENV["TELEMETRY_DS_PROM_FIELDPASS_LENGTH"]

@rsPromFieldDropCount = ENV["TELEMETRY_RS_PROM_FIELDDROP_LENGTH"]
@dsPromFieldDropCount = ENV["TELEMETRY_DS_PROM_FIELDDROP_LENGTH"]

@rsPromK8sServiceCount = ENV["TELEMETRY_RS_PROM_K8S_SERVICES_LENGTH"]

@rsPromUrlCount = ENV["TELEMETRY_RS_PROM_URLS_LENGTH"]
@dsPromUrlCount = ENV["TELEMETRY_DS_PROM_URLS_LENGTH"]

@rsPromMonitorPods = ENV["TELEMETRY_RS_PROM_MONITOR_PODS"]


@LogPath = "/var/opt/microsoft/docker-cimprov/log/kubernetes_perf_log.txt"
@Log = Logger.new(@LogPath, 2, 10 * 1048576) #keep last 2 files, max log file size = 10M
# @@rxBytesLast = nil
Expand Down Expand Up @@ -118,17 +106,21 @@ def getCAdvisorUri(winNode)
def getMetrics(winNode = nil)
metricDataItems = []
begin
cAdvisorStats = getSummaryStatsFromCAdvisor(winNode)
if !cAdvisorStats.nil?
metricInfo = JSON.parse(cAdvisorStats.body)
end
if !winNode.nil?
hostName = winNode["Hostname"]
operatingSystem = "Windows"
else
hostName = (OMS::Common.get_hostname)
if !metricInfo.nil? && !metricInfo["node"].nil? && !metricInfo["node"]["nodeName"].nil?
hostName = metricInfo["node"]["nodeName"]
else
hostName = (OMS::Common.get_hostname)
end
operatingSystem = "Linux"
end
cAdvisorStats = getSummaryStatsFromCAdvisor(winNode)
if !cAdvisorStats.nil?
metricInfo = JSON.parse(cAdvisorStats.body)
end
if !metricInfo.nil?
metricDataItems.concat(getContainerMemoryMetricItems(metricInfo, hostName, "workingSetBytes", "memoryWorkingSetBytes"))
metricDataItems.concat(getContainerStartTimeMetricItems(metricInfo, hostName, "restartTimeEpoch"))
Expand Down Expand Up @@ -228,18 +220,12 @@ def getContainerCpuMetricItems(metricJSON, hostName, cpuMetricNameToCollect, met
telemetryProps["clusterLogTailPath"] = @clusterLogTailPath
telemetryProps["clusterAgentSchemaVersion"] = @clusterAgentSchemaVersion
end
#telemetry about prometheus metric collections settings
#telemetry about prometheus metric collections settings for daemonset
if (File.file?(@promConfigMountPath))
telemetryProps["rsPromInt"] = @rsPromInterval
telemetryProps["dsPromInt"] = @dsPromInterval
telemetryProps["rsPromFPC"] = @rsPromFieldPassCount
telemetryProps["dsPromFPC"] = @dsPromFieldPassCount
telemetryProps["rsPromFDC"] = @rsPromFieldDropCount
telemetryProps["dsPromFDC"] = @dsPromFieldDropCount
telemetryProps["rsPromServ"] = @rsPromK8sServiceCount
telemetryProps["rsPromUrl"] = @rsPromUrlCount
telemetryProps["dsPromUrl"] = @dsPromUrlCount
telemetryProps["rsPromMonPods"] = @rsPromMonitorPods
end
ApplicationInsightsUtility.sendMetricTelemetry(metricNametoReturn, metricValue, telemetryProps)
end
Expand Down
6 changes: 6 additions & 0 deletions source/code/plugin/KubernetesApiClient.rb
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,8 @@ def getContainerResourceRequestsAndLimits(metricJSON, metricCategory, metricName
metricProps = {}
metricProps["Timestamp"] = metricTime
metricProps["Host"] = nodeName
# Set Computer explicitly so that the base omsagent does not fill it in (it was previously left unset here, so the base omsagent was setting it)
metricProps["Computer"] = nodeName
metricProps["ObjectName"] = "K8SContainer"
metricProps["InstanceName"] = clusterId + "/" + podUid + "/" + containerName

Expand All @@ -378,6 +380,8 @@ def getContainerResourceRequestsAndLimits(metricJSON, metricCategory, metricName
metricProps = {}
metricProps["Timestamp"] = metricTime
metricProps["Host"] = nodeName
# Set Computer explicitly so that the base omsagent does not fill it in (it was previously left unset here, so the base omsagent was setting it)
metricProps["Computer"] = nodeName
metricProps["ObjectName"] = "K8SContainer"
metricProps["InstanceName"] = clusterId + "/" + podUid + "/" + containerName

Expand Down Expand Up @@ -420,6 +424,8 @@ def parseNodeLimits(metricJSON, metricCategory, metricNameToCollect, metricNamet
metricProps = {}
metricProps["Timestamp"] = metricTime
metricProps["Host"] = node["metadata"]["name"]
# Set Computer explicitly so that the base omsagent does not fill it in (it was previously left unset here, so the base omsagent was setting it)
metricProps["Computer"] = node["metadata"]["name"]
metricProps["ObjectName"] = "K8SNode"
metricProps["InstanceName"] = clusterId + "/" + node["metadata"]["name"]
metricProps["Collections"] = []
Expand Down
23 changes: 21 additions & 2 deletions source/code/plugin/in_kube_nodes.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,14 @@ class Kube_nodeInventory_Input < Input

@@ContainerNodeInventoryTag = "oms.containerinsights.ContainerNodeInventory"
@@MDMKubeNodeInventoryTag = "mdm.kubenodeinventory"
@@promConfigMountPath = "/etc/config/settings/prometheus-data-collection-settings"

@@rsPromInterval = ENV["TELEMETRY_RS_PROM_INTERVAL"]
@@rsPromFieldPassCount = ENV["TELEMETRY_RS_PROM_FIELDPASS_LENGTH"]
@@rsPromFieldDropCount = ENV["TELEMETRY_RS_PROM_FIELDDROP_LENGTH"]
@@rsPromK8sServiceCount = ENV["TELEMETRY_RS_PROM_K8S_SERVICES_LENGTH"]
@@rsPromUrlCount = ENV["TELEMETRY_RS_PROM_URLS_LENGTH"]
@@rsPromMonitorPods = ENV["TELEMETRY_RS_PROM_MONITOR_PODS"]

def initialize
super
Expand Down Expand Up @@ -124,15 +132,26 @@ def enumerate
# Send node telemetry at most once every 10 minutes
timeDifference = (DateTime.now.to_time.to_i - @@nodeTelemetryTimeTracker).abs
timeDifferenceInMinutes = timeDifference / 60
if (timeDifferenceInMinutes >= 5)
if (timeDifferenceInMinutes >= 10)
properties = {}
properties["Computer"] = record["Computer"]
properties["KubeletVersion"] = record["KubeletVersion"]
properties["OperatingSystem"] = nodeInfo["operatingSystem"]
properties["DockerVersion"] = dockerVersion

capacityInfo = items["status"]["capacity"]
ApplicationInsightsUtility.sendMetricTelemetry("NodeCoreCapacity", capacityInfo["cpu"], properties)
ApplicationInsightsUtility.sendMetricTelemetry("NodeMemory", capacityInfo["memory"], properties)

#telemetry about prometheus metric collections settings for replicaset
if (File.file?(@@promConfigMountPath))
properties["rsPromInt"] = @@rsPromInterval
properties["rsPromFPC"] = @@rsPromFieldPassCount
properties["rsPromFDC"] = @@rsPromFieldDropCount
properties["rsPromServ"] = @@rsPromK8sServiceCount
properties["rsPromUrl"] = @@rsPromUrlCount
properties["rsPromMonPods"] = @@rsPromMonitorPods
end
ApplicationInsightsUtility.sendMetricTelemetry("NodeCoreCapacity", capacityInfo["cpu"], properties)
telemetrySent = true
end
end
Expand Down
15 changes: 12 additions & 3 deletions source/code/plugin/in_kube_podinventory.rb
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,7 @@ def parse_and_emit_records(podInventory, serviceList)
batchTime = currentTime.utc.iso8601
eventStream = MultiEventStream.new
controllerSet = Set.new []
controllerData = {}
telemetryFlush = false
winContainerCount = 0
begin #begin block start
Expand Down Expand Up @@ -277,6 +278,13 @@ def parse_and_emit_records(podInventory, serviceList)
record["ControllerName"] = items["metadata"]["ownerReferences"][0]["name"]
if telemetryFlush == true
controllerSet.add(record["ControllerKind"] + record["ControllerName"])
# Count pods per controller kind so telemetry carries information about the customer workload
if (controllerData[record["ControllerKind"]].nil?)
controllerData[record["ControllerKind"]] = 1
else
controllerValue = controllerData[record["ControllerKind"]]
controllerData[record["ControllerKind"]] += 1
end
end
end
podRestartCount = 0
Expand Down Expand Up @@ -329,7 +337,7 @@ def parse_and_emit_records(podInventory, serviceList)
end

# Record the last state of the container. This may have information on why a container was killed.
begin
begin
if !container["lastState"].nil? && container["lastState"].keys.length == 1
lastStateName = container["lastState"].keys[0]
lastStateObject = container["lastState"][lastStateName]
Expand All @@ -338,7 +346,7 @@ def parse_and_emit_records(podInventory, serviceList)
end

if lastStateObject.key?("reason") && lastStateObject.key?("startedAt") && lastStateObject.key?("finishedAt")
newRecord = Hash.new
newRecord = Hash.new
newRecord["lastState"] = lastStateName # get the name of the last state (ex: terminated)
newRecord["reason"] = lastStateObject["reason"] # (ex: OOMKilled)
newRecord["startedAt"] = lastStateObject["startedAt"] # (ex: 2019-07-02T14:58:51Z)
Expand Down Expand Up @@ -403,7 +411,8 @@ def parse_and_emit_records(podInventory, serviceList)
telemetryProperties["Computer"] = @@hostName
ApplicationInsightsUtility.sendCustomEvent("KubePodInventoryHeartBeatEvent", telemetryProperties)
ApplicationInsightsUtility.sendMetricTelemetry("PodCount", podInventory["items"].length, {})
ApplicationInsightsUtility.sendMetricTelemetry("ControllerCount", controllerSet.length, {})
telemetryProperties["ControllerData"] = controllerData.to_json
ApplicationInsightsUtility.sendMetricTelemetry("ControllerCount", controllerSet.length, telemetryProperties)
if winContainerCount > 0
telemetryProperties["ClusterWideWindowsContainersCount"] = winContainerCount
ApplicationInsightsUtility.sendCustomEvent("WindowsContainerInventoryEvent", telemetryProperties)
Expand Down