diff --git a/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb b/build/common/installer/scripts/tomlparser-mdm-metrics-config.rb similarity index 75% rename from build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb rename to build/common/installer/scripts/tomlparser-mdm-metrics-config.rb index dcf179bf2..b6a4419cf 100644 --- a/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb +++ b/build/common/installer/scripts/tomlparser-mdm-metrics-config.rb @@ -1,9 +1,16 @@ #!/usr/local/bin/ruby # frozen_string_literal: true -require_relative "tomlrb" -require_relative "ConfigParseErrorLogger" +#this should be require relative in Linux and require in windows, since it is a gem install on windows +@os_type = ENV["OS_TYPE"] +if !@os_type.nil? && !@os_type.empty? && @os_type.strip.casecmp("windows") == 0 + require "tomlrb" +else + require_relative "tomlrb" +end + require_relative "/etc/fluent/plugin/constants" +require_relative "ConfigParseErrorLogger" @configMapMountPath = "/etc/config/settings/alertable-metrics-configuration-settings" @configVersion = "" @@ -124,6 +131,10 @@ def populateSettingValuesFromConfigMap(parsedConfig) end end +def get_command_windows(env_variable_name, env_variable_value) + return "[System.Environment]::SetEnvironmentVariable(\"#{env_variable_name}\", \"#{env_variable_value}\", \"Process\")" + "\n" + "[System.Environment]::SetEnvironmentVariable(\"#{env_variable_name}\", \"#{env_variable_value}\", \"Machine\")" + "\n" +end + @configSchemaVersion = ENV["AZMON_AGENT_CFG_SCHEMA_VERSION"] puts "****************Start MDM Metrics Config Processing********************" if !@configSchemaVersion.nil? && !@configSchemaVersion.empty? && @configSchemaVersion.strip.casecmp("v1") == 0 #note v1 is the only supported schema version, so hardcoding it @@ -137,19 +148,37 @@ def populateSettingValuesFromConfigMap(parsedConfig) end end -# Write the settings to file, so that they can be set as environment variables -file = File.open("config_mdm_metrics_env_var", "w") +if !@os_type.nil? && !@os_type.empty? && @os_type.strip.casecmp("windows") == 0 + # Write the settings to file, so that they can be set as environment variables in windows container + file = File.open("setmdmenv.ps1", "w") -if !file.nil? - file.write("export AZMON_ALERT_CONTAINER_CPU_THRESHOLD=#{@percentageCpuUsageThreshold}\n") - file.write("export AZMON_ALERT_CONTAINER_MEMORY_RSS_THRESHOLD=#{@percentageMemoryRssThreshold}\n") - file.write("export AZMON_ALERT_CONTAINER_MEMORY_WORKING_SET_THRESHOLD=\"#{@percentageMemoryWorkingSetThreshold}\"\n") - file.write("export AZMON_ALERT_PV_USAGE_THRESHOLD=#{@percentagePVUsageThreshold}\n") - file.write("export AZMON_ALERT_JOB_COMPLETION_TIME_THRESHOLD=#{@jobCompletionThresholdMinutes}\n") - # Close file after writing all MDM setting environment variables - file.close - puts "****************End MDM Metrics Config Processing********************" + if !file.nil? + commands = get_command_windows("AZMON_ALERT_CONTAINER_CPU_THRESHOLD", @percentageCpuUsageThreshold) + file.write(commands) + commands = get_command_windows("AZMON_ALERT_CONTAINER_MEMORY_WORKING_SET_THRESHOLD", @percentageMemoryWorkingSetThreshold) + file.write(commands) + # Close file after writing all environment variables + file.close + puts "****************End MDM Metrics Config Processing********************" + else + puts "Exception while opening file for writing MDM metric config environment variables" + puts "****************End MDM Metrics Config Processing********************" + end else - puts "Exception while opening file for writing MDM metric config environment variables" - puts "****************End MDM Metrics Config Processing********************" + # Write the settings to file, so that they can be set as environment variables in linux container + file = File.open("config_mdm_metrics_env_var", "w") + + if !file.nil? + file.write("export AZMON_ALERT_CONTAINER_CPU_THRESHOLD=#{@percentageCpuUsageThreshold}\n") + file.write("export AZMON_ALERT_CONTAINER_MEMORY_RSS_THRESHOLD=#{@percentageMemoryRssThreshold}\n") + file.write("export AZMON_ALERT_CONTAINER_MEMORY_WORKING_SET_THRESHOLD=\"#{@percentageMemoryWorkingSetThreshold}\"\n") + file.write("export AZMON_ALERT_PV_USAGE_THRESHOLD=#{@percentagePVUsageThreshold}\n") + file.write("export AZMON_ALERT_JOB_COMPLETION_TIME_THRESHOLD=#{@jobCompletionThresholdMinutes}\n") + # Close file after writing all MDM setting environment variables + file.close + puts "****************End MDM Metrics Config Processing********************" + else + puts "Exception while opening file for writing MDM metric config environment variables" + puts "****************End MDM Metrics Config Processing********************" + end end diff --git a/build/linux/installer/datafiles/base_container.data b/build/linux/installer/datafiles/base_container.data index b9f889dba..de8ccbba0 100644 --- a/build/linux/installer/datafiles/base_container.data +++ b/build/linux/installer/datafiles/base_container.data @@ -42,7 +42,7 @@ MAINTAINER: 'Microsoft Corporation' /opt/microsoft/docker-cimprov/bin/TelegrafTCPErrorTelemetry.sh; build/linux/installer/scripts/TelegrafTCPErrorTelemetry.sh; 755; root; root /opt/livenessprobe.sh; build/linux/installer/scripts/livenessprobe.sh; 755; root; root /opt/tomlparser-prom-customconfig.rb; build/common/installer/scripts/tomlparser-prom-customconfig.rb; 755; root; root -/opt/tomlparser-mdm-metrics-config.rb; build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb; 755; root; root +/opt/tomlparser-mdm-metrics-config.rb; build/common/installer/scripts/tomlparser-mdm-metrics-config.rb; 755; root; root /opt/tomlparser-metric-collection-config.rb; build/linux/installer/scripts/tomlparser-metric-collection-config.rb; 755; root; root diff --git a/build/windows/Makefile.ps1 b/build/windows/Makefile.ps1 index 2d49330ea..737abc92a 100644 --- a/build/windows/Makefile.ps1 +++ b/build/windows/Makefile.ps1 @@ -180,4 +180,14 @@ $exclude = @('*.cs','*.csproj') Copy-Item -Path $installerdir -Destination $publishdir -Recurse -Force -Exclude $exclude Write-Host("successfully copied installer files conf and scripts from :" + $installerdir + " to :" + $publishdir + " ") -ForegroundColor Green +$rubyplugindir = Join-Path -Path $rootdir -ChildPath "source\plugins\ruby" +Write-Host("copying ruby source files from :" + $rubyplugindir + " to :" + $publishdir + " ...") +Copy-Item -Path $rubyplugindir -Destination $publishdir -Recurse -Force +Write-Host("successfully copied ruby source files from :" + $rubyplugindir + " to :" + $publishdir + " ") -ForegroundColor Green + +$utilsplugindir = Join-Path -Path $rootdir -ChildPath "source\plugins\utils" +Write-Host("copying ruby util files from :" + $utilsplugindir + " to :" + $publishdir + " ...") +Copy-Item -Path $utilsplugindir -Destination $publishdir -Recurse -Force +Write-Host("successfully copied ruby util files from :" + $utilsplugindir + " to :" + $publishdir + " ") -ForegroundColor Green + Set-Location $currentdir \ No newline at end of file diff --git a/build/windows/installer/conf/fluent.conf b/build/windows/installer/conf/fluent.conf index d5eb475ca..741e5ce19 100644 --- a/build/windows/installer/conf/fluent.conf +++ b/build/windows/installer/conf/fluent.conf @@ -4,6 +4,13 @@ @log_level info + + @type cadvisor_perf + tag oms.api.cadvisorperf + run_interval 60 + @log_level debug + + @type tail path "#{ENV['AZMON_LOG_TAIL_PATH']}" @@ -29,6 +36,14 @@ @include fluent-docker-parser.conf +#custom_metrics_mdm filter plugin + + @type cadvisor2mdm + metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes + log_path /etc/omsagentwindows/filter_cadvisor2mdm.log + @log_level info + + @type grep @@ -46,6 +61,23 @@ + + @type mdm + @log_level debug + + @type file + path /etc/omsagentwindows/out_mdm_cdvisorperf.buffer + overflow_action drop_oldest_chunk + chunk_limit_size 4m + flush_interval 20s + retry_max_times 10 + retry_wait 5s + retry_max_interval 5m + flush_thread_count 5 + + retry_mdm_post_wait_minutes 30 + + @type forward send_timeout 60s diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index ab6bbea9c..4290e1d59 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -790,6 +790,9 @@ spec: fieldPath: status.hostIP - name: SIDECAR_SCRAPING_ENABLED value: "true" + # Update this with the user assigned msi client id for omsagent + - name: USER_ASSIGNED_IDENTITY_CLIENT_ID + value: "" # Add this only for clouds that require cert bootstrapping - name: REQUIRES_CERT_BOOTSTRAP value: "true" @@ -812,6 +815,9 @@ spec: # - mountPath: C:\ca # name: ca-certs # readOnly: true + - mountPath: C:\etc\kubernetes\host + name: azure-json-path + readOnly: true livenessProbe: exec: command: @@ -843,6 +849,9 @@ spec: - name: docker-windows-kuberenetes-container-logs hostPath: path: C:\var + - name: azure-json-path + hostPath: + path: C:\k # Need to mount this only for airgapped clouds - Commenting this since it wont exist in non airgapped clouds #- name: ca-certs # hostPath: diff --git a/kubernetes/windows/Dockerfile b/kubernetes/windows/Dockerfile index fefd089a8..5a5298d0b 100644 --- a/kubernetes/windows/Dockerfile +++ b/kubernetes/windows/Dockerfile @@ -20,7 +20,7 @@ RUN refreshenv \ && gem install cool.io -v 1.5.4 --platform ruby \ && gem install oj -v 3.3.10 \ && gem install json -v 2.2.0 \ -&& gem install fluentd -v 1.10.2 \ +&& gem install fluentd -v 1.12.2 \ && gem install win32-service -v 1.0.1 \ && gem install win32-ipc -v 0.7.0 \ && gem install win32-event -v 0.6.3 \ @@ -69,6 +69,10 @@ COPY ./omsagentwindows/installer/conf/telegraf.conf /etc/telegraf/ # copy keepcert alive ruby scripts COPY ./omsagentwindows/installer/scripts/rubyKeepCertificateAlive/*.rb /etc/fluent/plugin/ +#Copy fluentd ruby plugins +COPY ./omsagentwindows/ruby/ /etc/fluent/plugin/ +COPY ./omsagentwindows/utils/*.rb /etc/fluent/plugin/ + ENV AGENT_VERSION ${IMAGE_TAG} ENV OS_TYPE "windows" ENV APPLICATIONINSIGHTS_AUTH "NzAwZGM5OGYtYTdhZC00NThkLWI5NWMtMjA3ZjM3NmM3YmRi" diff --git a/kubernetes/windows/main.ps1 b/kubernetes/windows/main.ps1 index baf95fca4..bc053b0d6 100644 --- a/kubernetes/windows/main.ps1 +++ b/kubernetes/windows/main.ps1 @@ -134,9 +134,6 @@ function Set-EnvironmentVariables { [System.Environment]::SetEnvironmentVariable("APPLICATIONINSIGHTS_ENDPOINT", $appInsightsEndpoint, "machine") Write-Host "Successfully set environment variable APPLICATIONINSIGHTS_ENDPOINT - $($appInsightsEndpoint) for target 'machine'..." } - else { - Write-Host "Failed to set environment variable APPLICATIONINSIGHTS_ENDPOINT for target 'machine' since it is either null or empty" - } # Check if the instrumentation key needs to be fetched from a storage account (as in airgapped clouds) $aiKeyURl = [System.Environment]::GetEnvironmentVariable('APPLICATIONINSIGHTS_AUTH_URL') @@ -180,14 +177,71 @@ function Set-EnvironmentVariables { [System.Environment]::SetEnvironmentVariable("TELEMETRY_APPLICATIONINSIGHTS_KEY", $aiKeyDecoded, "Process") [System.Environment]::SetEnvironmentVariable("TELEMETRY_APPLICATIONINSIGHTS_KEY", $aiKeyDecoded, "Machine") + # Setting environment variables required by the fluentd plugins + $aksResourceId = [System.Environment]::GetEnvironmentVariable("AKS_RESOURCE_ID", "process") + if (![string]::IsNullOrEmpty($aksResourceId)) { + [System.Environment]::SetEnvironmentVariable("AKS_RESOURCE_ID", $aksResourceId, "machine") + Write-Host "Successfully set environment variable AKS_RESOURCE_ID - $($aksResourceId) for target 'machine'..." + } + else { + Write-Host "Failed to set environment variable AKS_RESOURCE_ID for target 'machine' since it is either null or empty" + } + + $aksRegion = [System.Environment]::GetEnvironmentVariable("AKS_REGION", "process") + if (![string]::IsNullOrEmpty($aksRegion)) { + [System.Environment]::SetEnvironmentVariable("AKS_REGION", $aksRegion, "machine") + Write-Host "Successfully set environment variable AKS_REGION - $($aksRegion) for target 'machine'..." + } + else { + Write-Host "Failed to set environment variable AKS_REGION for target 'machine' since it is either null or empty" + } + + $controllerType = [System.Environment]::GetEnvironmentVariable("CONTROLLER_TYPE", "process") + if (![string]::IsNullOrEmpty($controllerType)) { + [System.Environment]::SetEnvironmentVariable("CONTROLLER_TYPE", $controllerType, "machine") + Write-Host "Successfully set environment variable CONTROLLER_TYPE - $($controllerType) for target 'machine'..." + } + else { + Write-Host "Failed to set environment variable CONTROLLER_TYPE for target 'machine' since it is either null or empty" + } + + $osType = [System.Environment]::GetEnvironmentVariable("OS_TYPE", "process") + if (![string]::IsNullOrEmpty($osType)) { + [System.Environment]::SetEnvironmentVariable("OS_TYPE", $osType, "machine") + Write-Host "Successfully set environment variable OS_TYPE - $($osType) for target 'machine'..." + } + else { + Write-Host "Failed to set environment variable OS_TYPE for target 'machine' since it is either null or empty" + } + + $userMsi = [System.Environment]::GetEnvironmentVariable("USER_ASSIGNED_IDENTITY_CLIENT_ID", "process") + if (![string]::IsNullOrEmpty($userMsi)) { + [System.Environment]::SetEnvironmentVariable("USER_ASSIGNED_IDENTITY_CLIENT_ID", $userMsi, "machine") + Write-Host "Successfully set environment variable USER_ASSIGNED_IDENTITY_CLIENT_ID - $($userMsi) for target 'machine'..." + } + + $hostName = [System.Environment]::GetEnvironmentVariable("HOSTNAME", "process") + if (![string]::IsNullOrEmpty($hostName)) { + [System.Environment]::SetEnvironmentVariable("HOSTNAME", $hostName, "machine") + Write-Host "Successfully set environment variable HOSTNAME - $($hostName) for target 'machine'..." + } + else { + Write-Host "Failed to set environment variable HOSTNAME for target 'machine' since it is either null or empty" + } + # run config parser ruby /opt/omsagentwindows/scripts/ruby/tomlparser.rb .\setenv.ps1 + + # run mdm config parser + ruby /opt/omsagentwindows/scripts/ruby/tomlparser-mdm-metrics-config.rb + .\setmdmenv.ps1 } function Get-ContainerRuntime { # default container runtime and make default as containerd when containerd becomes default in AKS $containerRuntime = "docker" + $cAdvisorIsSecure = "false" $response = "" $NODE_IP = "" try { @@ -227,6 +281,7 @@ function Get-ContainerRuntime { if (![string]::IsNullOrEmpty($response) -and $response.StatusCode -eq 200) { Write-Host "API call to https://$($NODE_IP):10250/pods succeeded" $isPodsAPISuccess = $true + $cAdvisorIsSecure = "true" } } catch { @@ -234,6 +289,11 @@ function Get-ContainerRuntime { } } + # set IS_SECURE_CADVISOR_PORT env for debug and telemetry purpose + Write-Host "Setting IS_SECURE_CADVISOR_PORT environment variable as $($cAdvisorIsSecure)" + [System.Environment]::SetEnvironmentVariable("IS_SECURE_CADVISOR_PORT", $cAdvisorIsSecure, "Process") + [System.Environment]::SetEnvironmentVariable("IS_SECURE_CADVISOR_PORT", $cAdvisorIsSecure, "Machine") + if ($isPodsAPISuccess) { if (![string]::IsNullOrEmpty($response.Content)) { $podList = $response.Content | ConvertFrom-Json diff --git a/source/plugins/ruby/ApplicationInsightsUtility.rb b/source/plugins/ruby/ApplicationInsightsUtility.rb index 6ae567337..74d08c1e6 100644 --- a/source/plugins/ruby/ApplicationInsightsUtility.rb +++ b/source/plugins/ruby/ApplicationInsightsUtility.rb @@ -21,10 +21,15 @@ class ApplicationInsightsUtility @@EnvApplicationInsightsEndpoint = "APPLICATIONINSIGHTS_ENDPOINT" @@EnvControllerType = "CONTROLLER_TYPE" @@EnvContainerRuntime = "CONTAINER_RUNTIME" - + @@isWindows = false + @@hostName = (OMS::Common.get_hostname) + @@os_type = ENV["OS_TYPE"] + if !@@os_type.nil? && !@@os_type.empty? && @@os_type.strip.casecmp("windows") == 0 + @@isWindows = true + @@hostName = ENV["HOSTNAME"] + end @@CustomProperties = {} @@Tc = nil - @@hostName = (OMS::Common.get_hostname) @@proxy = (ProxyUtils.getProxyConfiguration) def initialize @@ -133,16 +138,23 @@ def initializeUtility() end def getContainerRuntimeInfo() - containerRuntime = ENV[@@EnvContainerRuntime] - if !containerRuntime.nil? && !containerRuntime.empty? - # DockerVersion field holds either containerRuntime for non-docker or Dockerversion if its docker - @@CustomProperties["DockerVersion"] = containerRuntime - if containerRuntime.casecmp("docker") == 0 - dockerInfo = DockerApiClient.dockerInfo - if (!dockerInfo.nil? && !dockerInfo.empty?) - @@CustomProperties["DockerVersion"] = dockerInfo["Version"] + begin + # Not doing this for windows since docker is being deprecated soon and we dont want to bring in the socket dependency. + if !@@isWindows.nil? && @@isWindows == false + containerRuntime = ENV[@@EnvContainerRuntime] + if !containerRuntime.nil? && !containerRuntime.empty? + # DockerVersion field holds either containerRuntime for non-docker or Dockerversion if its docker + @@CustomProperties["DockerVersion"] = containerRuntime + if containerRuntime.casecmp("docker") == 0 + dockerInfo = DockerApiClient.dockerInfo + if (!dockerInfo.nil? && !dockerInfo.empty?) + @@CustomProperties["DockerVersion"] = dockerInfo["Version"] + end + end end end + rescue => errorStr + $log.warn("Exception in AppInsightsUtility: getContainerRuntimeInfo - error: #{errorStr}") end end @@ -262,7 +274,7 @@ def sendMetricTelemetry(metricName, metricValue, properties) end def getWorkspaceId() - begin + begin workspaceId = ENV["WSID"] if workspaceId.nil? || workspaceId.empty? $log.warn("Exception in AppInsightsUtility: getWorkspaceId - WorkspaceID either nil or empty") @@ -274,7 +286,7 @@ def getWorkspaceId() end def getWorkspaceCloud() - begin + begin workspaceDomain = ENV["DOMAIN"] workspaceCloud = "AzureCloud" if workspaceDomain.casecmp("opinsights.azure.com") == 0 diff --git a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb index f02459aef..10720752d 100644 --- a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb +++ b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb @@ -38,7 +38,12 @@ class CAdvisorMetricsAPIClient @npmIntegrationBasic = ENV["TELEMETRY_NPM_INTEGRATION_METRICS_BASIC"] @npmIntegrationAdvanced = ENV["TELEMETRY_NPM_INTEGRATION_METRICS_ADVANCED"] - @LogPath = "/var/opt/microsoft/docker-cimprov/log/kubernetes_perf_log.txt" + @os_type = ENV["OS_TYPE"] + if !@os_type.nil? && !@os_type.empty? && @os_type.strip.casecmp("windows") == 0 + @LogPath = "/etc/omsagentwindows/kubernetes_perf_log.txt" + else + @LogPath = "/var/opt/microsoft/docker-cimprov/log/kubernetes_perf_log.txt" + end @Log = Logger.new(@LogPath, 2, 10 * 1048576) #keep last 2 files, max log file size = 10M # @@rxBytesLast = nil # @@rxBytesTimeLast = nil @@ -142,39 +147,54 @@ def getMetrics(winNode: nil, metricTime: Time.now.utc.iso8601) operatingSystem = "Linux" end if !metricInfo.nil? - metricDataItems.concat(getContainerMemoryMetricItems(metricInfo, hostName, "workingSetBytes", Constants::MEMORY_WORKING_SET_BYTES, metricTime, operatingSystem)) - metricDataItems.concat(getContainerStartTimeMetricItems(metricInfo, hostName, "restartTimeEpoch", metricTime)) - - if operatingSystem == "Linux" - metricDataItems.concat(getContainerCpuMetricItems(metricInfo, hostName, "usageNanoCores", Constants::CPU_USAGE_NANO_CORES, metricTime)) - metricDataItems.concat(getContainerMemoryMetricItems(metricInfo, hostName, "rssBytes", Constants::MEMORY_RSS_BYTES, metricTime, operatingSystem)) - metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "memory", "rssBytes", Constants::MEMORY_RSS_BYTES, metricTime)) - elsif operatingSystem == "Windows" + # Checking if we are in windows daemonset and sending only few metrics that are needed for MDM + if !@os_type.nil? && !@os_type.empty? && @os_type.strip.casecmp("windows") == 0 + # Container metrics + metricDataItems.concat(getContainerMemoryMetricItems(metricInfo, hostName, "workingSetBytes", Constants::MEMORY_WORKING_SET_BYTES, metricTime, operatingSystem)) containerCpuUsageNanoSecondsRate = getContainerCpuMetricItemRate(metricInfo, hostName, "usageCoreNanoSeconds", Constants::CPU_USAGE_NANO_CORES, metricTime) if containerCpuUsageNanoSecondsRate && !containerCpuUsageNanoSecondsRate.empty? && !containerCpuUsageNanoSecondsRate.nil? metricDataItems.concat(containerCpuUsageNanoSecondsRate) end - end + # Node metrics + cpuUsageNanoSecondsRate = getNodeMetricItemRate(metricInfo, hostName, "cpu", "usageCoreNanoSeconds", Constants::CPU_USAGE_NANO_CORES, operatingSystem, metricTime) + if cpuUsageNanoSecondsRate && !cpuUsageNanoSecondsRate.empty? && !cpuUsageNanoSecondsRate.nil? + metricDataItems.push(cpuUsageNanoSecondsRate) + end + metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "memory", "workingSetBytes", Constants::MEMORY_WORKING_SET_BYTES, metricTime)) + else + metricDataItems.concat(getContainerMemoryMetricItems(metricInfo, hostName, "workingSetBytes", Constants::MEMORY_WORKING_SET_BYTES, metricTime, operatingSystem)) + metricDataItems.concat(getContainerStartTimeMetricItems(metricInfo, hostName, "restartTimeEpoch", metricTime)) + + if operatingSystem == "Linux" + metricDataItems.concat(getContainerCpuMetricItems(metricInfo, hostName, "usageNanoCores", Constants::CPU_USAGE_NANO_CORES, metricTime)) + metricDataItems.concat(getContainerMemoryMetricItems(metricInfo, hostName, "rssBytes", Constants::MEMORY_RSS_BYTES, metricTime, operatingSystem)) + metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "memory", "rssBytes", Constants::MEMORY_RSS_BYTES, metricTime)) + elsif operatingSystem == "Windows" + containerCpuUsageNanoSecondsRate = getContainerCpuMetricItemRate(metricInfo, hostName, "usageCoreNanoSeconds", Constants::CPU_USAGE_NANO_CORES, metricTime) + if containerCpuUsageNanoSecondsRate && !containerCpuUsageNanoSecondsRate.empty? && !containerCpuUsageNanoSecondsRate.nil? + metricDataItems.concat(containerCpuUsageNanoSecondsRate) + end + end - cpuUsageNanoSecondsRate = getNodeMetricItemRate(metricInfo, hostName, "cpu", "usageCoreNanoSeconds", Constants::CPU_USAGE_NANO_CORES, operatingSystem, metricTime) - if cpuUsageNanoSecondsRate && !cpuUsageNanoSecondsRate.empty? && !cpuUsageNanoSecondsRate.nil? - metricDataItems.push(cpuUsageNanoSecondsRate) + cpuUsageNanoSecondsRate = getNodeMetricItemRate(metricInfo, hostName, "cpu", "usageCoreNanoSeconds", Constants::CPU_USAGE_NANO_CORES, operatingSystem, metricTime) + if cpuUsageNanoSecondsRate && !cpuUsageNanoSecondsRate.empty? && !cpuUsageNanoSecondsRate.nil? + metricDataItems.push(cpuUsageNanoSecondsRate) + end + metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "memory", "workingSetBytes", Constants::MEMORY_WORKING_SET_BYTES, metricTime)) + + metricDataItems.push(getNodeLastRebootTimeMetric(metricInfo, hostName, "restartTimeEpoch", metricTime)) + # Disabling networkRxRate and networkTxRate since we dont use it as of now. + #metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "network", "rxBytes", "networkRxBytes")) + #metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "network", "txBytes", "networkTxBytes")) + # networkRxRate = getNodeMetricItemRate(metricInfo, hostName, "network", "rxBytes", "networkRxBytesPerSec") + # if networkRxRate && !networkRxRate.empty? && !networkRxRate.nil? + # metricDataItems.push(networkRxRate) + # end + # networkTxRate = getNodeMetricItemRate(metricInfo, hostName, "network", "txBytes", "networkTxBytesPerSec") + # if networkTxRate && !networkTxRate.empty? && !networkTxRate.nil? + # metricDataItems.push(networkTxRate) + # end end - metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "memory", "workingSetBytes", Constants::MEMORY_WORKING_SET_BYTES, metricTime)) - - metricDataItems.push(getNodeLastRebootTimeMetric(metricInfo, hostName, "restartTimeEpoch", metricTime)) - - # Disabling networkRxRate and networkTxRate since we dont use it as of now. - #metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "network", "rxBytes", "networkRxBytes")) - #metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "network", "txBytes", "networkTxBytes")) - # networkRxRate = getNodeMetricItemRate(metricInfo, hostName, "network", "rxBytes", "networkRxBytesPerSec") - # if networkRxRate && !networkRxRate.empty? && !networkRxRate.nil? - # metricDataItems.push(networkRxRate) - # end - # networkTxRate = getNodeMetricItemRate(metricInfo, hostName, "network", "txBytes", "networkTxBytesPerSec") - # if networkTxRate && !networkTxRate.empty? && !networkTxRate.nil? - # metricDataItems.push(networkTxRate) - # end else @Log.warn("Couldn't get metric information for host: #{hostName}") end @@ -203,7 +223,6 @@ def getContainerCpuMetricItems(metricJSON, hostName, cpuMetricNameToCollect, met containerName = container["name"] metricValue = container["cpu"][cpuMetricNameToCollect] metricTime = metricPollTime #container["cpu"]["time"] - metricItem = {} metricItem["Timestamp"] = metricTime diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index 3720bf6dc..4b50e20d8 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -25,7 +25,12 @@ class KubernetesApiClient #@@IsValidRunningNode = nil #@@IsLinuxCluster = nil @@KubeSystemNamespace = "kube-system" - @LogPath = "/var/opt/microsoft/docker-cimprov/log/kubernetes_client_log.txt" + @os_type = ENV["OS_TYPE"] + if !@os_type.nil? && !@os_type.empty? && @os_type.strip.casecmp("windows") == 0 + @LogPath = "/etc/omsagentwindows/kubernetes_client_log.txt" + else + @LogPath = "/var/opt/microsoft/docker-cimprov/log/kubernetes_client_log.txt" + end @Log = Logger.new(@LogPath, 2, 10 * 1048576) #keep last 2 files, max log file size = 10M @@TokenFileName = "/var/run/secrets/kubernetes.io/serviceaccount/token" @@TokenStr = nil diff --git a/source/plugins/ruby/MdmMetricsGenerator.rb b/source/plugins/ruby/MdmMetricsGenerator.rb index a809087dc..73cf19fac 100644 --- a/source/plugins/ruby/MdmMetricsGenerator.rb +++ b/source/plugins/ruby/MdmMetricsGenerator.rb @@ -10,7 +10,12 @@ class MdmMetricsGenerator require_relative "constants" require_relative "oms_common" - @log_path = "/var/opt/microsoft/docker-cimprov/log/mdm_metrics_generator.log" + @os_type = ENV["OS_TYPE"] + if !@os_type.nil? && !@os_type.empty? && @os_type.strip.casecmp("windows") == 0 + @log_path = "/etc/omsagentwindows/mdm_metrics_generator.log" + else + @log_path = "/var/opt/microsoft/docker-cimprov/log/mdm_metrics_generator.log" + end @log = Logger.new(@log_path, 1, 5000000) @@hostName = (OMS::Common.get_hostname) diff --git a/source/plugins/ruby/arc_k8s_cluster_identity.rb b/source/plugins/ruby/arc_k8s_cluster_identity.rb index 552dafb1f..39b8c1c96 100644 --- a/source/plugins/ruby/arc_k8s_cluster_identity.rb +++ b/source/plugins/ruby/arc_k8s_cluster_identity.rb @@ -18,15 +18,20 @@ class ArcK8sClusterIdentity @@crd_resource_uri_template = "%{kube_api_server_url}/apis/%{cluster_config_crd_api_version}/namespaces/%{cluster_identity_resource_namespace}/azureclusteridentityrequests/%{cluster_identity_resource_name}" @@secret_resource_uri_template = "%{kube_api_server_url}/api/v1/namespaces/%{cluster_identity_token_secret_namespace}/secrets/%{token_secret_name}" @@azure_monitor_custom_metrics_audience = "https://monitoring.azure.com/" - @@cluster_identity_request_kind = "AzureClusterIdentityRequest" + @@cluster_identity_request_kind = "AzureClusterIdentityRequest" def initialize - @LogPath = "/var/opt/microsoft/docker-cimprov/log/arc_k8s_cluster_identity.log" + @os_type = ENV["OS_TYPE"] + if !@os_type.nil? && !@os_type.empty? && @os_type.strip.casecmp("windows") == 0 + @LogPath = "/etc/omsagentwindows/arc_k8s_cluster_identity.log" + else + @LogPath = "/var/opt/microsoft/docker-cimprov/log/arc_k8s_cluster_identity.log" + end @log = Logger.new(@LogPath, 1, 5000000) @log.info "initialize start @ #{Time.now.utc.iso8601}" @token_expiry_time = Time.now @cached_access_token = String.new - @isLastTokenRenewalUpdatePending = false + @isLastTokenRenewalUpdatePending = false @token_file_path = "/var/run/secrets/kubernetes.io/serviceaccount/token" @cert_file_path = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" @kube_api_server_url = KubernetesApiClient.getKubeAPIServerUrl @@ -34,8 +39,8 @@ def initialize @log.warn "got api server url nil from KubernetesApiClient.getKubeAPIServerUrl @ #{Time.now.utc.iso8601}" end @http_client = get_http_client - @service_account_token = get_service_account_token - @extensionName = ENV["ARC_K8S_EXTENSION_NAME"] + @service_account_token = get_service_account_token + @extensionName = ENV["ARC_K8S_EXTENSION_NAME"] @log.info "extension name:#{@extensionName} @ #{Time.now.utc.iso8601}" @log.info "initialize complete @ #{Time.now.utc.iso8601}" end @@ -55,7 +60,7 @@ def get_cluster_identity_token() @isLastTokenRenewalUpdatePending = true else @log.warn "last token renewal update still pending @ #{Time.now.utc.iso8601}" - end + end end @log.info "get token reference from crd @ #{Time.now.utc.iso8601}" tokenReference = get_token_reference_from_crd @@ -68,7 +73,7 @@ def get_cluster_identity_token() token = get_token_from_secret(token_secret_name, token_secret_data_name) if !token.nil? @cached_access_token = token - @isLastTokenRenewalUpdatePending = false + @isLastTokenRenewalUpdatePending = false else @log.warn "got token nil from secret: #{@token_secret_name}" end @@ -141,7 +146,7 @@ def get_token_reference_from_crd() create_request.body = crd_request_body_json create_response = @http_client.request(create_request) @log.info "Got response of #{create_response.code} for POST #{crd_request_uri} @ #{Time.now.utc.iso8601}" - end + end rescue => err @log.warn "get_token_reference_from_crd call failed: #{err}" ApplicationInsightsUtility.sendExceptionTelemetry(err, { "FeatureArea" => "MDM" }) @@ -159,7 +164,7 @@ def renew_near_expiry_token() cluster_identity_resource_namespace: @@cluster_identity_resource_namespace, cluster_identity_resource_name: @@cluster_identity_resource_name, } - update_crd_request_body = { 'status': {'expirationTime': ''} } + update_crd_request_body = { 'status': { 'expirationTime': "" } } update_crd_request_body_json = update_crd_request_body.to_json update_crd_request_uri = crd_request_uri + "/status" update_request = Net::HTTP::Patch.new(update_crd_request_uri) @@ -234,9 +239,9 @@ def get_crd_request_body body["metadata"]["namespace"] = @@cluster_identity_resource_namespace body["spec"] = {} body["spec"]["audience"] = @@azure_monitor_custom_metrics_audience - if !@extensionName.nil? && !@extensionName.empty? - body["spec"]["resourceId"] = @extensionName - end + if !@extensionName.nil? && !@extensionName.empty? + body["spec"]["resourceId"] = @extensionName + end return body end end diff --git a/source/plugins/ruby/filter_cadvisor2mdm.rb b/source/plugins/ruby/filter_cadvisor2mdm.rb index 62dcf31dc..9c6b661b0 100644 --- a/source/plugins/ruby/filter_cadvisor2mdm.rb +++ b/source/plugins/ruby/filter_cadvisor2mdm.rb @@ -2,7 +2,7 @@ # frozen_string_literal: true -require 'fluent/plugin/filter' +require "fluent/plugin/filter" module Fluent::Plugin require "logger" @@ -28,6 +28,12 @@ class CAdvisor2MdmFilter < Filter @@metric_threshold_hash = {} @@controller_type = "" + @@isWindows = false + @@os_type = ENV["OS_TYPE"] + if !@@os_type.nil? && !@@os_type.empty? && @@os_type.strip.casecmp("windows") == 0 + @@isWindows = true + end + def initialize super end @@ -130,15 +136,17 @@ def flushMetricTelemetry # Also send for PV usage metrics begin - pvTimeDifference = (DateTime.now.to_time.to_i - @@pvUsageTelemetryTimeTracker).abs - pvTimeDifferenceInMinutes = pvTimeDifference / 60 - if (pvTimeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) - pvProperties = {} - pvProperties["PVUsageThresholdPercentage"] = @@metric_threshold_hash[Constants::PV_USED_BYTES] - pvProperties["PVUsageThresholdExceededInLastFlushInterval"] = @pvExceededUsageThreshold - ApplicationInsightsUtility.sendCustomEvent(Constants::PV_USAGE_HEART_BEAT_EVENT, pvProperties) - @pvExceededUsageThreshold = false - @@pvUsageTelemetryTimeTracker = DateTime.now.to_time.to_i + if !@@isWindows.nil? && @@isWindows == false + pvTimeDifference = (DateTime.now.to_time.to_i - @@pvUsageTelemetryTimeTracker).abs + pvTimeDifferenceInMinutes = pvTimeDifference / 60 + if (pvTimeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) + pvProperties = {} + pvProperties["PVUsageThresholdPercentage"] = @@metric_threshold_hash[Constants::PV_USED_BYTES] + pvProperties["PVUsageThresholdExceededInLastFlushInterval"] = @pvExceededUsageThreshold + ApplicationInsightsUtility.sendCustomEvent(Constants::PV_USAGE_HEART_BEAT_EVENT, pvProperties) + @pvExceededUsageThreshold = false + @@pvUsageTelemetryTimeTracker = DateTime.now.to_time.to_i + end end rescue => errorStr @log.info "Error in flushMetricTelemetry: #{errorStr} for PV usage telemetry" @@ -346,7 +354,6 @@ def ensure_cpu_memory_capacity_set # cpu_capacity and memory_capacity keep initialized value of 0.0 @log.error "Error getting capacity_from_kubelet: cpu_capacity and memory_capacity" end - end end diff --git a/source/plugins/ruby/in_cadvisor_perf.rb b/source/plugins/ruby/in_cadvisor_perf.rb index 781042cea..b3f9bd08b 100644 --- a/source/plugins/ruby/in_cadvisor_perf.rb +++ b/source/plugins/ruby/in_cadvisor_perf.rb @@ -1,16 +1,20 @@ #!/usr/local/bin/ruby # frozen_string_literal: true -require 'fluent/plugin/input' +require "fluent/plugin/input" module Fluent::Plugin - class CAdvisor_Perf_Input < Input Fluent::Plugin.register_input("cadvisor_perf", self) + @@isWindows = false + @@os_type = ENV["OS_TYPE"] + if !@@os_type.nil? && !@@os_type.empty? && @@os_type.strip.casecmp("windows") == 0 + @@isWindows = true + end def initialize super require "yaml" - require 'yajl/json_gem' + require "yajl/json_gem" require "time" require_relative "CAdvisorMetricsAPIClient" @@ -69,31 +73,32 @@ def enumerate() router.emit_stream(@containerhealthtag, eventStream) if eventStream router.emit_stream(@nodehealthtag, eventStream) if eventStream - if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && eventStream.count > 0) $log.info("cAdvisorPerfEmitStreamSuccess @ #{Time.now.utc.iso8601}") end #start GPU InsightsMetrics items begin - containerGPUusageInsightsMetricsDataItems = [] - containerGPUusageInsightsMetricsDataItems.concat(CAdvisorMetricsAPIClient.getInsightsMetrics(winNode: nil, metricTime: batchTime)) + if !@@isWindows.nil? && @@isWindows == false + containerGPUusageInsightsMetricsDataItems = [] + containerGPUusageInsightsMetricsDataItems.concat(CAdvisorMetricsAPIClient.getInsightsMetrics(winNode: nil, metricTime: batchTime)) containerGPUusageInsightsMetricsDataItems.each do |insightsMetricsRecord| insightsMetricsEventStream.add(time, insightsMetricsRecord) if insightsMetricsRecord end - router.emit_stream(@insightsmetricstag, insightsMetricsEventStream) if insightsMetricsEventStream - router.emit_stream(@mdmtag, insightsMetricsEventStream) if insightsMetricsEventStream - - if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0) - $log.info("cAdvisorInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") + router.emit_stream(@insightsmetricstag, insightsMetricsEventStream) if insightsMetricsEventStream + router.emit_stream(@mdmtag, insightsMetricsEventStream) if insightsMetricsEventStream + + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0) + $log.info("cAdvisorInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end end rescue => errorStr $log.warn "Failed when processing GPU Usage metrics in_cadvisor_perf : #{errorStr}" $log.debug_backtrace(errorStr.backtrace) ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) - end + end #end GPU InsightsMetrics items rescue => errorStr diff --git a/source/plugins/ruby/in_win_cadvisor_perf.rb b/source/plugins/ruby/in_win_cadvisor_perf.rb index 61e823ea6..9ab2474b1 100644 --- a/source/plugins/ruby/in_win_cadvisor_perf.rb +++ b/source/plugins/ruby/in_win_cadvisor_perf.rb @@ -1,7 +1,7 @@ #!/usr/local/bin/ruby # frozen_string_literal: true -require 'fluent/plugin/input' +require "fluent/plugin/input" module Fluent::Plugin class Win_CAdvisor_Perf_Input < Input @@ -20,7 +20,7 @@ def initialize require_relative "oms_common" require_relative "omslog" require_relative "constants" - @insightsMetricsTag = "oneagent.containerInsights.INSIGHTS_METRICS_BLOB" + @insightsMetricsTag = "oneagent.containerInsights.INSIGHTS_METRICS_BLOB" end config_param :run_interval, :time, :default => 60 @@ -57,7 +57,7 @@ def enumerate() begin timeDifference = (DateTime.now.to_time.to_i - @@winNodeQueryTimeTracker).abs timeDifferenceInMinutes = timeDifference / 60 - @@istestvar = ENV["ISTEST"] + @@istestvar = ENV["ISTEST"] #Resetting this cache so that it is populated with the current set of containers with every call CAdvisorMetricsAPIClient.resetWinContainerIdCache() @@ -79,7 +79,6 @@ def enumerate() end end router.emit_stream(@tag, eventStream) if eventStream - router.emit_stream(@mdmtag, eventStream) if eventStream if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && eventStream.count > 0) $log.info("winCAdvisorPerfEmitStreamSuccess @ #{Time.now.utc.iso8601}") @@ -89,10 +88,10 @@ def enumerate() begin containerGPUusageInsightsMetricsDataItems = [] containerGPUusageInsightsMetricsDataItems.concat(CAdvisorMetricsAPIClient.getInsightsMetrics(winNode: winNode, metricTime: Time.now.utc.iso8601)) - insightsMetricsEventStream = Fluent::MultiEventStream.new + insightsMetricsEventStream = Fluent::MultiEventStream.new containerGPUusageInsightsMetricsDataItems.each do |insightsMetricsRecord| - insightsMetricsEventStream.add(time, insightsMetricsRecord) if insightsMetricsRecord + insightsMetricsEventStream.add(time, insightsMetricsRecord) if insightsMetricsRecord end router.emit_stream(@insightsMetricsTag, insightsMetricsEventStream) if insightsMetricsEventStream diff --git a/source/plugins/ruby/kubelet_utils.rb b/source/plugins/ruby/kubelet_utils.rb index e2c731b79..22bc87c0e 100644 --- a/source/plugins/ruby/kubelet_utils.rb +++ b/source/plugins/ruby/kubelet_utils.rb @@ -9,7 +9,12 @@ require "bigdecimal" class KubeletUtils - @log_path = "/var/opt/microsoft/docker-cimprov/log/filter_cadvisor2mdm.log" + @os_type = ENV["OS_TYPE"] + if !@os_type.nil? && !@os_type.empty? && @os_type.strip.casecmp("windows") == 0 + @log_path = "/etc/omsagentwindows/filter_cadvisor2mdm.log" + else + @log_path = "/var/opt/microsoft/docker-cimprov/log/filter_cadvisor2mdm.log" + end @log = Logger.new(@log_path, 1, 5000000) class << self