From eb2c5f34e6e4bf460e1443670cb7aacff1bb30b2 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 4 Aug 2020 17:00:24 -0700 Subject: [PATCH 001/194] separate build yamls for ci_prod branch (#415) --- ...l.all_tag.all_phase.all_config.ci_prod.yml | 44 +++++++++++++++ ...l.all_tag.all_phase.all_config.ci_prod.yml | 55 +++++++++++++++++++ 2 files changed, 99 insertions(+) create mode 100644 .pipelines/pipeline.user.linux.official.all_tag.all_phase.all_config.ci_prod.yml create mode 100644 .pipelines/pipeline.user.windows.official.all_tag.all_phase.all_config.ci_prod.yml diff --git a/.pipelines/pipeline.user.linux.official.all_tag.all_phase.all_config.ci_prod.yml b/.pipelines/pipeline.user.linux.official.all_tag.all_phase.all_config.ci_prod.yml new file mode 100644 index 000000000..d47a60ffe --- /dev/null +++ b/.pipelines/pipeline.user.linux.official.all_tag.all_phase.all_config.ci_prod.yml @@ -0,0 +1,44 @@ +environment: + host: + os: 'linux' + flavor: 'ubuntu' + version: '16.04' + runtime: + provider: 'appcontainer' + image: 'cdpxlinux.azurecr.io/user/azure-monitor/container-insights:1.0' + +version: + name: 'DockerProvider' + major: 10 + minor: 0 + tag: 'beta' + system: 'custom' + exclude_commit: true + +restore: + commands: + - !!defaultcommand + name: 'get go modules' + command: '.pipelines/restore-linux.sh' + fail_on_stderr: false + +build: + commands: + - !!defaultcommand + name: 'Build Docker Provider Shell Bundle' + command: '.pipelines/build-linux.sh' + fail_on_stderr: false + +package: + commands: + - !!dockerbuildcommand # REQUIRED: This maps the command data to a concrete type in the CDPX orchestrator. + name: 'Build Docker Image' # REQUIRED: All commands have a name field. All console output captured when + # this command runs is tagged with the value of this field. + context_folder: 'kubernetes/linux' # REQUIRED: The repository root relative path of the folder containing the Dockerfile to build. + # In effect, the context folder will be repository_checkout_folder/src/DockerFinal. + dockerfile_name: 'Dockerfile' # OPTIONAL: The name of the dockerfile. Docker client does allow the Dockerfile + # to be named differently. Defaults to Dockerfile. + # In effect, the -f option value passed to docker build will be repository_checkout_folder/src/DockerFinal/Foo.dockerfile. + repository_name: 'cdpxlinux' # only supported ones are cdpx acr repos + tag: 'ciprod' # OPTIONAL: Defaults to latest. The tag for the built image. Final tag will be 1.0.0alpha, 1.0.0-timestamp-commitID. + latest: false # OPTIONAL: Defaults to false. If tag is not set to latest and this flag is set, then tag as latest as well and push latest as well. 
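To make the package step above concrete: the comments in the yaml describe a docker build driven by context_folder, dockerfile_name, and tag. A minimal sketch of the equivalent local build follows, assuming a local image name (CDPX owns the real registry path and expands the final tag with the timestamp/commit-ID suffix described in the comments):

```bash
#!/usr/bin/env bash
# Hedged sketch of the build the CDPX !!dockerbuildcommand performs for the Linux agent.
set -euo pipefail

CONTEXT_FOLDER="kubernetes/linux"   # context_folder from the pipeline yaml above
TAG="ciprod"                        # tag from the yaml; CDPX expands it to e.g. <tag>-<timestamp>-<commitID>

# dockerfile_name defaults to "Dockerfile" and is resolved inside the context folder.
docker build -f "${CONTEXT_FOLDER}/Dockerfile" -t "container-insights:${TAG}" "${CONTEXT_FOLDER}"
```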
diff --git a/.pipelines/pipeline.user.windows.official.all_tag.all_phase.all_config.ci_prod.yml b/.pipelines/pipeline.user.windows.official.all_tag.all_phase.all_config.ci_prod.yml new file mode 100644 index 000000000..e0286fbd6 --- /dev/null +++ b/.pipelines/pipeline.user.windows.official.all_tag.all_phase.all_config.ci_prod.yml @@ -0,0 +1,55 @@ +environment: + host: + os: 'windows' + flavor: 'server' + version: '2019' + runtime: + provider: 'appcontainer' + image: 'cdpxwin1809.azurecr.io/user/azure-monitor/container-insights:6.0' + source_mode: 'map' + +version: + name: 'Certificate Generator and Out OMS plugin' + major: 10 + minor: 0 + tag: 'beta' + system: 'custom' + exclude_commit: true + +signing_options: + profile: 'azure' + codesign_validation_glob_pattern: 'regex|.+(?:dll|exe|sys|ps1|psm1|ps1xml|psc1|psd1|cdxml|vbs|js|wsf)$;-:file|**\linux\**' #CSV does not currently support binaries built for linux, so we exclude this folder + +static_analysis_options: + binskim_options: + files_to_scan: + - from: 'build\windows\installer\certificategenerator\bin\' + exclude: # exclude binaries which are referenced via dotnet packages and not built by us + - '**/**/**/BouncyCastle.Crypto.dll' + - '**/**/**/**/BouncyCastle.Crypto.dll' +restore: + commands: + - !!defaultcommand + name: 'Restore dotnet packages' + command: '.pipelines/restore-windows.cmd' + +build: + commands: + - !!defaultcommand + name: 'Build Certificate Generator Source code and Out OMS Go plugin code' + command: '.pipelines/build-windows.cmd' + fail_on_stderr: false + +package: + commands: + - !!dockerbuildcommand # REQUIRED: This maps the command data to a concrete type in the CDPX orchestrator. + name: 'Build Docker Image' # REQUIRED: All commands have a name field. All console output captured when + # this command runs is tagged with the value of this field. + context_folder: 'kubernetes/windows' # REQUIRED: The repository root relative path of the folder containing the Dockerfile to build. + # In effect, the context folder will be repository_checkout_folder/src/DockerFinal. + dockerfile_name: 'Dockerfile' # OPTIONAL: The name of the dockerfile. Docker client does allow the Dockerfile + # to be named differently. Defaults to Dockerfile. + # In effect, the -f option value passed to docker build will be repository_checkout_folder/src/DockerFinal/Foo.dockerfile. + repository_name: 'cdpxwin1809' # only supported ones are cdpx acr repos + tag: 'win-ciprod' # OPTIONAL: Defaults to latest. The tag for the built image. Final tag will be 1.0.0alpha, 1.0.0-timestamp-commitID. + latest: false # OPTIONAL: Defaults to false. If tag is not set to latest and this flag is set, then tag as latest as well and push latest as well. From df29e35c0b5d5a4bf73bb833f9939bda40ee0732 Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Wed, 5 Aug 2020 17:45:14 -0700 Subject: [PATCH 002/194] re-enable adx path (#420) --- source/plugins/go/src/oms.go | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/source/plugins/go/src/oms.go b/source/plugins/go/src/oms.go index 88c5641f7..63ca6de10 100644 --- a/source/plugins/go/src/oms.go +++ b/source/plugins/go/src/oms.go @@ -1323,9 +1323,7 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { ContainerLogsRouteV2 = true Log("Routing container logs thru %s route...", ContainerLogsV2Route) fmt.Fprintf(os.Stdout, "Routing container logs thru %s route... 
\n", ContainerLogsV2Route) - //} else if strings.Compare(ContainerLogsRoute, ContainerLogsADXRoute) == 0 { - //making dormant with below comparison for now -- - } else if strings.Compare("willnot", "match") == 0 { + } else if strings.Compare(ContainerLogsRoute, ContainerLogsADXRoute) == 0 { //check if adx clusteruri, clientid & secret are set var err error AdxClusterUri, err = ReadFileContents(PluginConfiguration["adx_cluster_uri_path"]) From bcc8506e4d4a1114307d3d13ad09111ada9c367e Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Wed, 5 Aug 2020 18:17:12 -0700 Subject: [PATCH 003/194] Gangams/release changes (#419) * updates related to release * updates related to release * fix the incorrect version * fix pr feedback * fix some typos in the release notes --- README.md | 4 ++-- ReleaseNotes.md | 16 ++++++++++++++++ ReleaseProcess.md | 16 +++++++++++----- build/version | 4 ++-- charts/azuremonitor-containers/Chart.yaml | 2 +- charts/azuremonitor-containers/values.yaml | 6 +++--- kubernetes/linux/Dockerfile | 2 +- kubernetes/omsagent.yaml | 12 ++++++------ kubernetes/windows/Dockerfile | 2 +- 9 files changed, 43 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 06d3606c0..659fe0161 100644 --- a/README.md +++ b/README.md @@ -213,7 +213,7 @@ powershell -ExecutionPolicy bypass # switch to powershell if you are not on pow # Azure DevOps Build Pipeline -Navigate to https://github-private.visualstudio.com/microsoft/_build?view=pipelines to see Linux and Windows Agent build pipelines. These pipelines are configured with CI triggers for ci_dev and ci_prod (TBD). +Navigate to https://github-private.visualstudio.com/microsoft/_build?view=pipelines to see Linux and Windows Agent build pipelines. These pipelines are configured with CI triggers for ci_dev and ci_prod. Docker Images will be pushed to CDPX ACR repos and these needs to retagged and pushed to corresponding ACR or docker hub. Only onboarded Azure AD AppId has permission to pull the images from CDPx ACRs. @@ -236,7 +236,7 @@ Here are the instructions to onboard the feature branch to Azure Dev Ops pipelin # Azure DevOps Release Pipeline -Integrated to Azure DevOps release pipeline for the ci_dev and ci_prod (TBD).With this, for every commit to ci_dev branch, latest bits automatically deployded to DEV AKS clusters in Build subscription and similarly for for every commit to ci_prod branch, latest bits automatically deployed to PROD AKS clusters in Build subscription. +Integrated to Azure DevOps release pipeline for the ci_dev and ci_prod.With this, for every commit to ci_dev branch, latest bits automatically deployded to DEV AKS clusters in Build subscription and similarly for for every commit to ci_prod branch, latest bits automatically deployed to PROD AKS clusters in Build subscription. For dev, agent image will be in this format mcr.microsoft.com/azuremonitor/containerinsights/cidev:cidev. For prod, agent will be in this format mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod`
`. diff --git a/ReleaseNotes.md b/ReleaseNotes.md index aa57d8388..933900b89 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -11,6 +11,22 @@ additional questions or comments. Note: The agent version(s) below have dates (ciprod), which indicate the agent build dates (not release dates) +### 08/05/2020 - +##### Version microsoft/oms:ciprod08052020 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod08052020 (linux) +##### Version microsoft/oms:win-ciprod08052020 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod08052020 (windows) +##### Code change log +- Collection of KubeState metrics for deployments and HPA +- Add the Proxy support for Windows agent +- Fix for ContainerState in ContainerInventory to handle Failed state and collection of environment variables for terminated and failed containers +- Change /spec to /metrics/cadvisor endpoint to collect node capacity metrics +- Disable Health Plugin by default; it can be enabled via configmap +- Pin version of jq to 1.5+dfsg-2 +- Bug fix for showing node as 'not ready' when there is disk pressure +- oneagent integration (disabled by default) +- Add region check before sending alertable metrics to MDM +- Telemetry fix for agent telemetry for sovereign clouds + + ### 07/15/2020 - ##### Version microsoft/oms:ciprod07152020 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod07152020 (linux) ##### Version microsoft/oms:win-ciprod05262020-2 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod05262020-2 (windows) diff --git a/ReleaseProcess.md b/ReleaseProcess.md index 38ff1ab69..5ec42d496 100644 --- a/ReleaseProcess.md +++ b/ReleaseProcess.md @@ -5,15 +5,21 @@ Here are the high-level instructions to get the CIPROD`
` image for the production release 1. Create a feature branch from ci_dev and make the following updates > Note: This is required since the Azure DevOps pipeline doesn't support --build-arg yet to automate this. - Ensure IMAGE_TAG updated with release candiate image tag in the DockerFile under kubernetes/linux and kubernetes/windows directory - Update omsagent.yaml if there are any changes to the yaml + - Ensure IMAGE_TAG is updated with the release candidate image tag in the DockerFile under the kubernetes/linux and kubernetes/windows directories + - Update the version file under the build directory with the build version and date + - Update omsagent.yaml for the image tag and dockerProviderVersion, and any other changes + - Update the chart version and image tags in values.yaml under charts/azuremonitor-containers - Release notes 2. Make a PR to the ci_dev branch and, once the PR is approved, merge the changes to ci_dev 3. The latest bits of ci_dev are automatically deployed to the CIDEV cluster in the build subscription, so just validate E2E to make sure everything works 4. If everything is validated in DEV, create a merge PR from ci_dev to ci_prod and merge once it has been reviewed by the dev team -5. Merge ci_dev and ci_prod branch which will trigger automatic deployment of latest bits to CIPROD cluster with CIPROD`
` image (TBD) +6. Update the following pipeline variables under ReleaseCandiate with the version of the chart and the image tag + - CIHELMCHARTVERSION # For example, 2.7.4 + - CIImageTagSuffix # ciprod08052020 or ciprod08052020-1 etc. +7. Merge the ci_dev and ci_prod branches, which will trigger automatic deployment of the latest bits to the CIPROD cluster with the CIPROD`
` image to test and scale clusters, AKS, AKS-Engine > Note: the production image is automatically pushed to the CIPROD Public cloud ACR, which will in turn be replicated to the Public cloud MCR. -6. Validate all the scenarios against CIPROD cluster in Build subscription +8. Validate all the scenarios against clusters in build subscription and scale clusters + # 2. Perf and scale testing @@ -27,7 +33,7 @@ Image automatically synched to MCR CN from Public cloud MCR. ## AKS -Make PR against [AKS-RP](https://msazure.visualstudio.com/CloudNativeCompute/_git/aks-rp?version=GBmaster) repo with chart update(s) +- Refer to internal docs for the release process and instructions. ## ARO v3 diff --git a/build/version b/build/version index b856fc312..f26973116 100644 --- a/build/version +++ b/build/version @@ -5,8 +5,8 @@ CONTAINER_BUILDVERSION_MAJOR=10 CONTAINER_BUILDVERSION_MINOR=0 CONTAINER_BUILDVERSION_PATCH=0 -CONTAINER_BUILDVERSION_BUILDNR=1 -CONTAINER_BUILDVERSION_DATE=20200526 +CONTAINER_BUILDVERSION_BUILDNR=4 +CONTAINER_BUILDVERSION_DATE=20200805 CONTAINER_BUILDVERSION_STATUS=Developer_Build #-------------------------------- End of File ----------------------------------- diff --git a/charts/azuremonitor-containers/Chart.yaml b/charts/azuremonitor-containers/Chart.yaml index 8a84692e7..202494152 100644 --- a/charts/azuremonitor-containers/Chart.yaml +++ b/charts/azuremonitor-containers/Chart.yaml @@ -2,7 +2,7 @@ apiVersion: v1 appVersion: 7.0.0-1 description: Helm chart for deploying Azure Monitor container monitoring agent in Kubernetes name: azuremonitor-containers -version: 2.7.3 +version: 2.7.4 kubeVersion: "^1.10.0-0" keywords: - monitoring diff --git a/charts/azuremonitor-containers/values.yaml b/charts/azuremonitor-containers/values.yaml index 685c767bb..927d24b35 100644 --- a/charts/azuremonitor-containers/values.yaml +++ b/charts/azuremonitor-containers/values.yaml @@ -7,10 +7,10 @@ omsagent: image: repo: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod" - tag: "ciprod07152020" - tagWindows: "win-ciprod05262020-2" + tag: "ciprod08052020" + tagWindows: "win-ciprod08052020" pullPolicy: IfNotPresent - dockerProviderVersion: "10.0.0-3" + dockerProviderVersion: "10.0.0-4" agentVersion: "1.10.0.1" ## To get your workspace id and key do the following ## You can create an Azure Log Analytics workspace from portal.azure.com and get its ID & PRIMARY KEY from the 'Advanced Settings' tab in the Ux.
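Since the Chart.yaml version and the values.yaml image tags above are bumped together for a release, here is a hedged sketch of rolling them out with Helm; the `omsagent.secret.*` parameter names and the workspace placeholders are assumptions rather than values taken from this patch:

```bash
#!/usr/bin/env bash
# Illustrative upgrade using the release tags from this patch (not an official procedure).
set -euo pipefail

helm upgrade --install azmon-containers ./charts/azuremonitor-containers \
  --set omsagent.image.tag="ciprod08052020" \
  --set omsagent.image.tagWindows="win-ciprod08052020" \
  --set omsagent.secret.wsid="<log-analytics-workspace-id>" \
  --set omsagent.secret.key="<log-analytics-workspace-key>"
```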
diff --git a/kubernetes/linux/Dockerfile b/kubernetes/linux/Dockerfile index c8b61995d..c82532471 100644 --- a/kubernetes/linux/Dockerfile +++ b/kubernetes/linux/Dockerfile @@ -2,7 +2,7 @@ FROM ubuntu:18.04 MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="Azure Monitor for containers" -ARG IMAGE_TAG=ciprod07152020 +ARG IMAGE_TAG=ciprod08052020 ENV AGENT_VERSION ${IMAGE_TAG} ENV tmpdir /opt ENV APPLICATIONINSIGHTS_AUTH NzAwZGM5OGYtYTdhZC00NThkLWI5NWMtMjA3ZjM3NmM3YmRi diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 038c7e92b..ac712722a 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -337,13 +337,13 @@ spec: tier: node annotations: agentVersion: "1.10.0.1" - dockerProviderVersion: "10.0.0-3" + dockerProviderVersion: "10.0.0-4" schema-versions: "v1" spec: serviceAccountName: omsagent containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod07152020" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod08052020" imagePullPolicy: IfNotPresent resources: limits: @@ -480,13 +480,13 @@ spec: rsName: "omsagent-rs" annotations: agentVersion: "1.10.0.1" - dockerProviderVersion: "10.0.0-3" + dockerProviderVersion: "10.0.0-4" schema-versions: "v1" spec: serviceAccountName: omsagent containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod07152020" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod08052020" imagePullPolicy: IfNotPresent resources: limits: @@ -631,13 +631,13 @@ spec: tier: node-win annotations: agentVersion: "1.10.0.1" - dockerProviderVersion: "10.0.0-2" + dockerProviderVersion: "10.0.0-4" schema-versions: "v1" spec: serviceAccountName: omsagent containers: - name: omsagent-win - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod05262020-2" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod08052020" imagePullPolicy: IfNotPresent resources: limits: diff --git a/kubernetes/windows/Dockerfile b/kubernetes/windows/Dockerfile index 9a5e22e0d..0b81b9c71 100644 --- a/kubernetes/windows/Dockerfile +++ b/kubernetes/windows/Dockerfile @@ -3,7 +3,7 @@ MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="Azure Monitor for containers" -ARG IMAGE_TAG=win-ciprod05262020-2 +ARG IMAGE_TAG=win-ciprod08052020 SHELL ["powershell"] From 39534d6116ca5df1325768e681646b5d6010ea6b Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Wed, 5 Aug 2020 20:06:46 -0700 Subject: [PATCH 004/194] fix for zero filled metrics (#423) --- source/plugins/ruby/podinventory_to_mdm.rb | 98 +++++++++++----------- 1 file changed, 51 insertions(+), 47 deletions(-) diff --git a/source/plugins/ruby/podinventory_to_mdm.rb b/source/plugins/ruby/podinventory_to_mdm.rb index dd5a15990..834515969 100644 --- a/source/plugins/ruby/podinventory_to_mdm.rb +++ b/source/plugins/ruby/podinventory_to_mdm.rb @@ -93,67 +93,71 @@ def initialize(custom_metrics_azure_regions) end def get_pod_inventory_mdm_records(batch_time) + records = [] begin - # generate all possible values of non_phase_dim_values X pod Phases and zero-fill the ones that are not already present - @no_phase_dim_values_hash.each { |key, value| - @@pod_phase_values.each { |phase| - pod_key = [key, phase].join("~~") - if !@pod_count_hash.key?(pod_key) - @pod_count_hash[pod_key] = 0 - else + if @process_incoming_stream + # generate all possible values of non_phase_dim_values X 
pod Phases and zero-fill the ones that are not already present + @no_phase_dim_values_hash.each { |key, value| + @@pod_phase_values.each { |phase| + pod_key = [key, phase].join("~~") + if !@pod_count_hash.key?(pod_key) + @pod_count_hash[pod_key] = 0 + else + next + end + } + } + @pod_count_hash.each { |key, value| + key_elements = key.split("~~") + if key_elements.length != 4 next end - } - } - records = [] - @pod_count_hash.each { |key, value| - key_elements = key.split("~~") - if key_elements.length != 4 - next - end - # get dimension values by key - podNodeDimValue = key_elements[0] - podNamespaceDimValue = key_elements[1] - podControllerNameDimValue = key_elements[2] - podPhaseDimValue = key_elements[3] + # get dimension values by key + podNodeDimValue = key_elements[0] + podNamespaceDimValue = key_elements[1] + podControllerNameDimValue = key_elements[2] + podPhaseDimValue = key_elements[3] - record = @@pod_inventory_custom_metrics_template % { - timestamp: batch_time, - metricName: @@pod_count_metric_name, - phaseDimValue: podPhaseDimValue, - namespaceDimValue: podNamespaceDimValue, - nodeDimValue: podNodeDimValue, - controllerNameDimValue: podControllerNameDimValue, - podCountMetricValue: value, + record = @@pod_inventory_custom_metrics_template % { + timestamp: batch_time, + metricName: @@pod_count_metric_name, + phaseDimValue: podPhaseDimValue, + namespaceDimValue: podNamespaceDimValue, + nodeDimValue: podNodeDimValue, + controllerNameDimValue: podControllerNameDimValue, + podCountMetricValue: value, + } + records.push(JSON.parse(record)) } - records.push(JSON.parse(record)) - } - #Add pod metric records - records = MdmMetricsGenerator.appendAllPodMetrics(records, batch_time) + #Add pod metric records + records = MdmMetricsGenerator.appendAllPodMetrics(records, batch_time) - #Send telemetry for pod metrics - timeDifference = (DateTime.now.to_time.to_i - @@metricTelemetryTimeTracker).abs - timeDifferenceInMinutes = timeDifference / 60 - if (timeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) - MdmMetricsGenerator.flushPodMdmMetricTelemetry - @@metricTelemetryTimeTracker = DateTime.now.to_time.to_i - end + #Send telemetry for pod metrics + timeDifference = (DateTime.now.to_time.to_i - @@metricTelemetryTimeTracker).abs + timeDifferenceInMinutes = timeDifference / 60 + if (timeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) + MdmMetricsGenerator.flushPodMdmMetricTelemetry + @@metricTelemetryTimeTracker = DateTime.now.to_time.to_i + end - # Clearing out all hashes after telemetry is flushed - MdmMetricsGenerator.clearPodHashes + # Clearing out all hashes after telemetry is flushed + MdmMetricsGenerator.clearPodHashes + end rescue Exception => e @log.info "Error processing pod inventory record Exception: #{e.class} Message: #{e.message}" ApplicationInsightsUtility.sendExceptionTelemetry(e.backtrace) return [] end - @log.info "Pod Count To Phase #{@pod_count_by_phase} " - @log.info "resetting convertor state " - @pod_count_hash = {} - @no_phase_dim_values_hash = {} - @pod_count_by_phase = {} - @pod_uids = {} + if @process_incoming_stream + @log.info "Pod Count To Phase #{@pod_count_by_phase} " + @log.info "resetting convertor state " + @pod_count_hash = {} + @no_phase_dim_values_hash = {} + @pod_count_by_phase = {} + @pod_uids = {} + end return records end From 5e0b42909bc63886dbf5433545d921a8237ef1e0 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Fri, 7 Aug 2020 13:26:21 -0700 Subject: [PATCH 005/194] consolidate windows agent 
image docker files (#422) * consolidate windows agent image docker files * revert docker file consolidation * revert readme updates * merge back windows dockerfiles * image tag update --- ReleaseNotes.md | 6 ++--- ReleaseProcess.md | 2 +- charts/azuremonitor-containers/values.yaml | 4 +-- kubernetes/linux/Dockerfile | 2 +- kubernetes/omsagent.yaml | 6 ++--- kubernetes/windows/Dockerfile | 31 ++++++++++++++++++++-- kubernetes/windows/baseimage/Dockerfile | 28 ------------------- 7 files changed, 39 insertions(+), 40 deletions(-) delete mode 100644 kubernetes/windows/baseimage/Dockerfile diff --git a/ReleaseNotes.md b/ReleaseNotes.md index 933900b89..0f1d932a8 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -11,9 +11,9 @@ additional questions or comments. Note : The agent version(s) below has dates (ciprod), which indicate the agent build dates (not release dates) -### 08/05/2020 - -##### Version microsoft/oms:ciprod08052020 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod08052020 (linux) -##### Version microsoft/oms:win-ciprod08052020 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod08052020 (windows) +### 08/07/2020 - +##### Version microsoft/oms:ciprod08072020 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod08072020 (linux) +##### Version microsoft/oms:win-ciprod08072020 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod08072020 (windows) ##### Code change log - Collection of KubeState metrics for deployments and HPA - Add the Proxy support for Windows agent diff --git a/ReleaseProcess.md b/ReleaseProcess.md index 5ec42d496..19802e22c 100644 --- a/ReleaseProcess.md +++ b/ReleaseProcess.md @@ -15,7 +15,7 @@ Here are the high-level instructions to get the CIPROD`
` image for 4. If everything is validated in DEV, create a merge PR from ci_dev to ci_prod and merge once it has been reviewed by the dev team 6. Update the following pipeline variables under ReleaseCandiate with the version of the chart and the image tag - CIHELMCHARTVERSION # For example, 2.7.4 - - CIImageTagSuffix # ciprod08052020 or ciprod08052020-1 etc. + - CIImageTagSuffix # ciprod08072020 or ciprod08072020-1 etc. 7. Merge the ci_dev and ci_prod branches, which will trigger automatic deployment of the latest bits to the CIPROD cluster with the CIPROD`
` image to test and scale cluters, AKS, AKS-Engine > Note: production image automatically pushed to CIPROD Public cloud ACR which will inturn replicated to Public cloud MCR. 8. Validate all the scenarios against clusters in build subscription and scale clusters diff --git a/charts/azuremonitor-containers/values.yaml b/charts/azuremonitor-containers/values.yaml index 927d24b35..610e109ef 100644 --- a/charts/azuremonitor-containers/values.yaml +++ b/charts/azuremonitor-containers/values.yaml @@ -7,8 +7,8 @@ omsagent: image: repo: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod" - tag: "ciprod08052020" - tagWindows: "win-ciprod08052020" + tag: "ciprod08072020" + tagWindows: "win-ciprod08072020" pullPolicy: IfNotPresent dockerProviderVersion: "10.0.0-4" agentVersion: "1.10.0.1" diff --git a/kubernetes/linux/Dockerfile b/kubernetes/linux/Dockerfile index c82532471..bc27a5384 100644 --- a/kubernetes/linux/Dockerfile +++ b/kubernetes/linux/Dockerfile @@ -2,7 +2,7 @@ FROM ubuntu:18.04 MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="Azure Monitor for containers" -ARG IMAGE_TAG=ciprod08052020 +ARG IMAGE_TAG=ciprod08072020 ENV AGENT_VERSION ${IMAGE_TAG} ENV tmpdir /opt ENV APPLICATIONINSIGHTS_AUTH NzAwZGM5OGYtYTdhZC00NThkLWI5NWMtMjA3ZjM3NmM3YmRi diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index ac712722a..29533e678 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -343,7 +343,7 @@ spec: serviceAccountName: omsagent containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod08052020" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod08072020" imagePullPolicy: IfNotPresent resources: limits: @@ -486,7 +486,7 @@ spec: serviceAccountName: omsagent containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod08052020" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod08072020" imagePullPolicy: IfNotPresent resources: limits: @@ -637,7 +637,7 @@ spec: serviceAccountName: omsagent containers: - name: omsagent-win - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod08052020" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod08072020" imagePullPolicy: IfNotPresent resources: limits: diff --git a/kubernetes/windows/Dockerfile b/kubernetes/windows/Dockerfile index 0b81b9c71..a18404772 100644 --- a/kubernetes/windows/Dockerfile +++ b/kubernetes/windows/Dockerfile @@ -1,9 +1,36 @@ -FROM mcr.microsoft.com/azuremonitor/containerinsights/ciprod:winakslogbase-07022020 +FROM mcr.microsoft.com/windows/servercore:ltsc2019 MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="Azure Monitor for containers" -ARG IMAGE_TAG=win-ciprod08052020 +ARG IMAGE_TAG=win-ciprod08072020 + +# Do not split this into multiple RUN! 
+# Docker creates a layer for every RUN-Statement +RUN powershell -Command "Set-ExecutionPolicy Bypass -Scope Process -Force; iex ((New-Object System.Net.WebClient).DownloadString('https://chocolatey.org/install.ps1'))" +# Fluentd depends on cool.io whose fat gem is only available for Ruby < 2.5, so need to specify --platform ruby when install Ruby > 2.5 and install msys2 to get dev tools +RUN choco install -y ruby --version 2.6.5.1 --params "'/InstallDir:C:\ruby26'" \ +&& choco install -y msys2 --version 20190524.0.0.20191030 --params "'/NoPath /NoUpdate /InstallDir:C:\ruby26\msys64'" \ +&& choco install -y vim + +# gangams - optional MSYS2 update via ridk failing in merged docker file so skipping that since we dont need optional update +RUN refreshenv \ +&& ridk install 3 \ +&& echo gem: --no-document >> C:\ProgramData\gemrc \ +&& gem install cool.io -v 1.5.4 --platform ruby \ +&& gem install oj -v 3.3.10 \ +&& gem install json -v 2.2.0 \ +&& gem install fluentd -v 1.10.2 \ +&& gem install win32-service -v 1.0.1 \ +&& gem install win32-ipc -v 0.7.0 \ +&& gem install win32-event -v 0.6.3 \ +&& gem install windows-pr -v 1.2.6 \ +&& gem install tomlrb -v 1.3.0 \ +&& gem install gyoku -v 1.3.1 \ +&& gem sources --clear-all + +# Remove gem cache and chocolatey +RUN powershell -Command "Remove-Item -Force C:\ruby26\lib\ruby\gems\2.6.0\cache\*.gem; Remove-Item -Recurse -Force 'C:\ProgramData\chocolatey'" SHELL ["powershell"] diff --git a/kubernetes/windows/baseimage/Dockerfile b/kubernetes/windows/baseimage/Dockerfile deleted file mode 100644 index 122daa9cc..000000000 --- a/kubernetes/windows/baseimage/Dockerfile +++ /dev/null @@ -1,28 +0,0 @@ -FROM mcr.microsoft.com/windows/servercore:ltsc2019 - -# Do not split this into multiple RUN! -# Docker creates a layer for every RUN-Statement -RUN powershell -Command "Set-ExecutionPolicy Bypass -Scope Process -Force; iex ((New-Object System.Net.WebClient).DownloadString('https://chocolatey.org/install.ps1'))" - -# Fluentd depends on cool.io whose fat gem is only available for Ruby < 2.5, so need to specify --platform ruby when install Ruby > 2.5 and install msys2 to get dev tools -RUN choco install -y ruby --version 2.6.5.1 --params "'/InstallDir:C:\ruby26'" \ -&& choco install -y msys2 --version 20190524.0.0.20191030 --params "'/NoPath /NoUpdate /InstallDir:C:\ruby26\msys64'" \ -&& choco install -y vim -RUN refreshenv \ -&& ridk install 2 3 \ -&& echo gem: --no-document >> C:\ProgramData\gemrc \ -&& gem install cool.io -v 1.5.4 --platform ruby \ -&& gem install oj -v 3.3.10 \ -&& gem install json -v 2.2.0 \ -&& gem install fluentd -v 1.10.2 \ -&& gem install win32-service -v 1.0.1 \ -&& gem install win32-ipc -v 0.7.0 \ -&& gem install win32-event -v 0.6.3 \ -&& gem install windows-pr -v 1.2.6 \ -&& gem install tomlrb -v 1.3.0 \ -&& gem install gyoku -v 1.3.1 \ -&& gem sources --clear-all - -# Remove gem cache and chocolatey -RUN powershell -Command "Remove-Item -Force C:\ruby26\lib\ruby\gems\2.6.0\cache\*.gem; Remove-Item -Recurse -Force 'C:\ProgramData\chocolatey'" - From c5c28f0dc4f89893aea4215c6fd5647b904c4c92 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Thu, 13 Aug 2020 11:00:19 -0700 Subject: [PATCH 006/194] Gangams/cluster creation scripts (#414) * onprem k8s script * script updates * scripts for creating non-aks clusters * fix minor text update * updates * script updates * fix * script updates * fix scripts to install docker --- scripts/cluster-creation/README.md | 45 +++++ scripts/cluster-creation/aks-engine.sh | 163 
+++++++++++++++++ scripts/cluster-creation/arc-k8s-cluster.sh | 190 ++++++++++++++++++++ scripts/cluster-creation/aro-v4.sh | 146 +++++++++++++++ scripts/cluster-creation/onprem-k8s.sh | 106 +++++++++++ 5 files changed, 650 insertions(+) create mode 100644 scripts/cluster-creation/README.md create mode 100644 scripts/cluster-creation/aks-engine.sh create mode 100644 scripts/cluster-creation/arc-k8s-cluster.sh create mode 100644 scripts/cluster-creation/aro-v4.sh create mode 100755 scripts/cluster-creation/onprem-k8s.sh diff --git a/scripts/cluster-creation/README.md b/scripts/cluster-creation/README.md new file mode 100644 index 000000000..57d0c5dbf --- /dev/null +++ b/scripts/cluster-creation/README.md @@ -0,0 +1,45 @@ +# Instructions to create k8s clusters + +## On-Prem K8s Cluster + +An on-prem k8s cluster can be created on any VM or physical machine using kind. + +``` +bash onprem-k8s.sh --cluster-name +``` + +## AKS-Engine cluster + +aks-engine is an unmanaged cluster in Azure; you can use the command below to create the cluster in Azure. + +``` + +# Either reuse an existing service principal or create one with the instructions below +subscriptionId="" +az account set -s ${subscriptionId} +sp=$(az ad sp create-for-rbac --role="Contributor" --scopes="/subscriptions/${subscriptionId}") +# get the appId (i.e. clientId) and password (i.e. clientSecret) +echo $sp + +clientId=$(echo $sp | jq '.appId') +clientSecret=$(echo $sp | jq '.password') + +# create the aks-engine cluster +bash aks-engine.sh --subscription-id "" --client-id "" --client-secret "" --dns-prefix "" --location "" +``` + +## ARO v4 Cluster + +An Azure Red Hat OpenShift v4 cluster can be created with the command below. + +> Note: Because of the cleanup policy on internal subscriptions, cluster creation can fail if you don't change the cleanup service to none on the subnets of the ARO vnet before creation. +``` +bash aro-v4.sh --subscription-id "" --resource-group "" --cluster-name "" --location "" +``` +## Azure Arc K8s cluster + +You can connect an on-prem k8s cluster or an unmanaged k8s cluster such as aks-engine to Azure through Azure Arc.
+ +``` +bash arc-k8s-cluster.sh --subscription-id "" --resource-group "" --cluster-name "" --location "" --kube-context "" +``` diff --git a/scripts/cluster-creation/aks-engine.sh b/scripts/cluster-creation/aks-engine.sh new file mode 100644 index 000000000..9d287ea07 --- /dev/null +++ b/scripts/cluster-creation/aks-engine.sh @@ -0,0 +1,163 @@ +#!/bin/bash +set -e +TEMP_DIR=temp-$RANDOM +DEFAULT_ONPREM_K8S_CLUSTER="aks-engine-k8s-test" +AKS_ENGINE_VERSION="v0.54.0" + +download-aks-engine() +{ + sudo curl -LO https://github.com/Azure/aks-engine/releases/download/${AKS_ENGINE_VERSION}/aks-engine-v0.54.0-linux-amd64.tar.gz + sudo tar -xvf aks-engine-${AKS_ENGINE_VERSION}-linux-amd64.tar.gz + sudo mv aks-engine-${AKS_ENGINE_VERSION}-linux-amd64 aks-engine + sudo mv -f aks-engine/aks-engine /usr/local/bin +} + + +usage() +{ + local basename=`basename $0` + echo + echo "create aks-engine cluster:" + echo "$basename deploy --subscription-id --client-id --client-secret --dns-prefix --location " +} + +parse_args() +{ + + if [ $# -le 1 ] + then + usage + exit 1 + fi + +# Transform long options to short ones +for arg in "$@"; do + shift + case "$arg" in + "--subscription-id") set -- "$@" "-s" ;; + "--client-id") set -- "$@" "-c" ;; + "--client-secret") set -- "$@" "-w" ;; + "--dns-prefix") set -- "$@" "-d" ;; + "--location") set -- "$@" "-l" ;; + "--"*) usage ;; + *) set -- "$@" "$arg" + esac +done + +local OPTIND opt + +while getopts 'hs:c:w:d:l:' opt; do + case "$opt" in + h) + usage + ;; + + s) + subscriptionId="$OPTARG" + echo "subscriptionId is $OPTARG" + ;; + + c) + clientId="$OPTARG" + echo "clientId is $OPTARG" + ;; + + w) + clientSecret="$OPTARG" + echo "clientSecret is $OPTARG" + ;; + + d) + dnsPrefix="$OPTARG" + echo "dnsPrefix is $OPTARG" + ;; + + l) + location="$OPTARG" + echo "location is $OPTARG" + ;; + + ?) + usage + exit 1 + ;; + esac + done + shift "$(($OPTIND -1))" + + +} +create_cluster() +{ + +sudo touch kubernetes.json +sudo chmod 777 kubernetes.json +# For docker runtime, remove kubernetesConfig block +cat >> kubernetes.json < --resource-group --cluster-name --location --kube-context " +} + +parse_args() +{ + + if [ $# -le 1 ] + then + usage + exit 1 + fi + +# Transform long options to short ones +for arg in "$@"; do + shift + case "$arg" in + "--subscription-id") set -- "$@" "-s" ;; + "--resource-group") set -- "$@" "-r" ;; + "--cluster-name") set -- "$@" "-c" ;; + "--location") set -- "$@" "-l" ;; + "--kube-context") set -- "$@" "-k" ;; + "--"*) usage ;; + *) set -- "$@" "$arg" + esac +done + +local OPTIND opt + +while getopts 'hs:r:c:l:k:' opt; do + case "$opt" in + h) + usage + ;; + + s) + subscriptionId="$OPTARG" + echo "subscriptionId is $OPTARG" + ;; + + r) + resourceGroupName="$OPTARG" + echo "resourceGroupName is $OPTARG" + ;; + + c) + clusterName="$OPTARG" + echo "clusterName is $OPTARG" + ;; + + l) + location="$OPTARG" + echo "location is $OPTARG" + ;; + + k) + kubecontext="$OPTARG" + echo "kubecontext is $OPTARG" + ;; + + ?) 
+ usage + exit 1 + ;; + esac + done + shift "$(($OPTIND -1))" + + +} + +connect_azure_arc_k8s() +{ + + echo "create resource group: ${resourceGroupName} if it doesn't exist" + isrgExists=$(az group exists -g ${resourceGroupName}) + if $isrgExists; then + echo "resource group: ${resourceGroupName} already exists" + else + echo "creating resource group ${resourceGroupName} in region ${location} since it doesn't exist" + az group create -l ${location} -n ${resourceGroupName} + fi + + echo "connecting k8s cluster with kube-context : ${kubecontext} to azure with clustername: ${clusterName} and resourcegroup: ${resourceGroupName} ..." + az connectedk8s connect --name ${clusterName} --resource-group ${resourceGroupName} + echo "connecting k8s cluster with kube-context : ${kubecontext} to azure with clustername: ${clusterName} and resourcegroup: ${resourceGroupName} completed." +} + + + +echo "connecting k8s cluster to azure arc..." +echo "HELM version: ${HELM_VERSION}" +cd ~ +echo "creating temp directory":$TEMP_DIR +sudo mkdir $TEMP_DIR && cd $TEMP_DIR + +echo "validate args" +parse_args $@ + +echo "set the ${DefaultCloud} for azure cli" +az cloud set -n $DefaultCloud + +echo "login to azure cli" +az login --use-device-code + +echo "set the subscription ${subscriptionId} for cli" +az account set -s $subscriptionId + +echo "installing helm client ..." +install-helm +echo "installing helm client completed." + +echo "installing azure cli ..." +download-and-install-azure-cli +echo "installing azure cli completed." + +echo "installing arc k8s extensions and pre-requisites ..." +install_arc_k8s_prerequisites +echo "installing arc k8s extensions and pre-requisites completed." + +echo "connecting cluster to azure arc k8s via azure arc " +connect_azure_arc_k8s +echo "connecting cluster to azure arc k8s via azure arc completed." + +echo "connecting k8s cluster to azure arc completed."
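As a hedged follow-up to arc-k8s-cluster.sh above, one way to confirm the Arc connection succeeded; the resource names are placeholders, and the azure-arc namespace is where the Arc agents are deployed:

```bash
#!/usr/bin/env bash
# Verify the connected-cluster resource and the Arc agent pods after the script completes.
set -euo pipefail

az connectedk8s show --name "<cluster-name>" --resource-group "<resource-group>" \
  --query provisioningState -o tsv   # expect "Succeeded"
kubectl get pods -n azure-arc        # Arc agent pods should be Running
```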
diff --git a/scripts/cluster-creation/aro-v4.sh b/scripts/cluster-creation/aro-v4.sh new file mode 100644 index 000000000..8540ae931 --- /dev/null +++ b/scripts/cluster-creation/aro-v4.sh @@ -0,0 +1,146 @@ +#!/bin/bash +set -e +TEMP_DIR=temp-$RANDOM +DefaultCloud="AzureCloud" +DefaultVnetName="aro-net" +DefaultMasterSubnetName="master-subnet" +DefaultWorkerSubnetName="worker-subnet" + +download-and-install-azure-cli() +{ + # https://docs.microsoft.com/en-us/cli/azure/install-azure-cli-apt?view=azure-cli-latest#install-with-one-command + sudo curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash +} + +register_aro_v4_provider() +{ + echo "register Microsoft.RedHatOpenShift provider" + az provider register -n Microsoft.RedHatOpenShift --wait +} + +usage() +{ + local basename=`basename $0` + echo + echo "create aro v4 cluster:" + echo "$basename --subscription-id --resource-group --cluster-name --location " +} + +parse_args() +{ + + if [ $# -le 1 ] + then + usage + exit 1 + fi + +# Transform long options to short ones +for arg in "$@"; do + shift + case "$arg" in + "--subscription-id") set -- "$@" "-s" ;; + "--resource-group") set -- "$@" "-r" ;; + "--cluster-name") set -- "$@" "-c" ;; + "--location") set -- "$@" "-l" ;; + "--"*) usage ;; + *) set -- "$@" "$arg" + esac +done + +local OPTIND opt + +while getopts 'hs:r:c:l:' opt; do + case "$opt" in + h) + usage + ;; + + s) + subscriptionId="$OPTARG" + echo "subscriptionId is $OPTARG" + ;; + + r) + resourceGroupName="$OPTARG" + echo "resourceGroupName is $OPTARG" + ;; + + c) + clusterName="$OPTARG" + echo "clusterName is $OPTARG" + ;; + + l) + location="$OPTARG" + echo "location is $OPTARG" + ;; + + ?) + usage + exit 1 + ;; + esac + done + shift "$(($OPTIND -1))" +} + +create_aro_v4_cluster() +{ + + echo "create resource group: ${resourceGroupName} if it doesn't exist" + isrgExists=$(az group exists -g ${resourceGroupName}) + if $isrgExists; then + echo "resource group: ${resourceGroupName} already exists" + else + echo "creating resource group ${resourceGroupName} in region ${location} since it doesn't exist" + az group create -l ${location} -n ${resourceGroupName} + fi + + echo "creating virtual network" + az network vnet create --resource-group ${resourceGroupName} --name ${DefaultVnetName} --address-prefixes 10.0.0.0/22 + + echo "adding empty subnet for master nodes" + az network vnet subnet create --resource-group ${resourceGroupName} --vnet-name ${DefaultVnetName} --name ${DefaultMasterSubnetName} --address-prefixes 10.0.0.0/23 --service-endpoints Microsoft.ContainerRegistry + + echo "adding empty subnet for worker nodes" + az network vnet subnet create --resource-group ${resourceGroupName} --vnet-name ${DefaultVnetName} --name ${DefaultWorkerSubnetName} --address-prefixes 10.0.2.0/23 --service-endpoints Microsoft.ContainerRegistry + + echo "Please make sure to disable the cleanup service on the subnet NSGs of the ARO vnet for internal subscriptions" + sleep 1m + + echo "Disable subnet private endpoint policies on the master subnet" + az network vnet subnet update --name ${DefaultMasterSubnetName} --resource-group ${resourceGroupName} --vnet-name ${DefaultVnetName} --disable-private-link-service-network-policies true + + echo "creating ARO v4 cluster" + az aro create --resource-group ${resourceGroupName} --name ${clusterName} --vnet ${DefaultVnetName} --master-subnet ${DefaultMasterSubnetName} --worker-subnet ${DefaultWorkerSubnetName} + +} + + +echo "creating aro v4 cluster in specified azure subscription and resource group..."
+cd ~ +echo "creating temp directory":$TEMP_DIR +sudo mkdir $TEMP_DIR && cd $TEMP_DIR + +echo "validate args" +parse_args $@ + +echo "set the ${DefaultCloud} for azure cli" +az cloud set -n $DefaultCloud + +echo "login to azure cli" +az login --use-device-code + +echo "set the subscription ${subscriptionId} for cli" +az account set -s $subscriptionId + +echo "installing azure cli ..." +download-and-install-azure-cli +echo "installing azure cli completed." + +echo "creating aro v4 cluster ..." +create_aro_v4_cluster +echo "creating aro v4 cluster completed." + +echo "creating aro v4 cluster in specified azure subscription and resource completed." diff --git a/scripts/cluster-creation/onprem-k8s.sh b/scripts/cluster-creation/onprem-k8s.sh new file mode 100755 index 000000000..147681133 --- /dev/null +++ b/scripts/cluster-creation/onprem-k8s.sh @@ -0,0 +1,106 @@ +#!/bin/bash +set -e +TEMP_DIR=temp-$RANDOM +KIND_VERSION="v0.8.1" + +install-kind() +{ +sudo curl -Lo ./kind https://kind.sigs.k8s.io/dl/${KIND_VERSION}/kind-linux-amd64 +sudo chmod +x ./kind +sudo mv ./kind /usr/local/bin/kind +} + +download_install_docker() +{ + echo "download docker script" + sudo curl -L https://get.docker.com/ -o get-docker.sh + echo "installing docker script" + sudo sh get-docker.sh + + echo "add user to docker group" + sudo usermod -aG docker $USER + +} + +create_cluster() +{ +sudo touch kind-config.yaml +sudo chmod 777 kind-config.yaml +cat >> kind-config.yaml < " +} + +parse_args() +{ + + if [ $# -le 1 ] + then + usage + exit 1 + fi + +# Transform long options to short ones +for arg in "$@"; do + shift + case "$arg" in + "--cluster-name") set -- "$@" "-c" ;; + "--"*) usage ;; + *) set -- "$@" "$arg" + esac +done + +local OPTIND opt + +while getopts 'hc:' opt; do + case "$opt" in + h) + usage + ;; + + c) + clusterName="$OPTARG" + echo "clusterName is $OPTARG" + ;; + + ?) + usage + exit 1 + ;; + esac + done + shift "$(($OPTIND -1))" +} + +echo "creating kind k8 cluster ..." +echo "KIND version: ${KIND_VERSION}" +cd ~ +echo "creating temp directory":$TEMP_DIR +sudo mkdir $TEMP_DIR && cd $TEMP_DIR + +echo "parsing args" +parse_args $@ + +echo "download and install docker" +download_install_docker + +echo "download and install kind" +install-kind + +echo "creating cluster: ${clusterName}" +create_cluster + +echo "creating kind k8 cluster completed." 
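As a quick hedged sanity check after onprem-k8s.sh finishes: the context name follows kind's kind-<cluster-name> convention, and the cluster-name placeholder below is yours to substitute:

```bash
#!/usr/bin/env bash
# Confirm the kind cluster created by onprem-k8s.sh is up and its nodes are Ready.
set -euo pipefail

kind get clusters                                     # should list the name passed via --cluster-name
kubectl cluster-info --context "kind-<cluster-name>"  # kind prefixes kube contexts with "kind-"
kubectl get nodes -o wide
```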
From d7a3750107e6c8778f13dccb8d20767348a68292 Mon Sep 17 00:00:00 2001 From: bragi92 Date: Fri, 14 Aug 2020 13:00:30 -0700 Subject: [PATCH 007/194] fix: Pin to a particular version of ltsc2019 by SHA (#427) --- kubernetes/windows/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kubernetes/windows/Dockerfile b/kubernetes/windows/Dockerfile index a18404772..c8162b539 100644 --- a/kubernetes/windows/Dockerfile +++ b/kubernetes/windows/Dockerfile @@ -1,4 +1,4 @@ -FROM mcr.microsoft.com/windows/servercore:ltsc2019 +FROM mcr.microsoft.com/windows/servercore@sha256:921bed01c2a023310bdbaa288edebd82c4910e536ff206b87e9cbe703ca27505 MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="Azure Monitor for containers" From 5e8de91534c59a9bff4d786f2085195dca67392d Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Fri, 14 Aug 2020 14:17:53 -0700 Subject: [PATCH 008/194] enable collecting npm metrics (optionally) (#425) * enable collecting npm metrics (optionally) * fix default enrichment value * fix adx --- build/linux/installer/conf/telegraf-rs.conf | 42 +++++++ build/linux/installer/conf/telegraf.conf | 41 +++++++ .../installer/datafiles/base_container.data | 1 + .../scripts/tomlparser-npm-config.rb | 113 ++++++++++++++++++ kubernetes/container-azm-ms-agentconfig.yaml | 4 + kubernetes/linux/main.sh | 11 +- .../plugins/ruby/CAdvisorMetricsAPIClient.rb | 10 +- 7 files changed, 220 insertions(+), 2 deletions(-) create mode 100644 build/linux/installer/scripts/tomlparser-npm-config.rb diff --git a/build/linux/installer/conf/telegraf-rs.conf b/build/linux/installer/conf/telegraf-rs.conf index f1e9cc282..3f2f65cff 100644 --- a/build/linux/installer/conf/telegraf-rs.conf +++ b/build/linux/installer/conf/telegraf-rs.conf @@ -611,3 +611,45 @@ $AZMON_RS_PROM_PLUGINS_WITH_NAMESPACE_FILTER # Computer = "placeholder_hostname" # ControllerType = "$CONTROLLER_TYPE" +##npm +[[inputs.prometheus]] + #name_prefix="container.azm.ms/" + ## An array of urls to scrape metrics from. + urls = ["$AZMON_INTEGRATION_NPM_METRICS_URL_LIST_CLUSTER"] + fielddrop = ["$AZMON_INTEGRATION_NPM_METRICS_DROP_LIST_CLUSTER"] + + metric_version = 2 + url_tag = "scrapeUrl" + + ## An array of Kubernetes services to scrape metrics from. + # kubernetes_services = ["http://my-service-dns.my-namespace:9100/metrics"] + + ## Kubernetes config file to create client from. + # kube_config = "/path/to/kubernetes.config" + + ## Scrape Kubernetes pods for the following prometheus annotations: + ## - prometheus.io/scrape: Enable scraping for this pod + ## - prometheus.io/scheme: If the metrics endpoint is secured then you will need to + ## set this to `https` & most likely set the tls config. + ## - prometheus.io/path: If the metrics path is not /metrics, define it with this annotation. + ## - prometheus.io/port: If port is not 9102 use this annotation + # monitor_kubernetes_pods = true + + ## Use bearer token for authorization. 
('bearer_token' takes priority) + bearer_token = "/var/run/secrets/kubernetes.io/serviceaccount/token" + ## OR + # bearer_token_string = "abc_123" + + ## Specify timeout duration for slower prometheus clients (default is 3s) + response_timeout = "15s" + + ## Optional TLS Config + tls_ca = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + #tls_cert = /path/to/certfile + # tls_key = /path/to/keyfile + ## Use TLS but skip chain & host verification + insecure_skip_verify = true + #tagexclude = ["AgentVersion","AKS_RESOURCE_ID","ACS_RESOURCE_NAME", "Region", "ClusterName", "ClusterType", "Computer", "ControllerType"] + #[inputs.prometheus.tagpass] + # operation_type = ["create_container", "remove_container", "pull_image"] + diff --git a/build/linux/installer/conf/telegraf.conf b/build/linux/installer/conf/telegraf.conf index b554dd4b3..19b6058be 100644 --- a/build/linux/installer/conf/telegraf.conf +++ b/build/linux/installer/conf/telegraf.conf @@ -703,6 +703,47 @@ insecure_skip_verify = true #tagexclude = ["AgentVersion","AKS_RESOURCE_ID","ACS_RESOURCE_NAME", "Region", "ClusterName", "ClusterType", "Computer", "ControllerType"] +##npm +[[inputs.prometheus]] + #name_prefix="container.azm.ms/" + ## An array of urls to scrape metrics from. + urls = ["$AZMON_INTEGRATION_NPM_METRICS_URL_LIST_NODE"] + + metric_version = 2 + url_tag = "scrapeUrl" + + ## An array of Kubernetes services to scrape metrics from. + # kubernetes_services = ["http://my-service-dns.my-namespace:9100/metrics"] + + ## Kubernetes config file to create client from. + # kube_config = "/path/to/kubernetes.config" + + ## Scrape Kubernetes pods for the following prometheus annotations: + ## - prometheus.io/scrape: Enable scraping for this pod + ## - prometheus.io/scheme: If the metrics endpoint is secured then you will need to + ## set this to `https` & most likely set the tls config. + ## - prometheus.io/path: If the metrics path is not /metrics, define it with this annotation. + ## - prometheus.io/port: If port is not 9102 use this annotation + # monitor_kubernetes_pods = true + + ## Use bearer token for authorization. 
('bearer_token' takes priority) + bearer_token = "/var/run/secrets/kubernetes.io/serviceaccount/token" + ## OR + # bearer_token_string = "abc_123" + + ## Specify timeout duration for slower prometheus clients (default is 3s) + response_timeout = "15s" + + ## Optional TLS Config + tls_ca = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + #tls_cert = /path/to/certfile + # tls_key = /path/to/keyfile + ## Use TLS but skip chain & host verification + insecure_skip_verify = true + #tagexclude = ["AgentVersion","AKS_RESOURCE_ID","ACS_RESOURCE_NAME", "Region", "ClusterName", "ClusterType", "Computer", "ControllerType"] + #[inputs.prometheus.tagpass] + # operation_type = ["create_container", "remove_container", "pull_image"] + # [[inputs.exec]] # ## Commands array # interval = "15m" diff --git a/build/linux/installer/datafiles/base_container.data b/build/linux/installer/datafiles/base_container.data index f07e71b2d..fc5a6c8bc 100644 --- a/build/linux/installer/datafiles/base_container.data +++ b/build/linux/installer/datafiles/base_container.data @@ -125,6 +125,7 @@ MAINTAINER: 'Microsoft Corporation' /opt/tomlparser.rb; build/common/installer/scripts/tomlparser.rb; 755; root; root /opt/td-agent-bit-conf-customizer.rb; build/common/installer/scripts/td-agent-bit-conf-customizer.rb; 755; root; root /opt/ConfigParseErrorLogger.rb; build/common/installer/scripts/ConfigParseErrorLogger.rb; 755; root; root +/opt/tomlparser-npm-config.rb; build/linux/installer/scripts/tomlparser-npm-config.rb; 755; root; root /opt/microsoft/omsagent/plugin/filter_cadvisor_health_container.rb; source/plugins/ruby/filter_cadvisor_health_container.rb; 644; root; root diff --git a/build/linux/installer/scripts/tomlparser-npm-config.rb b/build/linux/installer/scripts/tomlparser-npm-config.rb new file mode 100644 index 000000000..c5953836b --- /dev/null +++ b/build/linux/installer/scripts/tomlparser-npm-config.rb @@ -0,0 +1,113 @@ +#!/usr/local/bin/ruby + +#this should be require relative in Linux and require in windows, since it is a gem install on windows +@os_type = ENV["OS_TYPE"] +if !@os_type.nil? && !@os_type.empty? 
&& @os_type.strip.casecmp("windows") == 0 + require "tomlrb" +else + require_relative "tomlrb" +end + +require_relative "ConfigParseErrorLogger" + +@configMapMountPath = "/etc/config/settings/integrations" +@configSchemaVersion = "" +@collect_basic_npm_metrics = false +@collect_advanced_npm_metrics = false +@npm_node_url="http://$NODE_IP:10091/node-metrics" +@npm_cluster_url="http://npm-metrics-cluster-service.kube-system:9000/cluster-metrics" +@npm_basic_drop_metrics_cluster = "npm_ipset_counts" + +# Use parser to parse the configmap toml file to a ruby structure +def parseConfigMap + begin + # Check to see if config map is created + if (File.file?(@configMapMountPath)) + puts "config::configmap container-azm-ms-agentconfig for npm metrics found, parsing values" + parsedConfig = Tomlrb.load_file(@configMapMountPath, symbolize_keys: true) + puts "config::Successfully parsed mounted config map for npm metrics" + return parsedConfig + else + puts "config::configmap container-azm-ms-agentconfig for npm metrics not mounted, using defaults" + return nil + end + rescue => errorStr + ConfigParseErrorLogger.logError("Exception while parsing config map for npm metrics: #{errorStr}, using defaults, please check config map for errors") + return nil + end +end + +# Use the ruby structure created after config parsing to set the right values to be used as environment variables +def populateSettingValuesFromConfigMap(parsedConfig) + begin + if !parsedConfig.nil? && !parsedConfig[:integrations].nil? && !parsedConfig[:integrations][:azure_network_policy_manager].nil? && !parsedConfig[:integrations][:azure_network_policy_manager][:collect_advanced_metrics].nil? + advanced_npm_metrics = parsedConfig[:integrations][:azure_network_policy_manager][:collect_advanced_metrics].to_s + puts "got:integrations.azure_network_policy_manager.collect_advanced_metrics='#{advanced_npm_metrics}'" + if !advanced_npm_metrics.nil? && advanced_npm_metrics.strip.casecmp("true") == 0 + @collect_advanced_npm_metrics = true + else + @collect_advanced_npm_metrics = false + end + puts "set:integrations.azure_network_policy_manager.collect_advanced_metrics=#{@collect_advanced_npm_metrics}" + end + rescue => errorStr + puts "config::error:Exception while reading config settings for npm advanced setting - #{errorStr}, using defaults" + @collect_advanced_npm_metrics = false + end + begin + if !parsedConfig.nil? && !parsedConfig[:integrations].nil? && !parsedConfig[:integrations][:azure_network_policy_manager].nil? && !parsedConfig[:integrations][:azure_network_policy_manager][:collect_basic_metrics].nil? + basic_npm_metrics = parsedConfig[:integrations][:azure_network_policy_manager][:collect_basic_metrics].to_s + puts "got:integrations.azure_network_policy_manager.collect_basic_metrics='#{basic_npm_metrics}'" + if !basic_npm_metrics.nil? && basic_npm_metrics.strip.casecmp("true") == 0 + @collect_basic_npm_metrics = true + else + @collect_basic_npm_metrics = false + end + puts "set:integrations.azure_network_policy_manager.collect_basic_metrics=#{@collect_basic_npm_metrics}" + end + rescue => errorStr + puts "config::error:Exception while reading config settings for npm basic setting - #{errorStr}, using defaults" + @collect_basic_npm_metrics = false + end +end + +@configSchemaVersion = ENV["AZMON_AGENT_CFG_SCHEMA_VERSION"] +puts "****************Start Config Processing********************" +if !@configSchemaVersion.nil? && !@configSchemaVersion.empty? 
&& @configSchemaVersion.strip.casecmp("v1") == 0 #note v1 is the only supported schema version , so hardcoding it + configMapSettings = parseConfigMap + if !configMapSettings.nil? + populateSettingValuesFromConfigMap(configMapSettings) + end +else + if (File.file?(@configMapMountPath)) + ConfigParseErrorLogger.logError("config::unsupported/missing config schema version - '#{@configSchemaVersion}' , using defaults, please use supported schema version") + end + @collect_basic_npm_metrics = false + @collect_advanced_npm_metrics = false +end + +# Write the settings to file, so that they can be set as environment variables +file = File.open("integration_npm_config_env_var", "w") + +if !file.nil? + if @collect_advanced_npm_metrics == true + file.write("export TELEMETRY_NPM_INTEGRATION_METRICS_ADVANCED=1\n") + file.write("export AZMON_INTEGRATION_NPM_METRICS_URL_LIST_NODE=#{@npm_node_url}\n") + file.write("export AZMON_INTEGRATION_NPM_METRICS_URL_LIST_CLUSTER=#{@npm_cluster_url}\n") + file.write("export AZMON_INTEGRATION_NPM_METRICS_DROP_LIST_CLUSTER=\n") + elsif @collect_basic_npm_metrics == true + file.write("export TELEMETRY_NPM_INTEGRATION_METRICS_BASIC=1\n") + file.write("export AZMON_INTEGRATION_NPM_METRICS_URL_LIST_NODE=#{@npm_node_url}\n") + file.write("export AZMON_INTEGRATION_NPM_METRICS_URL_LIST_CLUSTER=#{@npm_cluster_url}\n") + file.write("export AZMON_INTEGRATION_NPM_METRICS_DROP_LIST_CLUSTER=#{@npm_basic_drop_metrics_cluster}\n") + else + file.write("export AZMON_INTEGRATION_NPM_METRICS_URL_LIST_NODE=\n") + file.write("export AZMON_INTEGRATION_NPM_METRICS_URL_LIST_CLUSTER=\n") + file.write("export AZMON_INTEGRATION_NPM_METRICS_DROP_LIST_CLUSTER=\n") + end + # Close file after writing all environment variables + file.close +else + puts "Exception while opening file for writing config environment variables" + puts "****************End Config Processing********************" +end \ No newline at end of file diff --git a/kubernetes/container-azm-ms-agentconfig.yaml b/kubernetes/container-azm-ms-agentconfig.yaml index f3f442608..58e09f041 100644 --- a/kubernetes/container-azm-ms-agentconfig.yaml +++ b/kubernetes/container-azm-ms-agentconfig.yaml @@ -100,6 +100,10 @@ data: container_memory_rss_threshold_percentage = 95.0 # Threshold for container memoryWorkingSet, metric will be sent only when memory working set exceeds or becomes equal to the following percentage container_memory_working_set_threshold_percentage = 95.0 + integrations: |- + [integrations.azure_network_policy_manager] + collect_basic_metrics = false + collect_advanced_metrics = false metadata: name: container-azm-ms-agentconfig namespace: kube-system diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index 92f4977d6..311470660 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -160,7 +160,7 @@ done source config_env_var -#Parse the configmap to set the right environment variables. +#Parse the configmap to set the right environment variables for health feature. /opt/microsoft/omsagent/ruby/bin/ruby tomlparser-health-config.rb cat health_config_env_var | while read line; do @@ -169,6 +169,15 @@ cat health_config_env_var | while read line; do done source health_config_env_var +#Parse the configmap to set the right environment variables for network policy manager (npm) integration. 
+/opt/microsoft/omsagent/ruby/bin/ruby tomlparser-npm-config.rb + +cat integration_npm_config_env_var | while read line; do + #echo $line + echo $line >> ~/.bashrc +done +source integration_npm_config_env_var + #Replace the placeholders in td-agent-bit.conf file for fluentbit with custom/default values in daemonset if [ ! -e "/etc/config/kube.conf" ]; then /opt/microsoft/omsagent/ruby/bin/ruby td-agent-bit-conf-customizer.rb diff --git a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb index 42ecfcaf0..13796cd1e 100644 --- a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb +++ b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb @@ -33,6 +33,8 @@ class CAdvisorMetricsAPIClient @cAdvisorMetricsSecurePort = ENV["IS_SECURE_CADVISOR_PORT"] @containerLogsRoute = ENV["AZMON_CONTAINER_LOGS_ROUTE"] @hmEnabled = ENV["AZMON_CLUSTER_ENABLE_HEALTH_MODEL"] + @npmIntegrationBasic = ENV["TELEMETRY_NPM_INTEGRATION_METRICS_BASIC"] + @npmIntegrationAdvanced = ENV["TELEMETRY_NPM_INTEGRATION_METRICS_ADVANCED"] @LogPath = "/var/opt/microsoft/docker-cimprov/log/kubernetes_perf_log.txt" @Log = Logger.new(@LogPath, 2, 10 * 1048576) #keep last 2 files, max log file size = 10M @@ -250,7 +252,13 @@ def getContainerCpuMetricItems(metricJSON, hostName, cpuMetricNameToCollect, met #telemetry about health model if (!@hmEnabled.nil? && !@hmEnabled.empty?) telemetryProps["hmEnabled"] = @hmEnabled - end + end + #telemetry for npm integration + if (!@npmIntegrationAdvanced.nil? && !@npmIntegrationAdvanced.empty?) + telemetryProps["int-npm-a"] = "1" + elsif (!@npmIntegrationBasic.nil? && !@npmIntegrationBasic.empty?) + telemetryProps["int-npm-b"] = "1" + end ApplicationInsightsUtility.sendMetricTelemetry(metricNametoReturn, metricValue, telemetryProps) end end From 17e7ff8bf65c6fd3ab2dc2b47043249055e2dc3d Mon Sep 17 00:00:00 2001 From: saaror <31900410+saaror@users.noreply.github.com> Date: Mon, 17 Aug 2020 00:56:26 -0700 Subject: [PATCH 009/194] Saaror patch 3 (#426) * Create README.MD Creating content for Kubecon lab * Update README.MD * Update README.MD --- Kubecon/README.MD | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 Kubecon/README.MD diff --git a/Kubecon/README.MD b/Kubecon/README.MD new file mode 100644 index 000000000..873cfaf9a --- /dev/null +++ b/Kubecon/README.MD @@ -0,0 +1,36 @@ +# Kubecon Azure Monitor for containers lab + +## Overview + +### This Azure Monitor for containers lab will give you hands-on experience monitoring AKS workloads. In this lab you will be working with Azure Monitor, Log Analytics and Azure Monitor for containers (Container Insights). + +## Instructions for lab + +1. Set up the environment: [Setup Guide](https://github.com/rkuehfus/pre-ready-2019-H1/blob/master/Student/Guides/Deployment%20Setup%20Guide.docx?raw=true) + +2. Tasks for the lab + * From your Visual Studio Server, deploy the eShoponWeb application to AKS using Dev Spaces + * From Azure Monitor, locate the container running the eShoponWeb application + * Generate an exception in the eShoponWeb application (Hint: Try to change your password) + * Optimize the Azure Monitor for containers ingestion cost by fine-tuning log-collection parameters like std-out/std-error collection and namespace exclusions (a sample configmap snippet follows the Outcome section below). + +## Outcome + +### Understand Azure Monitor capabilities, facilitate an Azure Monitor customer conversation, and demo key features of Azure Monitor.
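For the cost-optimization task above, the relevant knobs live in the same container-azm-ms-agentconfig configmap that the npm integration settings earlier in this series extend. A minimal sketch, assuming the v1 agent settings schema; the namespace values shown are illustrative only:

```toml
[log_collection_settings]
   [log_collection_settings.stdout]
      enabled = true
      # excluding chatty system namespaces reduces ingested log volume and cost
      exclude_namespaces = ["kube-system"]
   [log_collection_settings.stderr]
      enabled = true
      exclude_namespaces = ["kube-system"]
```

Applying the edited configmap with `kubectl apply -f container-azm-ms-agentconfig.yaml` rolls the new collection settings out to the agent.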
+ +## Target Audience + +This content is targeted at DevOps/SRE engineers who want to build their knowledge of Azure Monitor; anyone with a passion for monitoring is also more than welcome to attend. + +## Prerequisites + 1. Please review the following content before the event + a. [Azure Monitor for containers Overview](https://docs.microsoft.com/azure/azure-monitor/insights/container-insights-overview) + b. [Optimize Azure Monitor for containers cost](https://medium.com/microsoftazure/azure-monitor-for-containers-optimizing-data-collection-settings-for-cost-ce6f848aca32) + +2. Attendees need access to an Azure subscription where they can each deploy the provided ARM template that will build a very detailed infrastructure to monitor. This includes the VNet, subnets, NSG(s), LB(s), NAT rules, scale sets and a fully functional .NET Core application (eShopOnWeb) to monitor. +3. Attendees should have a level 200-300 understanding of the Azure platform. Understand concepts like PowerShell, Azure CLI, ARM, resource groups, RBAC, network, storage, compute, scale sets, virtual machines and security. Previous experience working with ARM templates is recommended. +4. Access to a machine with Visual Studio Code and the Azure PowerShell Modules loaded or Azure CLI. VS Code ARM and PowerShell extensions should be configured. + +![alt text](https://raw.githubusercontent.com/rkuehfus/pre-ready-2019-H1/master/monitoringhackdiagram.png) + + From 6c7c6757b8c8cc87eaa89516393788d3d942857b Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 18 Aug 2020 11:53:59 -0700 Subject: [PATCH 010/194] Gangams/add containerd support to windows agent (#428) * wip * wip * wip * wip * bug fix related to uri * wip * wip * fix bug with ignore cert validation * logic to ignore cert validation * minor * fix minor debug log issue * improve log message * debug message * fix bug with nullorempty check * remove debug statements * refactor parsers * add debug message * clean up * chart updates * fix formatting issues --- .../installer/conf/fluent-cri-parser.conf | 6 + .../installer/conf/fluent-docker-parser.conf | 5 + build/windows/installer/conf/fluent.conf | 32 ++- .../templates/omsagent-daemonset-windows.yaml | 7 + kubernetes/omsagent.yaml | 4 + kubernetes/windows/Dockerfile | 3 + kubernetes/windows/main.ps1 | 199 ++++++++++++++---- 7 files changed, 198 insertions(+), 58 deletions(-) create mode 100644 build/windows/installer/conf/fluent-cri-parser.conf create mode 100644 build/windows/installer/conf/fluent-docker-parser.conf diff --git a/build/windows/installer/conf/fluent-cri-parser.conf b/build/windows/installer/conf/fluent-cri-parser.conf new file mode 100644 index 000000000..86f1572ca --- /dev/null +++ b/build/windows/installer/conf/fluent-cri-parser.conf @@ -0,0 +1,6 @@ + + @type regexp + expression ^(?
diff --git a/build/windows/installer/conf/fluent-docker-parser.conf b/build/windows/installer/conf/fluent-docker-parser.conf new file mode 100644 index 000000000..9dc800aeb --- /dev/null +++ b/build/windows/installer/conf/fluent-docker-parser.conf @@ -0,0 +1,5 @@ + + @type json + time_format %Y-%m-%dT%H:%M:%S.%NZ + keep_time_key true + diff --git a/build/windows/installer/conf/fluent.conf b/build/windows/installer/conf/fluent.conf index a4cacbcf6..c96300b1e 100644 --- a/build/windows/installer/conf/fluent.conf +++ b/build/windows/installer/conf/fluent.conf @@ -12,11 +12,8 @@ @log_level trace path_key tailed_path limit_recently_modified 5m - - @type json - time_format %Y-%m-%dT%H:%M:%S.%NZ - keep_time_key true - + # if the container runtime is non docker then this will be updated to fluent-cri-parser.conf during container startup + @include fluent-docker-parser.conf @@ -27,11 +24,8 @@ @log_level trace path_key tailed_path read_from_head true - - @type json - time_format %Y-%m-%dT%H:%M:%S.%NZ - keep_time_key true - + # if the container runtime is non docker then this will be updated to fluent-cri-parser.conf during container startup + @include fluent-docker-parser.conf @@ -59,13 +53,13 @@ - overflow_action throw_exception - chunk_limit_size 32k - queued_chunks_limit_size 256 - flush_interval 1 - flush_thread_interval 0.5 - flush_thread_burst_interval 0.01 - flush_thread_count 4 - retry_forever true - + overflow_action throw_exception + chunk_limit_size 32k + queued_chunks_limit_size 256 + flush_interval 1 + flush_thread_interval 0.5 + flush_thread_burst_interval 0.01 + flush_thread_count 4 + retry_forever true + diff --git a/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml b/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml index 0ea7a9af6..b8e667398 100644 --- a/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml @@ -53,6 +53,13 @@ spec: - name: CONTROLLER_TYPE value: "DaemonSet" - name: HOSTNAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: NODE_IP + valueFrom: + fieldRef: + fieldPath: status.hostIP volumeMounts: - mountPath: C:\ProgramData\docker\containers name: docker-windows-containers diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 29533e678..db788a37e 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -660,6 +660,10 @@ spec: valueFrom: fieldRef: fieldPath: spec.nodeName + - name: NODE_IP + valueFrom: + fieldRef: + fieldPath: status.hostIP volumeMounts: - mountPath: C:\ProgramData\docker\containers name: docker-windows-containers diff --git a/kubernetes/windows/Dockerfile b/kubernetes/windows/Dockerfile index c8162b539..06e11e73a 100644 --- a/kubernetes/windows/Dockerfile +++ b/kubernetes/windows/Dockerfile @@ -56,6 +56,9 @@ COPY ./omsagentwindows/out_oms.so /opt/omsagentwindows/out_oms.so # copy fluent, fluent-bit and out_oms conf files COPY ./omsagentwindows/installer/conf/fluent.conf /etc/fluent/ +# copy fluent docker and cri parser conf files +COPY ./omsagentwindows/installer/conf/fluent-cri-parser.conf /etc/fluent/ +COPY ./omsagentwindows/installer/conf/fluent-docker-parser.conf /etc/fluent/ COPY ./omsagentwindows/installer/conf/fluent-bit.conf /etc/fluent-bit COPY ./omsagentwindows/installer/conf/out_oms.conf /etc/omsagentwindows diff --git a/kubernetes/windows/main.ps1 b/kubernetes/windows/main.ps1 index b7ddfa8e7..de82722ad 100644 --- a/kubernetes/windows/main.ps1 +++ 
b/kubernetes/windows/main.ps1 @@ -1,34 +1,51 @@ -function Confirm-WindowsServiceExists($name) -{ - if (Get-Service $name -ErrorAction SilentlyContinue) +Add-Type @" + using System; + using System.Net; + using System.Net.Security; + using System.Security.Cryptography.X509Certificates; + public class ServerCertificateValidationCallback { + public static void Ignore() + { + ServicePointManager.ServerCertificateValidationCallback += + delegate + ( + Object obj, + X509Certificate certificate, + X509Chain chain, + SslPolicyErrors errors + ) + { + return true; + }; + } + } +"@ +function Confirm-WindowsServiceExists($name) { + if (Get-Service $name -ErrorAction SilentlyContinue) { return $true } return $false } -function Remove-WindowsServiceIfItExists($name) -{ +function Remove-WindowsServiceIfItExists($name) { $exists = Confirm-WindowsServiceExists $name - if ($exists) - { + if ($exists) { sc.exe \\server delete $name } } -function Start-FileSystemWatcher -{ +function Start-FileSystemWatcher { Start-Process powershell -NoNewWindow .\filesystemwatcher.ps1 } #register fluentd as a windows service -function Set-EnvironmentVariables -{ +function Set-EnvironmentVariables { $domain = "opinsights.azure.com" if (Test-Path /etc/omsagent-secret/DOMAIN) { # TODO: Change to omsagent-secret before merging - $domain = Get-Content /etc/omsagent-secret/DOMAIN + $domain = Get-Content /etc/omsagent-secret/DOMAIN } # Set DOMAIN @@ -38,7 +55,7 @@ function Set-EnvironmentVariables $wsID = "" if (Test-Path /etc/omsagent-secret/WSID) { # TODO: Change to omsagent-secret before merging - $wsID = Get-Content /etc/omsagent-secret/WSID + $wsID = Get-Content /etc/omsagent-secret/WSID } # Set DOMAIN @@ -48,7 +65,7 @@ function Set-EnvironmentVariables $wsKey = "" if (Test-Path /etc/omsagent-secret/KEY) { # TODO: Change to omsagent-secret before merging - $wsKey = Get-Content /etc/omsagent-secret/KEY + $wsKey = Get-Content /etc/omsagent-secret/KEY } # Set KEY @@ -58,7 +75,7 @@ function Set-EnvironmentVariables $proxy = "" if (Test-Path /etc/omsagent-secret/PROXY) { # TODO: Change to omsagent-secret before merging - $proxy = Get-Content /etc/omsagent-secret/PROXY + $proxy = Get-Content /etc/omsagent-secret/PROXY Write-Host "Validating the proxy configuration since proxy configuration provided" # valide the proxy endpoint configuration if (![string]::IsNullOrEmpty($proxy)) { @@ -66,26 +83,22 @@ function Set-EnvironmentVariables if (![string]::IsNullOrEmpty($proxy)) { $proxy = [string]$proxy.Trim(); $parts = $proxy -split "@" - if ($parts.Length -ne 2) - { + if ($parts.Length -ne 2) { Write-Host "Invalid ProxyConfiguration $($proxy). EXITING....." exit 1 } $subparts1 = $parts[0] -split "//" - if ($subparts1.Length -ne 2) - { + if ($subparts1.Length -ne 2) { Write-Host "Invalid ProxyConfiguration $($proxy). EXITING....." exit 1 } $protocol = $subparts1[0].ToLower().TrimEnd(":") - if (!($protocol -eq "http") -and !($protocol -eq "https")) - { + if (!($protocol -eq "http") -and !($protocol -eq "https")) { Write-Host "Unsupported protocol in ProxyConfiguration $($proxy). EXITING....." exit 1 } $subparts2 = $parts[1] -split ":" - if ($subparts2.Length -ne 2) - { + if ($subparts2.Length -ne 2) { Write-Host "Invalid ProxyConfiguration $($proxy). EXITING....." 
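# the validations above assume the proxy configuration is of the form
# http(s)://<user>:<password>@<host>:<port> - one "@" separating the credentials
# from the endpoint, "//" after the protocol, and a single ":" between host and port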
exit 1 } @@ -118,46 +131,154 @@ function Set-EnvironmentVariables .\setenv.ps1 } -function Start-Fluent -{ +function Get-ContainerRuntime { + # default container runtime and make default as containerd when containerd becomes default in AKS + $containerRuntime = "docker" + $response = "" + $NODE_IP = "" + try { + if (![string]::IsNullOrEmpty([System.Environment]::GetEnvironmentVariable("NODE_IP", "PROCESS"))) { + $NODE_IP = [System.Environment]::GetEnvironmentVariable("NODE_IP", "PROCESS") + } + elseif (![string]::IsNullOrEmpty([System.Environment]::GetEnvironmentVariable("NODE_IP", "USER"))) { + $NODE_IP = [System.Environment]::GetEnvironmentVariable("NODE_IP", "USER") + } + elseif (![string]::IsNullOrEmpty([System.Environment]::GetEnvironmentVariable("NODE_IP", "MACHINE"))) { + $NODE_IP = [System.Environment]::GetEnvironmentVariable("NODE_IP", "MACHINE") + } + + if (![string]::IsNullOrEmpty($NODE_IP)) { + $isPodsAPISuccess = $false + Write-Host "Value of NODE_IP environment variable : $($NODE_IP)" + try { + Write-Host "Making API call to http://$($NODE_IP):10255/pods" + $response = Invoke-WebRequest -uri http://$($NODE_IP):10255/pods -UseBasicParsing + Write-Host "Response status code of API call to http://$($NODE_IP):10255/pods : $($response.StatusCode)" + } + catch { + Write-Host "API call to http://$($NODE_IP):10255/pods failed" + } + + if (![string]::IsNullOrEmpty($response) -and $response.StatusCode -eq 200) { + Write-Host "API call to http://$($NODE_IP):10255/pods succeeded" + $isPodsAPISuccess = $true + } + else { + try { + Write-Host "Making API call to https://$($NODE_IP):10250/pods" + # ignore certificate validation since kubelet uses self-signed cert + [ServerCertificateValidationCallback]::Ignore() + $response = Invoke-WebRequest -Uri https://$($NODE_IP):10250/pods -Headers @{'Authorization' = "Bearer $(Get-Content /var/run/secrets/kubernetes.io/serviceaccount/token)" } -UseBasicParsing + Write-Host "Response status code of API call to https://$($NODE_IP):10250/pods : $($response.StatusCode)" + if (![string]::IsNullOrEmpty($response) -and $response.StatusCode -eq 200) { + Write-Host "API call to https://$($NODE_IP):10250/pods succeeded" + $isPodsAPISuccess = $true + } + } + catch { + Write-Host "API call to https://$($NODE_IP):10250/pods failed" + } + } + + if ($isPodsAPISuccess) { + if (![string]::IsNullOrEmpty($response.Content)) { + $podList = $response.Content | ConvertFrom-Json + if (![string]::IsNullOrEmpty($podList)) { + $podItems = $podList.Items + if ($podItems.Length -gt 0) { + Write-Host "found pod items: $($podItems.Length)" + for ($index = 0; $index -le $podItems.Length ; $index++) { + Write-Host "current podItem index : $($index)" + $pod = $podItems[$index] + if (![string]::IsNullOrEmpty($pod) -and + ![string]::IsNullOrEmpty($pod.status) -and + ![string]::IsNullOrEmpty($pod.status.phase) -and + $pod.status.phase -eq "Running" -and + $pod.status.ContainerStatuses.Length -gt 0) { + $containerID = $pod.status.ContainerStatuses[0].containerID + $detectedContainerRuntime = $containerID.split(":")[0].trim() + Write-Host "detected containerRuntime as : $($detectedContainerRuntime)" + if (![string]::IsNullOrEmpty($detectedContainerRuntime) -and [string]$detectedContainerRuntime.StartsWith('docker') -eq $false) { + $containerRuntime = $detectedContainerRuntime + } + Write-Host "using containerRuntime as : $($containerRuntime)" + break + } + } + } + else { + Write-Host "got podItems count is 0 hence using default container runtime: $($containerRuntime)" + } + + + } + else 
{ + Write-Host "got podList null or empty hence using default container runtime: $($containerRuntime)" + } + } + else { + Write-Host "got empty response content for /Pods API call hence using default container runtime: $($containerRuntime)" + } + } + } + else { + Write-Host "got empty NODE_IP environment variable" + } + # set CONTAINER_RUNTIME env for debug and telemetry purpose + [System.Environment]::SetEnvironmentVariable("CONTAINER_RUNTIME", $containerRuntime, "Process") + [System.Environment]::SetEnvironmentVariable("CONTAINER_RUNTIME", $containerRuntime, "Machine") + } + catch { + $e = $_.Exception + Write-Host $e + Write-Host "exception occured on getting container runtime hence using default container runtime: $($containerRuntime)" + } + + return $containerRuntime +} + +function Start-Fluent { + # Run fluent-bit service first so that we do not miss any logs being forwarded by the fluentd service. # Run fluent-bit as a background job. Switch this to a windows service once fluent-bit supports natively running as a windows service Start-Job -ScriptBlock { Start-Process -NoNewWindow -FilePath "C:\opt\fluent-bit\bin\fluent-bit.exe" -ArgumentList @("-c", "C:\etc\fluent-bit\fluent-bit.conf", "-e", "C:\opt\omsagentwindows\out_oms.so") } + $containerRuntime = Get-ContainerRuntime + #register fluentd as a service and start # there is a known issues with win32-service https://github.com/chef/win32-service/issues/70 + if (![string]::IsNullOrEmpty($containerRuntime) -and [string]$containerRuntime.StartsWith('docker') -eq $false) { + # change parser from docker to cri if the container runtime is not docker + Write-Host "changing parser from Docker to CRI since container runtime : $($containerRuntime) and which is non-docker" + (Get-Content -Path C:/etc/fluent/fluent.conf -Raw) -replace 'fluent-docker-parser.conf','fluent-cri-parser.conf' | Set-Content C:/etc/fluent/fluent.conf + } + fluentd --reg-winsvc i --reg-winsvc-auto-start --winsvc-name fluentdwinaks --reg-winsvc-fluentdopt '-c C:/etc/fluent/fluent.conf -o C:/etc/fluent/fluent.log' Notepad.exe | Out-Null } -function Generate-Certificates -{ +function Generate-Certificates { Write-Host "Generating Certificates" C:\\opt\\omsagentwindows\\certgenerator\\certificategenerator.exe } -function Test-CertificatePath -{ +function Test-CertificatePath { $certLocation = $env:CI_CERT_LOCATION - $keyLocation = $env:CI_KEY_LOCATION - if (!(Test-Path $certLocation)) - { + $keyLocation = $env:CI_KEY_LOCATION + if (!(Test-Path $certLocation)) { Write-Host "Certificate file not found at $($certLocation). EXITING....." exit 1 } - else - { + else { Write-Host "Certificate file found at $($certLocation)" } - if (! (Test-Path $keyLocation)) - { + if (! (Test-Path $keyLocation)) { Write-Host "Key file not found at $($keyLocation). EXITING...." exit 1 } - else - { + else { Write-Host "Key file found at $($keyLocation)" } } @@ -172,7 +293,7 @@ Test-CertificatePath Start-Fluent # List all powershell processes running. 
This should have main.ps1 and filesystemwatcher.ps1 -Get-WmiObject Win32_process | Where-Object {$_.Name -match 'powershell'} | Format-Table -Property Name, CommandLine, ProcessId +Get-WmiObject Win32_process | Where-Object { $_.Name -match 'powershell' } | Format-Table -Property Name, CommandLine, ProcessId #check if fluentd service is running Get-Service fluentdwinaks From bac8a32aa72b50a2e1ac1844404d7dbdb9ed4d04 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Wed, 19 Aug 2020 19:16:31 -0700 Subject: [PATCH 011/194] Gangams/arc k8s metrics (#413) * cluster identity token * wip * fix exception * fix exceptions * fix exception * fix bug * fix bug * minor update * refactor the code * more refactoring * fix bug * typo fix * fix typo * wait for 1min after token renewal request * add proxy support for arc k8s mdm endpoint * avoid additional get call * minor line ending fix * wip * have separate log for arc k8s cluster identity * fix bug on creating crd resource * remove update permission since not required * fixed some bugs * fix pr feedback * remove list since its not required --- README.md | 6 +- build/linux/Makefile | 2 +- .../installer/datafiles/base_container.data | 6 +- .../templates/omsagent-arc-k8s-crd.yaml | 9 + .../templates/omsagent-rbac.yaml | 8 + .../build-and-publish-docker-image.sh | 2 +- .../build-and-publish-docker-image.ps1 | 2 +- source/plugins/ruby/KubernetesApiClient.rb | 17 +- .../plugins/ruby/arc_k8s_cluster_identity.rb | 216 ++++++++++++++++++ source/plugins/ruby/out_mdm.rb | 61 +++-- 10 files changed, 307 insertions(+), 22 deletions(-) create mode 100644 charts/azuremonitor-containers/templates/omsagent-arc-k8s-crd.yaml create mode 100644 source/plugins/ruby/arc_k8s_cluster_identity.rb diff --git a/README.md b/README.md index 659fe0161..d5d874c9c 100644 --- a/README.md +++ b/README.md @@ -200,11 +200,15 @@ docker build -t /: --build-arg IMAGE_TAG= . 
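# pushing assumes you are already logged in to the target registry
# (for ACR: `docker login <your-acr>.azurecr.io`, where <your-acr> is a placeholder)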
docker push /: ``` -### Build Cert generator, Out OMS Plugun and Docker Image and Publish Docker Image +### Build Cert generator, Out OMS Plugin and Docker Image and Publish Docker Image If you have code cloned on to windows, you can built everything for windows agent on windows machine via below instructions ``` +# install pre-requisites if you havent installed already +cd %userprofile%\Docker-Provider\kubernetes\windows # based on your repo path +.\install-build-pre-requisites.ps1 + cd %userprofile%\Docker-Provider\kubernetes\windows\dockerbuild # based on your repo path docker login # if you want to publish the image to acr then login to acr via `docker login ` powershell -ExecutionPolicy bypass # switch to powershell if you are not on powershell already diff --git a/build/linux/Makefile b/build/linux/Makefile index 0a20ed205..3f35e1204 100644 --- a/build/linux/Makefile +++ b/build/linux/Makefile @@ -118,7 +118,7 @@ distclean : clean PROVIDER_STATUS: @echo "========================= Performing Building provider" @echo "clean up everything under: $(INTERMEDIATE_BASE_DIR) to avoid picking up old binaries" - $(RMDIR) $(INTERMEDIATE_BASE_DIR) + sudo $(RMDIR) $(INTERMEDIATE_BASE_DIR) KIT_STATUS: @echo "========================= Performing Building provider tests" diff --git a/build/linux/installer/datafiles/base_container.data b/build/linux/installer/datafiles/base_container.data index fc5a6c8bc..87b89b14c 100644 --- a/build/linux/installer/datafiles/base_container.data +++ b/build/linux/installer/datafiles/base_container.data @@ -50,7 +50,7 @@ MAINTAINER: 'Microsoft Corporation' /opt/microsoft/omsagent/plugin/kubernetes_container_inventory.rb; source/plugins/ruby/kubernetes_container_inventory.rb; 644; root; root /opt/microsoft/omsagent/plugin/proxy_utils.rb; source/plugins/ruby/proxy_utils.rb; 644; root; root - +/opt/microsoft/omsagent/plugin/arc_k8s_cluster_identity.rb; source/plugins/ruby/arc_k8s_cluster_identity.rb; 644; root; root /opt/microsoft/omsagent/plugin/out_mdm.rb; source/plugins/ruby/out_mdm.rb; 644; root; root /opt/microsoft/omsagent/plugin/filter_cadvisor2mdm.rb; source/plugins/ruby/filter_cadvisor2mdm.rb; 644; root; root /opt/microsoft/omsagent/plugin/filter_telegraf2mdm.rb; source/plugins/ruby/filter_telegraf2mdm.rb; 644; root; root @@ -276,6 +276,10 @@ touch /var/opt/microsoft/docker-cimprov/log/fluent_forward_failed.log chmod 666 /var/opt/microsoft/docker-cimprov/log/fluent_forward_failed.log chown omsagent:omiusers /var/opt/microsoft/docker-cimprov/log/fluent_forward_failed.log +touch /var/opt/microsoft/docker-cimprov/log/arc_k8s_cluster_identity.log +chmod 666 /var/opt/microsoft/docker-cimprov/log/arc_k8s_cluster_identity.log +chown omsagent:omiusers /var/opt/microsoft/docker-cimprov/log/arc_k8s_cluster_identity.log + mv /etc/opt/microsoft/docker-cimprov/container.conf /etc/opt/microsoft/omsagent/sysconf/omsagent.d/container.conf chown omsagent:omsagent /etc/opt/microsoft/omsagent/sysconf/omsagent.d/container.conf diff --git a/charts/azuremonitor-containers/templates/omsagent-arc-k8s-crd.yaml b/charts/azuremonitor-containers/templates/omsagent-arc-k8s-crd.yaml new file mode 100644 index 000000000..f7873de40 --- /dev/null +++ b/charts/azuremonitor-containers/templates/omsagent-arc-k8s-crd.yaml @@ -0,0 +1,9 @@ +{{- if contains "microsoft.kubernetes/connectedclusters" (.Values.omsagent.env.clusterId | lower) }} +apiVersion: clusterconfig.azure.com/v1beta1 +kind: AzureClusterIdentityRequest +metadata: + name: container-insights-clusteridentityrequest + namespace: azure-arc 
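# the audience below scopes the requested token to the Azure Monitor custom
# metrics endpoint - the same resource URL the out_mdm plugin posts metrics to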
+spec: + audience: https://monitoring.azure.com/ +{{- end }} diff --git a/charts/azuremonitor-containers/templates/omsagent-rbac.yaml b/charts/azuremonitor-containers/templates/omsagent-rbac.yaml index 9903f41ff..4f7408e7c 100644 --- a/charts/azuremonitor-containers/templates/omsagent-rbac.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-rbac.yaml @@ -27,8 +27,16 @@ rules: - apiGroups: ["azmon.container.insights"] resources: ["healthstates"] verbs: ["get", "create", "patch"] +- apiGroups: ["clusterconfig.azure.com"] + resources: ["azureclusteridentityrequests"] + resourceNames: ["container-insights-clusteridentityrequest"] + verbs: ["get", "create", "patch"] - nonResourceURLs: ["/metrics"] verbs: ["get"] +- apiGroups: [""] + resources: ["secrets"] + resourceNames: ["container-insights-clusteridentityrequest-token"] + verbs: ["get"] --- kind: ClusterRoleBinding apiVersion: rbac.authorization.k8s.io/v1beta1 diff --git a/kubernetes/linux/dockerbuild/build-and-publish-docker-image.sh b/kubernetes/linux/dockerbuild/build-and-publish-docker-image.sh index 982c8c491..267f15f32 100644 --- a/kubernetes/linux/dockerbuild/build-and-publish-docker-image.sh +++ b/kubernetes/linux/dockerbuild/build-and-publish-docker-image.sh @@ -127,7 +127,7 @@ baseDir=$(dirname $kubernetsDir) buildDir=$baseDir/build/linux dockerFileDir=$baseDir/kubernetes/linux -echo "sour code base directory: $baseDir" +echo "source code base directory: $baseDir" echo "build directory for docker provider: $buildDir" echo "docker file directory: $dockerFileDir" diff --git a/kubernetes/windows/dockerbuild/build-and-publish-docker-image.ps1 b/kubernetes/windows/dockerbuild/build-and-publish-docker-image.ps1 index 27be90d48..dbcfa6097 100644 --- a/kubernetes/windows/dockerbuild/build-and-publish-docker-image.ps1 +++ b/kubernetes/windows/dockerbuild/build-and-publish-docker-image.ps1 @@ -35,7 +35,7 @@ $imagerepo = $imageparts[0] if ($imagetag.StartsWith("win-") -eq $false) { Write-Host "adding win- prefix image tag since its not provided" - $imagetag = "win"-$imagetag + $imagetag = "win-$imagetag" } Write-Host "image tag used is :$imagetag" diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index 987d290aa..36dcdd8c6 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -99,7 +99,6 @@ def getResourceUri(resource, api_group) elsif api_group == @@ApiGroupHPA return "https://#{ENV["KUBERNETES_SERVICE_HOST"]}:#{ENV["KUBERNETES_PORT_443_TCP_PORT"]}/apis/" + @@ApiGroupHPA + "/" + @@ApiVersionHPA + "/" + resource end - else @Log.warn ("Kubernetes environment variable not set KUBERNETES_SERVICE_HOST: #{ENV["KUBERNETES_SERVICE_HOST"]} KUBERNETES_PORT_443_TCP_PORT: #{ENV["KUBERNETES_PORT_443_TCP_PORT"]}. Unable to form resourceUri") return nil @@ -743,7 +742,7 @@ def getResourcesAndContinuationToken(uri, api_group: nil) resourceInventory = nil begin @Log.info "KubernetesApiClient::getResourcesAndContinuationToken : Getting resources from Kube API using url: #{uri} @ #{Time.now.utc.iso8601}" - resourceInfo = getKubeResourceInfo(uri, api_group:api_group) + resourceInfo = getKubeResourceInfo(uri, api_group: api_group) @Log.info "KubernetesApiClient::getResourcesAndContinuationToken : Done getting resources from Kube API using url: #{uri} @ #{Time.now.utc.iso8601}" if !resourceInfo.nil? 
@Log.info "KubernetesApiClient::getResourcesAndContinuationToken:Start:Parsing data for #{uri} using yajl @ #{Time.now.utc.iso8601}" @@ -761,5 +760,19 @@ def getResourcesAndContinuationToken(uri, api_group: nil) end return continuationToken, resourceInventory end #getResourcesAndContinuationToken + + def getKubeAPIServerUrl + apiServerUrl = nil + begin + if ENV["KUBERNETES_SERVICE_HOST"] && ENV["KUBERNETES_PORT_443_TCP_PORT"] + apiServerUrl = "https://#{ENV["KUBERNETES_SERVICE_HOST"]}:#{ENV["KUBERNETES_PORT_443_TCP_PORT"]}" + else + @Log.warn "Kubernetes environment variable not set KUBERNETES_SERVICE_HOST: #{ENV["KUBERNETES_SERVICE_HOST"]} KUBERNETES_PORT_443_TCP_PORT: #{ENV["KUBERNETES_PORT_443_TCP_PORT"]}. Unable to form resourceUri" + end + rescue => errorStr + @Log.warn "KubernetesApiClient::getKubeAPIServerUrl:Failed #{errorStr}" + end + return apiServerUrl + end end end diff --git a/source/plugins/ruby/arc_k8s_cluster_identity.rb b/source/plugins/ruby/arc_k8s_cluster_identity.rb new file mode 100644 index 000000000..ef55c3257 --- /dev/null +++ b/source/plugins/ruby/arc_k8s_cluster_identity.rb @@ -0,0 +1,216 @@ +# frozen_string_literal: true +require "logger" +require "net/http" +require "net/https" +require "uri" +require "yajl/json_gem" +require "base64" +require "time" +require_relative "KubernetesApiClient" +require_relative "ApplicationInsightsUtility" + +class ArcK8sClusterIdentity + # this arc k8s crd version and arc k8s uses corresponding version v1beta1 vs v1 based on the k8s version for apiextensions.k8s.io + @@cluster_config_crd_api_version = "clusterconfig.azure.com/v1beta1" + @@cluster_identity_resource_name = "container-insights-clusteridentityrequest" + @@cluster_identity_resource_namespace = "azure-arc" + @@cluster_identity_token_secret_namespace = "azure-arc" + @@crd_resource_uri_template = "%{kube_api_server_url}/apis/%{cluster_config_crd_api_version}/namespaces/%{cluster_identity_resource_namespace}/azureclusteridentityrequests/%{cluster_identity_resource_name}" + @@secret_resource_uri_template = "%{kube_api_server_url}/api/v1/namespaces/%{cluster_identity_token_secret_namespace}/secrets/%{token_secret_name}" + @@azure_monitor_custom_metrics_audience = "https://monitoring.azure.com/" + @@cluster_identity_request_kind = "AzureClusterIdentityRequest" + + def initialize + @LogPath = "/var/opt/microsoft/docker-cimprov/log/arc_k8s_cluster_identity.log" + @log = Logger.new(@LogPath, 1, 5000000) + @log.info "initialize start @ #{Time.now.utc.iso8601}" + @token_expiry_time = Time.now + @cached_access_token = String.new + @token_file_path = "/var/run/secrets/kubernetes.io/serviceaccount/token" + @cert_file_path = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + @kube_api_server_url = KubernetesApiClient.getKubeAPIServerUrl + if @kube_api_server_url.nil? + @log.warn "got api server url nil from KubernetesApiClient.getKubeAPIServerUrl @ #{Time.now.utc.iso8601}" + end + @http_client = get_http_client + @service_account_token = get_service_account_token + @log.info "initialize complete @ #{Time.now.utc.iso8601}" + end + + def get_cluster_identity_token() + begin + # get the cluster msi identity token either if its empty or near expirty. Token is valid 24 hrs. + if @cached_access_token.to_s.empty? || (Time.now + 60 * 60 > @token_expiry_time) # Refresh token 1 hr from expiration + # renew the token if its near expiry + if !@cached_access_token.to_s.empty? 
&& (Time.now + 60 * 60 > @token_expiry_time) + @log.info "renewing the token since its near expiry @ #{Time.now.utc.iso8601}" + renew_near_expiry_token + # sleep 60 seconds to get the renewed token available + sleep 60 + end + @log.info "get token reference from crd @ #{Time.now.utc.iso8601}" + tokenReference = get_token_reference_from_crd + if !tokenReference.nil? && !tokenReference.empty? + @token_expiry_time = Time.parse(tokenReference["expirationTime"]) + token_secret_name = tokenReference["secretName"] + token_secret_data_name = tokenReference["dataName"] + # get the token from secret + @log.info "get token from secret @ #{Time.now.utc.iso8601}" + token = get_token_from_secret(token_secret_name, token_secret_data_name) + if !token.nil? + @cached_access_token = token + else + @log.warn "got token nil from secret: #{@token_secret_name}" + end + else + @log.warn "got token reference either nil or empty" + end + end + rescue => err + @log.warn "get_cluster_identity_token failed: #{err}" + ApplicationInsightsUtility.sendExceptionTelemetry(err, { "FeatureArea" => "MDM" }) + end + return @cached_access_token + end + + private + + def get_token_from_secret(token_secret_name, token_secret_data_name) + token = nil + begin + secret_request_uri = @@secret_resource_uri_template % { + kube_api_server_url: @kube_api_server_url, + cluster_identity_token_secret_namespace: @@cluster_identity_token_secret_namespace, + token_secret_name: token_secret_name, + } + get_request = Net::HTTP::Get.new(secret_request_uri) + get_request["Authorization"] = "Bearer #{@service_account_token}" + @log.info "Making GET request to #{secret_request_uri} @ #{Time.now.utc.iso8601}" + get_response = @http_client.request(get_request) + @log.info "Got response of #{get_response.code} for #{secret_request_uri} @ #{Time.now.utc.iso8601}" + if get_response.code.to_i == 200 + token_secret = JSON.parse(get_response.body)["data"] + cluster_identity_token = token_secret[token_secret_data_name] + token = Base64.decode64(cluster_identity_token) + end + rescue => err + @log.warn "get_token_from_secret API call failed: #{err}" + ApplicationInsightsUtility.sendExceptionTelemetry(err, { "FeatureArea" => "MDM" }) + end + return token + end + + private + + def get_token_reference_from_crd() + tokenReference = {} + begin + crd_request_uri = @@crd_resource_uri_template % { + kube_api_server_url: @kube_api_server_url, + cluster_config_crd_api_version: @@cluster_config_crd_api_version, + cluster_identity_resource_namespace: @@cluster_identity_resource_namespace, + cluster_identity_resource_name: @@cluster_identity_resource_name, + } + get_request = Net::HTTP::Get.new(crd_request_uri) + get_request["Authorization"] = "Bearer #{@service_account_token}" + @log.info "Making GET request to #{crd_request_uri} @ #{Time.now.utc.iso8601}" + get_response = @http_client.request(get_request) + @log.info "Got response of #{get_response.code} for #{crd_request_uri} @ #{Time.now.utc.iso8601}" + if get_response.code.to_i == 200 + status = JSON.parse(get_response.body)["status"] + tokenReference["expirationTime"] = status["expirationTime"] + tokenReference["secretName"] = status["tokenReference"]["secretName"] + tokenReference["dataName"] = status["tokenReference"]["dataName"] + end + rescue => err + @log.warn "get_token_reference_from_crd call failed: #{err}" + ApplicationInsightsUtility.sendExceptionTelemetry(err, { "FeatureArea" => "MDM" }) + end + return tokenReference + end + + private + + def renew_near_expiry_token() + begin + crd_request_uri = 
@@crd_resource_uri_template % { + kube_api_server_url: @kube_api_server_url, + cluster_config_crd_api_version: @@cluster_config_crd_api_version, + cluster_identity_resource_namespace: @@cluster_identity_resource_namespace, + cluster_identity_resource_name: @@cluster_identity_resource_name, + } + crd_request_body = get_crd_request_body + crd_request_body_json = crd_request_body.to_json + update_request = Net::HTTP::Patch.new(crd_request_uri) + update_request["Content-Type"] = "application/merge-patch+json" + update_request["Authorization"] = "Bearer #{@service_account_token}" + update_request.body = crd_request_body_json + update_response = @http_client.request(update_request) + @log.info "Got response of #{update_response.code} for PATCH #{crd_request_uri} @ #{Time.now.utc.iso8601}" + if update_response.code.to_i == 404 + @log.info "since crd resource doesnt exist since creating crd resource : #{@@cluster_identity_resource_name} @ #{Time.now.utc.iso8601}" + create_request = Net::HTTP::Post.new(crd_request_uri) + create_request["Content-Type"] = "application/json" + create_request["Authorization"] = "Bearer #{@service_account_token}" + create_request.body = crd_request_body_json + create_response = @http_client.request(create_request) + @log.info "Got response of #{create_response.code} for POST #{crd_request_uri} @ #{Time.now.utc.iso8601}" + end + rescue => err + @log.warn "renew_near_expiry_token call failed: #{err}" + ApplicationInsightsUtility.sendExceptionTelemetry(err, { "FeatureArea" => "MDM" }) + end + end + + private + + def get_service_account_token() + begin + if File.exist?(@token_file_path) && File.readable?(@token_file_path) + token_str = File.read(@token_file_path).strip + return token_str + else + @log.warn "Unable to read token string from #{@token_file_path}" + return nil + end + rescue => err + @log.warn "get_service_account_token call failed: #{err}" + ApplicationInsightsUtility.sendExceptionTelemetry(err, { "FeatureArea" => "MDM" }) + end + end + + private + + def get_http_client() + begin + base_api_server_url = URI.parse(@kube_api_server_url) + http = Net::HTTP.new(base_api_server_url.host, base_api_server_url.port) + http.use_ssl = true + if !File.exist?(@cert_file_path) + raise "#{@cert_file_path} doesnt exist" + else + http.ca_file = @cert_file_path + end + http.verify_mode = OpenSSL::SSL::VERIFY_PEER + return http + rescue => err + @log.warn "Unable to create http client #{err}" + ApplicationInsightsUtility.sendExceptionTelemetry(err, { "FeatureArea" => "MDM" }) + end + return nil + end + + private + + def get_crd_request_body + body = {} + body["apiVersion"] = @@cluster_config_crd_api_version + body["kind"] = @@cluster_identity_request_kind + body["metadata"] = {} + body["metadata"]["name"] = @@cluster_identity_resource_name + body["metadata"]["namespace"] = @@cluster_identity_resource_namespace + body["spec"] = {} + body["spec"]["audience"] = @@azure_monitor_custom_metrics_audience + return body + end +end diff --git a/source/plugins/ruby/out_mdm.rb b/source/plugins/ruby/out_mdm.rb index d801edb9a..b28c17034 100644 --- a/source/plugins/ruby/out_mdm.rb +++ b/source/plugins/ruby/out_mdm.rb @@ -16,6 +16,8 @@ def initialize require_relative "KubernetesApiClient" require_relative "ApplicationInsightsUtility" require_relative "constants" + require_relative "arc_k8s_cluster_identity" + require_relative "proxy_utils" @@token_resource_url = "https://monitoring.azure.com/" @@grant_type = "client_credentials" @@ -45,6 +47,8 @@ def initialize @useMsi = false 
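# Token source is decided in start(): Arc-connected clusters (resource id contains
# microsoft.kubernetes/connectedclusters) use the ArcK8sClusterIdentity CRD token;
# otherwise SP credentials are used when configured, falling back to MSI.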
@metrics_flushed_count = 0 + @cluster_identity = nil + @isArcK8sCluster = false @get_access_token_backoff_expiry = Time.now end @@ -76,28 +80,48 @@ def start if @can_send_data_to_mdm @log.info "MDM Metrics supported in #{aks_region} region" + if aks_resource_id.downcase.include?("microsoft.kubernetes/connectedclusters") + @isArcK8sCluster = true + end @@post_request_url = @@post_request_url_template % { aks_region: aks_region, aks_resource_id: aks_resource_id } @post_request_uri = URI.parse(@@post_request_url) - @http_client = Net::HTTP.new(@post_request_uri.host, @post_request_uri.port) + if (!!@isArcK8sCluster) + proxy = (ProxyUtils.getProxyConfiguration) + if proxy.nil? || proxy.empty? + @http_client = Net::HTTP.new(@post_request_uri.host, @post_request_uri.port) + else + @log.info "Proxy configured on this cluster: #{aks_resource_id}" + @http_client = Net::HTTP.new(@post_request_uri.host, @post_request_uri.port, proxy[:addr], proxy[:port], proxy[:user], proxy[:pass]) + end + else + @http_client = Net::HTTP.new(@post_request_uri.host, @post_request_uri.port) + end @http_client.use_ssl = true @log.info "POST Request url: #{@@post_request_url}" ApplicationInsightsUtility.sendCustomEvent("AKSCustomMetricsMDMPluginStart", {}) - # Check to see if SP exists, if it does use SP. Else, use msi - sp_client_id = @data_hash["aadClientId"] - sp_client_secret = @data_hash["aadClientSecret"] - - if (!sp_client_id.nil? && !sp_client_id.empty? && sp_client_id.downcase != "msi") - @useMsi = false - aad_token_url = @@aad_token_url_template % { tenant_id: @data_hash["tenantId"] } - @parsed_token_uri = URI.parse(aad_token_url) + # arc k8s cluster uses cluster identity + if (!!@isArcK8sCluster) + @log.info "using cluster identity token since cluster is azure arc k8s cluster" + @cluster_identity = ArcK8sClusterIdentity.new + @cached_access_token = @cluster_identity.get_cluster_identity_token else - @useMsi = true - msi_endpoint = @@msi_endpoint_template % { user_assigned_client_id: @@user_assigned_client_id, resource: @@token_resource_url } - @parsed_token_uri = URI.parse(msi_endpoint) - end + # Check to see if SP exists, if it does use SP. Else, use msi + sp_client_id = @data_hash["aadClientId"] + sp_client_secret = @data_hash["aadClientSecret"] + + if (!sp_client_id.nil? && !sp_client_id.empty? && sp_client_id.downcase != "msi") + @useMsi = false + aad_token_url = @@aad_token_url_template % { tenant_id: @data_hash["tenantId"] } + @parsed_token_uri = URI.parse(aad_token_url) + else + @useMsi = true + msi_endpoint = @@msi_endpoint_template % { user_assigned_client_id: @@user_assigned_client_id, resource: @@token_resource_url } + @parsed_token_uri = URI.parse(msi_endpoint) + end - @cached_access_token = get_access_token + @cached_access_token = get_access_token + end end rescue => e @log.info "exception when initializing out_mdm #{e}" @@ -226,7 +250,14 @@ def write(chunk) def send_to_mdm(post_body) begin - access_token = get_access_token + if (!!@isArcK8sCluster) + if @cluster_identity.nil? 
+ @cluster_identity = ArcK8sClusterIdentity.new + end + access_token = @cluster_identity.get_cluster_identity_token + else + access_token = get_access_token + end request = Net::HTTP::Post.new(@post_request_uri.request_uri) request["Content-Type"] = "application/x-ndjson" request["Authorization"] = "Bearer #{access_token}" From ab03640d2314b1e37a8a248c086b40adf5a2dbe4 Mon Sep 17 00:00:00 2001 From: bragi92 Date: Thu, 20 Aug 2020 17:51:57 -0700 Subject: [PATCH 012/194] fix: Reverting back to ltsc2019 tag (#429) --- kubernetes/windows/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kubernetes/windows/Dockerfile b/kubernetes/windows/Dockerfile index 06e11e73a..70a5f6045 100644 --- a/kubernetes/windows/Dockerfile +++ b/kubernetes/windows/Dockerfile @@ -1,4 +1,4 @@ -FROM mcr.microsoft.com/windows/servercore@sha256:921bed01c2a023310bdbaa288edebd82c4910e536ff206b87e9cbe703ca27505 +FROM mcr.microsoft.com/windows/servercore:ltsc2019 MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="Azure Monitor for containers" From af0f98176fb85c5cb2366b6927525867c217afeb Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Thu, 27 Aug 2020 16:44:26 -0700 Subject: [PATCH 013/194] more kubelet metrics (#430) * more kubelet metrics * celan up new config --- build/linux/installer/conf/telegraf.conf | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/build/linux/installer/conf/telegraf.conf b/build/linux/installer/conf/telegraf.conf index 19b6058be..28a74a3d0 100644 --- a/build/linux/installer/conf/telegraf.conf +++ b/build/linux/installer/conf/telegraf.conf @@ -627,6 +627,7 @@ # ACSResourceName = "$TELEMETRY_ACS_RESOURCE_NAME" # Region = "$TELEMETRY_AKS_REGION" +#kubelet-1 [[inputs.prometheus]] name_prefix="container.azm.ms/" ## An array of urls to scrape metrics from. @@ -669,6 +670,28 @@ [inputs.prometheus.tagpass] operation_type = ["create_container", "remove_container", "pull_image"] +#kubelet-2 +[[inputs.prometheus]] + name_prefix="container.azm.ms/" + ## An array of urls to scrape metrics from. + urls = ["$CADVISOR_METRICS_URL"] + + fieldpass = ["kubelet_running_pod_count","volume_manager_total_volumes", "kubelet_node_config_error", "process_resident_memory_bytes", "process_cpu_seconds_total"] + + metric_version = 2 + url_tag = "scrapeUrl" + + + ## Use bearer token for authorization. 
('bearer_token' takes priority) + bearer_token = "/var/run/secrets/kubernetes.io/serviceaccount/token" + ## Specify timeout duration for slower prometheus clients (default is 3s) + response_timeout = "15s" + + ## Optional TLS Config + tls_ca = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + insecure_skip_verify = true + + ## prometheus custom metrics [[inputs.prometheus]] From 7fc4d4cb03648a081dd9e0fceefc4b742e14021a Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Mon, 31 Aug 2020 18:55:20 -0700 Subject: [PATCH 014/194] fix nom issue when config is empty (#432) --- build/linux/installer/conf/telegraf-rs.conf | 4 +- build/linux/installer/conf/telegraf.conf | 2 +- .../scripts/tomlparser-npm-config.rb | 83 ++++++++++++------- 3 files changed, 56 insertions(+), 33 deletions(-) diff --git a/build/linux/installer/conf/telegraf-rs.conf b/build/linux/installer/conf/telegraf-rs.conf index 3f2f65cff..d81196330 100644 --- a/build/linux/installer/conf/telegraf-rs.conf +++ b/build/linux/installer/conf/telegraf-rs.conf @@ -615,8 +615,8 @@ $AZMON_RS_PROM_PLUGINS_WITH_NAMESPACE_FILTER [[inputs.prometheus]] #name_prefix="container.azm.ms/" ## An array of urls to scrape metrics from. - urls = ["$AZMON_INTEGRATION_NPM_METRICS_URL_LIST_CLUSTER"] - fielddrop = ["$AZMON_INTEGRATION_NPM_METRICS_DROP_LIST_CLUSTER"] + urls = $AZMON_INTEGRATION_NPM_METRICS_URL_LIST_CLUSTER + fielddrop = $AZMON_INTEGRATION_NPM_METRICS_DROP_LIST_CLUSTER metric_version = 2 url_tag = "scrapeUrl" diff --git a/build/linux/installer/conf/telegraf.conf b/build/linux/installer/conf/telegraf.conf index 28a74a3d0..013aa1af2 100644 --- a/build/linux/installer/conf/telegraf.conf +++ b/build/linux/installer/conf/telegraf.conf @@ -730,7 +730,7 @@ [[inputs.prometheus]] #name_prefix="container.azm.ms/" ## An array of urls to scrape metrics from. - urls = ["$AZMON_INTEGRATION_NPM_METRICS_URL_LIST_NODE"] + urls = $AZMON_INTEGRATION_NPM_METRICS_URL_LIST_NODE metric_version = 2 url_tag = "scrapeUrl" diff --git a/build/linux/installer/scripts/tomlparser-npm-config.rb b/build/linux/installer/scripts/tomlparser-npm-config.rb index c5953836b..777fef209 100644 --- a/build/linux/installer/scripts/tomlparser-npm-config.rb +++ b/build/linux/installer/scripts/tomlparser-npm-config.rb @@ -14,9 +14,13 @@ @configSchemaVersion = "" @collect_basic_npm_metrics = false @collect_advanced_npm_metrics = false -@npm_node_url="http://$NODE_IP:10091/node-metrics" -@npm_cluster_url="http://npm-metrics-cluster-service.kube-system:9000/cluster-metrics" -@npm_basic_drop_metrics_cluster = "npm_ipset_counts" +@npm_default_setting = "[]" +@npm_node_urls = "[\"http://$NODE_IP:10091/node-metrics\"]" +@npm_cluster_urls="[\"http://npm-metrics-cluster-service.kube-system:9000/cluster-metrics\"]" +@npm_basic_drop_metrics_cluster = "[\"npm_ipset_counts\"]" +@tgfConfigFileDS = "/etc/opt/microsoft/docker-cimprov/telegraf.conf" +@tgfConfigFileRS = "/etc/opt/microsoft/docker-cimprov/telegraf-rs.conf" +@replicaset = "replicaset" # Use parser to parse the configmap toml file to a ruby structure def parseConfigMap @@ -42,37 +46,37 @@ def populateSettingValuesFromConfigMap(parsedConfig) begin if !parsedConfig.nil? && !parsedConfig[:integrations].nil? && !parsedConfig[:integrations][:azure_network_policy_manager].nil? && !parsedConfig[:integrations][:azure_network_policy_manager][:collect_advanced_metrics].nil? 
advanced_npm_metrics = parsedConfig[:integrations][:azure_network_policy_manager][:collect_advanced_metrics].to_s - puts "got:integrations.azure_network_policy_manager.collect_advanced_metrics='#{advanced_npm_metrics}'" + puts "config::npm::got:integrations.azure_network_policy_manager.collect_advanced_metrics='#{advanced_npm_metrics}'" if !advanced_npm_metrics.nil? && advanced_npm_metrics.strip.casecmp("true") == 0 @collect_advanced_npm_metrics = true else @collect_advanced_npm_metrics = false end - puts "set:integrations.azure_network_policy_manager.collect_advanced_metrics=#{@collect_advanced_npm_metrics}" + puts "config::npm::set:integrations.azure_network_policy_manager.collect_advanced_metrics=#{@collect_advanced_npm_metrics}" end rescue => errorStr - puts "config::error:Exception while reading config settings for npm advanced setting - #{errorStr}, using defaults" + puts "config::npm::error:Exception while reading config settings for npm advanced setting - #{errorStr}, using defaults" @collect_advanced_npm_metrics = false end begin if !parsedConfig.nil? && !parsedConfig[:integrations].nil? && !parsedConfig[:integrations][:azure_network_policy_manager].nil? && !parsedConfig[:integrations][:azure_network_policy_manager][:collect_basic_metrics].nil? basic_npm_metrics = parsedConfig[:integrations][:azure_network_policy_manager][:collect_basic_metrics].to_s - puts "got:integrations.azure_network_policy_manager.collect_basic_metrics='#{basic_npm_metrics}'" + puts "config::npm::got:integrations.azure_network_policy_manager.collect_basic_metrics='#{basic_npm_metrics}'" if !basic_npm_metrics.nil? && basic_npm_metrics.strip.casecmp("true") == 0 @collect_basic_npm_metrics = true else @collect_basic_npm_metrics = false end - puts "set:integrations.azure_network_policy_manager.collect_basic_metrics=#{@collect_basic_npm_metrics}" + puts "config::npm::set:integrations.azure_network_policy_manager.collect_basic_metrics=#{@collect_basic_npm_metrics}" end rescue => errorStr - puts "config::error:Exception while reading config settings for npm basic setting - #{errorStr}, using defaults" + puts "config::npm::error:Exception while reading config settings for npm basic setting - #{errorStr}, using defaults" @collect_basic_npm_metrics = false end end @configSchemaVersion = ENV["AZMON_AGENT_CFG_SCHEMA_VERSION"] -puts "****************Start Config Processing********************" +puts "****************Start NPM Config Processing********************" if !@configSchemaVersion.nil? && !@configSchemaVersion.empty? && @configSchemaVersion.strip.casecmp("v1") == 0 #note v1 is the only supported schema version , so hardcoding it configMapSettings = parseConfigMap if !configMapSettings.nil? @@ -80,34 +84,53 @@ def populateSettingValuesFromConfigMap(parsedConfig) end else if (File.file?(@configMapMountPath)) - ConfigParseErrorLogger.logError("config::unsupported/missing config schema version - '#{@configSchemaVersion}' , using defaults, please use supported schema version") + ConfigParseErrorLogger.logError("config::npm::unsupported/missing config schema version - '#{@configSchemaVersion}' , using defaults, please use supported schema version") end @collect_basic_npm_metrics = false @collect_advanced_npm_metrics = false end -# Write the settings to file, so that they can be set as environment variables -file = File.open("integration_npm_config_env_var", "w") -if !file.nil? 
+ +controller = ENV["CONTROLLER_TYPE"] +tgfConfigFile = @tgfConfigFileDS + +if controller.casecmp(@replicaset) == 0 + tgfConfigFile = @tgfConfigFileRS +end + +#replace place holders in configuration file +tgfConfig = File.read(tgfConfigFile) #read returns only after closing the file + +if @collect_advanced_npm_metrics == true + tgfConfig = tgfConfig.gsub("$AZMON_INTEGRATION_NPM_METRICS_URL_LIST_NODE", @npm_node_urls) + tgfConfig = tgfConfig.gsub("$AZMON_INTEGRATION_NPM_METRICS_URL_LIST_CLUSTER", @npm_cluster_urls) + tgfConfig = tgfConfig.gsub("$AZMON_INTEGRATION_NPM_METRICS_DROP_LIST_CLUSTER", @npm_default_setting) +elsif @collect_basic_npm_metrics == true + tgfConfig = tgfConfig.gsub("$AZMON_INTEGRATION_NPM_METRICS_URL_LIST_NODE", @npm_node_urls) + tgfConfig = tgfConfig.gsub("$AZMON_INTEGRATION_NPM_METRICS_URL_LIST_CLUSTER", @npm_cluster_urls) + tgfConfig = tgfConfig.gsub("$AZMON_INTEGRATION_NPM_METRICS_DROP_LIST_CLUSTER", @npm_basic_drop_metrics_cluster) +else + tgfConfig = tgfConfig.gsub("$AZMON_INTEGRATION_NPM_METRICS_URL_LIST_NODE", @npm_default_setting) + tgfConfig = tgfConfig.gsub("$AZMON_INTEGRATION_NPM_METRICS_URL_LIST_CLUSTER", @npm_default_setting) + tgfConfig = tgfConfig.gsub("$AZMON_INTEGRATION_NPM_METRICS_DROP_LIST_CLUSTER", @npm_default_setting) +end + +File.open(tgfConfigFile, "w") { |file| file.puts tgfConfig } # 'file' will be closed here after it goes out of scope +puts "config::npm::Successfully substituted the NPM placeholders into #{tgfConfigFile} file for #{controller}" + +# Write the telemetry to file, so that they can be set as environment variables +telemetryFile = File.open("integration_npm_config_env_var", "w") + +if !telemetryFile.nil? if @collect_advanced_npm_metrics == true - file.write("export TELEMETRY_NPM_INTEGRATION_METRICS_ADVANCED=1\n") - file.write("export AZMON_INTEGRATION_NPM_METRICS_URL_LIST_NODE=#{@npm_node_url}\n") - file.write("export AZMON_INTEGRATION_NPM_METRICS_URL_LIST_CLUSTER=#{@npm_cluster_url}\n") - file.write("export AZMON_INTEGRATION_NPM_METRICS_DROP_LIST_CLUSTER=\n") + telemetryFile.write("export TELEMETRY_NPM_INTEGRATION_METRICS_ADVANCED=1\n") elsif @collect_basic_npm_metrics == true - file.write("export TELEMETRY_NPM_INTEGRATION_METRICS_BASIC=1\n") - file.write("export AZMON_INTEGRATION_NPM_METRICS_URL_LIST_NODE=#{@npm_node_url}\n") - file.write("export AZMON_INTEGRATION_NPM_METRICS_URL_LIST_CLUSTER=#{@npm_cluster_url}\n") - file.write("export AZMON_INTEGRATION_NPM_METRICS_DROP_LIST_CLUSTER=#{@npm_basic_drop_metrics_cluster}\n") - else - file.write("export AZMON_INTEGRATION_NPM_METRICS_URL_LIST_NODE=\n") - file.write("export AZMON_INTEGRATION_NPM_METRICS_URL_LIST_CLUSTER=\n") - file.write("export AZMON_INTEGRATION_NPM_METRICS_DROP_LIST_CLUSTER=\n") + telemetryFile.write("export TELEMETRY_NPM_INTEGRATION_METRICS_BASIC=1\n") end # Close file after writing all environment variables - file.close + telemetryFile.close else - puts "Exception while opening file for writing config environment variables" - puts "****************End Config Processing********************" -end \ No newline at end of file + puts "config::npm::Exception while opening file for writing NPM telemetry environment variables" + puts "****************End NPM Config Processing********************" +end From 281a77c8c871d6d9a3ad98715098234c1f027302 Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Tue, 1 Sep 2020 16:21:04 -0700 Subject: [PATCH 015/194] support multiple docker paths when docker root is updated thru knode (#433) --- kubernetes/omsagent.yaml | 18 
+++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index db788a37e..947620ebc 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -387,6 +387,13 @@ spec: name: host-log - mountPath: /var/lib/docker/containers name: containerlog-path + readOnly: true + - mountPath: /mnt/docker + name: containerlog-path-2 + readOnly: true + - mountPath: /mnt/containers + name: containerlog-path-3 + readOnly: true - mountPath: /etc/kubernetes/host name: azure-json-path - mountPath: /etc/omsagent-secret @@ -444,6 +451,12 @@ spec: - name: containerlog-path hostPath: path: /var/lib/docker/containers + - name: containerlog-path-2 + hostPath: + path: /mnt/docker + - name: containerlog-path-3 + hostPath: + path: /mnt/containers - name: azure-json-path hostPath: path: /etc/kubernetes @@ -528,8 +541,6 @@ spec: name: docker-sock - mountPath: /var/log name: host-log - - mountPath: /var/lib/docker/containers - name: containerlog-path - mountPath: /etc/kubernetes/host name: azure-json-path - mountPath: /etc/omsagent-secret @@ -588,9 +599,6 @@ spec: - name: host-log hostPath: path: /var/log - - name: containerlog-path - hostPath: - path: /var/lib/docker/containers - name: azure-json-path hostPath: path: /etc/kubernetes From d8d7f9feac3e402b9a004cf2a15e57e5efd445d1 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Thu, 10 Sep 2020 17:22:03 -0700 Subject: [PATCH 016/194] Gangams/doc and other related updates (#434) * bring back nodeslector changes for windows agent ds * readme updates * chart updates for azure cluster resourceid and region * set cluster region during onboarding for managed clusters * wip * fix for onboarding script * add sp support for the login * update help * add sp support for powershell * script updates for sp login * wip * wip * wip * readme updates * update the links to use ci_prod branch * fix links * fix image link * some more readme updates --- README.md | 62 +++++++++------- charts/azuremonitor-containers/Chart.yaml | 2 +- .../templates/NOTES.txt | 4 +- .../templates/omsagent-arc-k8s-crd.yaml | 2 +- .../templates/omsagent-daemonset-windows.yaml | 12 +++- .../templates/omsagent-daemonset.yaml | 9 ++- .../templates/omsagent-deployment.yaml | 9 ++- .../templates/omsagent-rs-configmap.yaml | 2 +- .../templates/omsagent-secret.yaml | 2 +- charts/azuremonitor-containers/values.yaml | 18 ++--- img/azuremonitor-containers.svg | 66 +++++++++++++++++ .../linux/acrworkflows/acrdevnamespace.yaml | 4 +- .../add-monitoring-metrics-publisher-role.md | 8 +-- .../aks/mdmonboarding/mdm_onboarding.sh | 2 +- .../mdmonboarding/mdm_onboarding_atscale.sh | 2 +- .../kubernetes/AddMonitoringOnboardingTags.sh | 2 +- scripts/onboarding/attach-monitoring-tags.md | 8 +-- .../onboarding_azuremonitor_for_containers.sh | 2 +- .../onboarding/managed/disable-monitoring.ps1 | 36 +++++++++- .../onboarding/managed/disable-monitoring.sh | 47 ++++++++++-- .../onboarding/managed/enable-monitoring.ps1 | 38 +++++++++- .../onboarding/managed/enable-monitoring.sh | 71 +++++++++++++++---- scripts/onboarding/solution-onboarding.md | 4 +- .../preview/health/HealthAgentOnboarding.ps1 | 2 +- scripts/troubleshoot/README.md | 12 ++-- scripts/troubleshoot/TroubleshootError.ps1 | 2 +- .../TroubleshootError_nonAzureK8s.ps1 | 2 +- 27 files changed, 334 insertions(+), 96 deletions(-) create mode 100644 img/azuremonitor-containers.svg diff --git a/README.md b/README.md index d5d874c9c..3eec1f344 100644 --- a/README.md +++ b/README.md @@ -70,7 
+70,6 @@ The general directory structure is:
 │   ├── windows/              - scripts to build the Docker image for Windows Agent
 │   │   ├── dockerbuild       - script to build the code and docker image, and publish the docker image
 │   │   ├── acrworkflows/     - acr work flows for the Windows Agent container image
-│   │   ├── baseimage/        - windowsservercore base image for the windows agent container
 │   │   ├── DockerFile        - DockerFile for Windows Agent Container Image
 │   │   ├── main.ps1          - Windows Agent container entry point
 │   │   ├── setup.ps1         - setup file for Windows Agent Container Image
@@ -140,7 +139,7 @@ bash ~/Docker-Provider/scripts/build/linux/install-build-pre-requisites.sh

 ### Build Docker Provider Shell Bundle and Docker Image and Publish Docker Image

-> Note: If you are using WSL2, ensure Docker for windows running Linux containers mode to build Linux agent image successfully
+> Note: If you are using WSL2, ensure `Docker for Windows` is running in Linux containers mode on your Windows machine to build the Linux agent image successfully

 ```
 cd ~/Docker-Provider/kubernetes/linux/dockerbuild
@@ -167,9 +166,23 @@ docker push /:
 ```

 ## Windows Agent
+To build the windows agent, you will have to build the .NET and Go code, and the docker image for the windows agent.
+The docker image for the windows agent can only be built on a Windows machine with `Docker for Windows` in Windows containers mode, but the .NET and Go code can be built on Windows, on Linux, or on WSL2.
+
 ### Install Pre-requisites
-If you are planning to build the .net and go code for windows agent on Linux machine and you have already have Docker for Windows on Windows machine, then you may skip this.
+Install the pre-requisites based on the OS platform you will be using to build the windows agent code
+
+#### Option 1 - Using Windows Machine to Build the Windows agent
+
+```
+powershell # launch powershell with elevated admin on your windows machine
+Set-ExecutionPolicy -ExecutionPolicy bypass # set the execution policy
+cd %userprofile%\Docker-Provider\scripts\build\windows # based on your repo path
+.\install-build-pre-requisites.ps1 #
+```
+
+#### Option 2 - Using WSL2 to Build the Windows agent

 ```
 powershell # launch powershell with elevated admin on your windows machine
@@ -178,20 +191,36 @@ net use z: \\wsl$\Ubuntu-16.04 # map the network drive of the ubuntu app to wind
 cd z:\home\sshadmin\Docker-Provider\scripts\build\windows # based on your repo path
 .\install-build-pre-requisites.ps1 #
 ```
-#### Build Certificate Generator Source code and Out OMS Go plugin code
-> Note: .net and go code for windows agent can built on Ubuntu
+
+### Build Windows Agent code and Docker Image
+
+> Note: format of the windows agent imagetag will be `win-ci<release><MMDDYYYY>`. possible values for release are test, dev, preview, dogfood, prod etc.
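For instance, the Windows tag used in the 09/16/2020 release noted later in this series, `win-ciprod09162020`, follows this scheme with `prod` as the release and `09162020` as the MMDDYYYY build date. A minimal shell sketch of composing such a tag (the variable names are illustrative, not part of the build scripts):

```
# Illustrative only: compose a windows agent image tag of the form win-ci<release><MMDDYYYY>.
release="prod"                # one of: test, dev, preview, dogfood, prod
builddate=$(date +%m%d%Y)     # build date in MMDDYYYY form
imagetag="win-ci${release}${builddate}"
echo "${imagetag}"            # e.g. win-ciprod09162020
```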
+
+#### Option 1 - Using Windows Machine to Build the Windows agent
+
+Execute the below instructions in an elevated command prompt to build the windows agent code and docker image, and publish the image to acr or docker hub
+
+```
+cd %userprofile%\Docker-Provider\kubernetes\windows\dockerbuild # based on your repo path
+docker login # if you want to publish the image to acr then login to acr via `docker login `
+powershell -ExecutionPolicy bypass # switch to powershell if you are not on powershell already
+.\build-and-publish-docker-image.ps1 -image /: # trigger code and image build and publish to docker hub or acr
+```
+
+#### Option 2 - Using WSL2 to Build the Windows agent
+
+##### On WSL2, Build Certificate Generator Source code and Out OMS Go plugin code

 ```
-cd ~/Docker-Provider/build/windows # based on your repo path on ubuntu or WSL2
+cd ~/Docker-Provider/build/windows # based on your repo path on WSL2 Ubuntu app
 pwsh # switch to powershell
 .\Makefile.ps1 # trigger build and publish of .net and go code
 ```
-> Note: format of the imagetag will be `win-ci`. possible values for release are test, dev, preview, dogfood, prod etc.

-#### Build and Push Docker Image
+#### On Windows machine, Build and Push Docker Image

-> Note: windows container can only built on windows hence you will have to execute below commands on windows via accessing network share or copying published bits omsagentwindows under kubernetes directory on to windows machine
+> Note: The docker image for the windows container can only be built on windows, hence you will have to execute the below commands on windows, either by accessing the network share or by copying the published bits omsagentwindows under the kubernetes directory on to the windows machine

 ```
 net use z: \\wsl$\Ubuntu-16.04 # map the network drive of the ubuntu app to windows
@@ -200,21 +229,6 @@ docker build -t /: --build-arg IMAGE_TAG= .
 docker push /:
 ```

-### Build Cert generator, Out OMS Plugin and Docker Image and Publish Docker Image
-
-If you have code cloned on to windows, you can built everything for windows agent on windows machine via below instructions
-
-```
-# install pre-requisites if you havent installed already
-cd %userprofile%\Docker-Provider\kubernetes\windows # based on your repo path
-.\install-build-pre-requisites.ps1
-
-cd %userprofile%\Docker-Provider\kubernetes\windows\dockerbuild # based on your repo path
-docker login # if you want to publish the image to acr then login to acr via `docker login `
-powershell -ExecutionPolicy bypass # switch to powershell if you are not on powershell already
-.\build-and-publish-docker-image.ps1 -image /: # trigger build code and image and publish docker hub or acr
-```
-
 # Azure DevOps Build Pipeline

 Navigate to https://github-private.visualstudio.com/microsoft/_build?view=pipelines to see Linux and Windows Agent build pipelines. These pipelines are configured with CI triggers for ci_dev and ci_prod.
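To make the `docker build` and `docker push` steps above concrete, here is a minimal sketch with hypothetical values in place of the repo, image, and tag placeholders (the ACR name `myacr` and image name `omsagent` are assumptions, not values from this repo):

```
# Hypothetical end-to-end Windows image build and publish, run from kubernetes/windows.
docker login myacr.azurecr.io                 # sign in to the target registry first
docker build -t myacr.azurecr.io/omsagent:win-cidev09162020 \
  --build-arg IMAGE_TAG=win-cidev09162020 .
docker push myacr.azurecr.io/omsagent:win-cidev09162020
```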
diff --git a/charts/azuremonitor-containers/Chart.yaml b/charts/azuremonitor-containers/Chart.yaml index 202494152..8976b5561 100644 --- a/charts/azuremonitor-containers/Chart.yaml +++ b/charts/azuremonitor-containers/Chart.yaml @@ -28,7 +28,7 @@ keywords: - kubernetes - kuberneteshealth home: https://docs.microsoft.com/en-us/azure/monitoring/monitoring-container-health -icon: https://raw.githubusercontent.com/Microsoft/OMS-docker/ci_feature/img/azuremonitor-containers.svg +icon: https://raw.githubusercontent.com/microsoft/Docker-Provider/ci_prod/img/azuremonitor-containers.svg sources: - https://github.com/microsoft/Docker-Provider/tree/ci_prod maintainers: diff --git a/charts/azuremonitor-containers/templates/NOTES.txt b/charts/azuremonitor-containers/templates/NOTES.txt index 6179b6f1a..372cecb95 100644 --- a/charts/azuremonitor-containers/templates/NOTES.txt +++ b/charts/azuremonitor-containers/templates/NOTES.txt @@ -14,7 +14,7 @@ {{- end }} -{{- if and (ne .Values.omsagent.env.clusterName "") (ne .Values.omsagent.env.clusterId "") }} +{{- if and (ne .Values.omsagent.env.clusterName "") (ne .Values.omsagent.env.clusterId "") (ne .Values.Azure.Cluster.ResourceId "") }} ############################################################################## #### ERROR: You did not provide cluster name #### @@ -22,7 +22,7 @@ {{- end }} -{{- if or (eq .Values.omsagent.secret.key "") (eq .Values.omsagent.secret.wsid "") (and (eq .Values.omsagent.env.clusterName "") (eq .Values.omsagent.env.clusterId ""))}} +{{- if or (eq .Values.omsagent.secret.key "") (eq .Values.omsagent.secret.wsid "") (and (eq .Values.omsagent.env.clusterName "") (eq .Values.omsagent.env.clusterId "") (eq .Values.Azure.Cluster.ResourceId "") )}} This deployment will not complete. To proceed, run helm upgrade {{ .Release.Name }} \ diff --git a/charts/azuremonitor-containers/templates/omsagent-arc-k8s-crd.yaml b/charts/azuremonitor-containers/templates/omsagent-arc-k8s-crd.yaml index f7873de40..ebdd5ea3f 100644 --- a/charts/azuremonitor-containers/templates/omsagent-arc-k8s-crd.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-arc-k8s-crd.yaml @@ -1,4 +1,4 @@ -{{- if contains "microsoft.kubernetes/connectedclusters" (.Values.omsagent.env.clusterId | lower) }} +{{- if or ( contains "microsoft.kubernetes/connectedclusters" (.Values.Azure.Cluster.ResourceId | lower) ) ( contains "microsoft.kubernetes/connectedclusters" (.Values.omsagent.env.clusterId | lower)) }} apiVersion: clusterconfig.azure.com/v1beta1 kind: AzureClusterIdentityRequest metadata: diff --git a/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml b/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml index b8e667398..7acd46c37 100644 --- a/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml @@ -25,6 +25,8 @@ spec: dockerProviderVersion: {{ .Values.omsagent.image.dockerProviderVersion }} schema-versions: "v1" spec: + nodeSelector: + beta.kubernetes.io/os: windows {{- if .Values.omsagent.rbac }} serviceAccountName: omsagent {{- end }} @@ -46,6 +48,13 @@ spec: - name: AKS_REGION value: {{ .Values.omsagent.env.clusterRegion | quote }} {{- end }} + {{- else if ne .Values.Azure.Cluster.ResourceId "" }} + - name: AKS_RESOURCE_ID + value: {{ .Values.Azure.Cluster.ResourceId | quote }} + {{- if ne .Values.Azure.Cluster.Region "" }} + - name: AKS_REGION + value: {{ .Values.Azure.Cluster.Region | quote }} + {{- end }} {{- else 
}} - name: ACS_RESOURCE_NAME value: {{ .Values.omsagent.env.clusterName | quote }} @@ -80,9 +89,6 @@ spec: - C:\opt\omsagentwindows\scripts\cmd\livenessProbe.cmd periodSeconds: 60 initialDelaySeconds: 180 - {{- with .Values.omsagent.daemonsetwindows.affinity }} - affinity: {{- toYaml . | nindent 8 }} - {{- end }} {{- with .Values.omsagent.tolerations }} tolerations: {{- toYaml . | nindent 8 }} {{- end }} diff --git a/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml b/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml index d6d6171cd..7514247a0 100644 --- a/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml @@ -1,4 +1,4 @@ -{{- if and (ne .Values.omsagent.secret.key "") (ne .Values.omsagent.secret.wsid "") (or (ne .Values.omsagent.env.clusterName "") (ne .Values.omsagent.env.clusterId ""))}} +{{- if and (ne .Values.omsagent.secret.key "") (ne .Values.omsagent.secret.wsid "") (or (ne .Values.omsagent.env.clusterName "") (ne .Values.omsagent.env.clusterId "") (ne .Values.Azure.Cluster.ResourceId "") )}} apiVersion: apps/v1 kind: DaemonSet metadata: @@ -46,6 +46,13 @@ spec: - name: AKS_REGION value: {{ .Values.omsagent.env.clusterRegion | quote }} {{- end }} + {{- else if ne .Values.Azure.Cluster.ResourceId "" }} + - name: AKS_RESOURCE_ID + value: {{ .Values.Azure.Cluster.ResourceId | quote }} + {{- if ne .Values.Azure.Cluster.Region "" }} + - name: AKS_REGION + value: {{ .Values.Azure.Cluster.Region | quote }} + {{- end }} {{- else }} - name: ACS_RESOURCE_NAME value: {{ .Values.omsagent.env.clusterName | quote }} diff --git a/charts/azuremonitor-containers/templates/omsagent-deployment.yaml b/charts/azuremonitor-containers/templates/omsagent-deployment.yaml index 6f8140eb6..7d7ac7040 100644 --- a/charts/azuremonitor-containers/templates/omsagent-deployment.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-deployment.yaml @@ -1,4 +1,4 @@ -{{- if and (ne .Values.omsagent.secret.key "") (ne .Values.omsagent.secret.wsid "") (or (ne .Values.omsagent.env.clusterName "") (ne .Values.omsagent.env.clusterId ""))}} +{{- if and (ne .Values.omsagent.secret.key "") (ne .Values.omsagent.secret.wsid "") (or (ne .Values.omsagent.env.clusterName "") (ne .Values.omsagent.env.clusterId "") (ne .Values.Azure.Cluster.ResourceId "") )}} apiVersion: apps/v1 kind: Deployment metadata: @@ -47,6 +47,13 @@ spec: - name: AKS_REGION value: {{ .Values.omsagent.env.clusterRegion | quote }} {{- end }} + {{- else if ne .Values.Azure.Cluster.ResourceId "" }} + - name: AKS_RESOURCE_ID + value: {{ .Values.Azure.Cluster.ResourceId | quote }} + {{- if ne .Values.Azure.Cluster.Region "" }} + - name: AKS_REGION + value: {{ .Values.Azure.Cluster.Region | quote }} + {{- end }} {{- else }} - name: ACS_RESOURCE_NAME value: {{ .Values.omsagent.env.clusterName | quote }} diff --git a/charts/azuremonitor-containers/templates/omsagent-rs-configmap.yaml b/charts/azuremonitor-containers/templates/omsagent-rs-configmap.yaml index c77fb12b4..ee0664495 100644 --- a/charts/azuremonitor-containers/templates/omsagent-rs-configmap.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-rs-configmap.yaml @@ -1,4 +1,4 @@ -{{- if and (ne .Values.omsagent.secret.key "") (ne .Values.omsagent.secret.wsid "") (or (ne .Values.omsagent.env.clusterName "") (ne .Values.omsagent.env.clusterId ""))}} +{{- if and (ne .Values.omsagent.secret.key "") (ne .Values.omsagent.secret.wsid "") (or (ne .Values.omsagent.env.clusterName "") (ne 
.Values.omsagent.env.clusterId "") (ne .Values.Azure.Cluster.ResourceId "") )}} kind: ConfigMap apiVersion: v1 data: diff --git a/charts/azuremonitor-containers/templates/omsagent-secret.yaml b/charts/azuremonitor-containers/templates/omsagent-secret.yaml index c6d992b82..1a7f087ed 100644 --- a/charts/azuremonitor-containers/templates/omsagent-secret.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-secret.yaml @@ -1,4 +1,4 @@ -{{- if and (ne .Values.omsagent.secret.key "") (ne .Values.omsagent.secret.wsid "") (or (ne .Values.omsagent.env.clusterName "") (ne .Values.omsagent.env.clusterId ""))}} +{{- if and (ne .Values.omsagent.secret.key "") (ne .Values.omsagent.secret.wsid "") (or (ne .Values.omsagent.env.clusterName "") (ne .Values.omsagent.env.clusterId "") (ne .Values.Azure.Cluster.ResourceId "") )}} apiVersion: v1 kind: Secret metadata: diff --git a/charts/azuremonitor-containers/values.yaml b/charts/azuremonitor-containers/values.yaml index 610e109ef..4d0d7f8f2 100644 --- a/charts/azuremonitor-containers/values.yaml +++ b/charts/azuremonitor-containers/values.yaml @@ -3,7 +3,12 @@ # Declare variables to be passed into your templates. ## Microsoft OMS Agent image for kubernetes cluster monitoring -## ref: https://github.com/Microsoft/OMS-docker/tree/ci_feature_prod +## ref: https://github.com/microsoft/Docker-Provider/tree/ci_prod +## Values of ResourceId and Region under Azure->Cluster being populated by Azure Arc K8s RP during the installation of the extension +Azure: + Cluster: + Region: + ResourceId: omsagent: image: repo: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod" @@ -81,17 +86,6 @@ omsagent: operator: NotIn values: - master - daemonsetwindows: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - labelSelector: - matchExpressions: - - key: beta.kubernetes.io/os - operator: In - values: - - windows ## Configure resource requests and limits ## ref: http://kubernetes.io/docs/user-guide/compute-resources/ ## diff --git a/img/azuremonitor-containers.svg b/img/azuremonitor-containers.svg new file mode 100644 index 000000000..b2f7c5323 --- /dev/null +++ b/img/azuremonitor-containers.svg @@ -0,0 +1,66 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/kubernetes/linux/acrworkflows/acrdevnamespace.yaml b/kubernetes/linux/acrworkflows/acrdevnamespace.yaml index 9270be755..6a3617f6b 100644 --- a/kubernetes/linux/acrworkflows/acrdevnamespace.yaml +++ b/kubernetes/linux/acrworkflows/acrdevnamespace.yaml @@ -1,5 +1,5 @@ version: 1.0-preview-1 steps: - build: -t {{.Run.Registry}}/public/azuremonitor/containerinsights/cidev:{{.Run.Branch}}-{{.Run.Date}}-{{.Run.Commit | substr 0 7 }} . 
- workingDirectory: ci_feature - - push: ["{{.Run.Registry}}/public/azuremonitor/containerinsights/cidev:{{.Run.Branch}}-{{.Run.Date}}-{{.Run.Commit | substr 0 7 }}"] + workingDirectory: ci_dev + - push: ["{{.Run.Registry}}/public/azuremonitor/containerinsights/cidev:{{.Run.Branch}}-{{.Run.Date}}-{{.Run.Commit | substr 0 7 }}"] diff --git a/scripts/onboarding/add-monitoring-metrics-publisher-role.md b/scripts/onboarding/add-monitoring-metrics-publisher-role.md index 822ff0f64..91b91d872 100644 --- a/scripts/onboarding/add-monitoring-metrics-publisher-role.md +++ b/scripts/onboarding/add-monitoring-metrics-publisher-role.md @@ -16,7 +16,7 @@ Of the built-in roles, only Owner and User Access Administrator are granted acce ### For single AKS cluster using Azure CLI ``` sh -curl -sL https://raw.githubusercontent.com/Microsoft/OMS-docker/ci_feature/docs/aks/mdmonboarding/mdm_onboarding.sh | bash -s +curl -sL https://raw.githubusercontent.com/microsoft/Docker-Provider/ci_prod/scripts/onboarding/aks/mdmonboarding/mdm_onboarding.sh | bash -s ``` The configuration change can take a few minutes to complete. When it finishes, you see a message similar to the following that includes the result: @@ -28,7 +28,7 @@ completed the role assignment ### For all AKS clusters in the specified subscription using Azure CLI ``` sh -curl -sL https://raw.githubusercontent.com/Microsoft/OMS-docker/ci_feature/docs/aks/mdmonboarding/mdm_onboarding_atscale.sh | bash -s +curl -sL https://raw.githubusercontent.com/microsoft/Docker-Provider/ci_prod/scripts/onboarding/aks/mdmonboarding/mdm_onboarding_atscale.sh | bash -s ``` The configuration change can take a few minutes to complete. When it finishes, you see a message similar to the following that includes the result: @@ -43,7 +43,7 @@ completed role assignments for all AKS clusters in subscription: /resourceGroups//providers/Microsoft.OperationalInsights/workspaces/" "clusterName of AKS-Engine cluster" +# https://raw.githubusercontent.com/microsoft/Docker-Provider/ci_prod/scripts/onboarding/aksengine/kubernetes/AddMonitoringOnboardingTags.sh | bash -s "name of the cloud" "00000000-0000-0000-0000-000000000000" "Resource Group Name of AKS-Engine cluster" "/subscriptions//resourceGroups//providers/Microsoft.OperationalInsights/workspaces/" "clusterName of AKS-Engine cluster" # nameoftheCloud=${1} diff --git a/scripts/onboarding/attach-monitoring-tags.md b/scripts/onboarding/attach-monitoring-tags.md index f1c9a2e32..f7a802750 100644 --- a/scripts/onboarding/attach-monitoring-tags.md +++ b/scripts/onboarding/attach-monitoring-tags.md @@ -10,7 +10,7 @@ If you are not familiar with the concepts of azure resource tags (https://docs.m ## Attach tags using Powershell Get the below powershell script files to your local computer. - - Powershell script file [AddMonitoringWorkspaceTags.ps1](https://github.com/Microsoft/OMS-docker/blob/ci_feature/docs/aksengine/kubernetes/AddMonitoringWorkspaceTags.ps1) + - Powershell script file [AddMonitoringWorkspaceTags.ps1](https://raw.githubusercontent.com/microsoft/Docker-Provider/ci_prod/scripts/onboarding/aksengine/kubernetes/AddMonitoringWorkspaceTags.ps1) - Refer for updating the Powershell execution policy (https://docs.microsoft.com/en-us/powershell/module/microsoft.powershell.security/set-executionpolicy?view=powershell-6) - Log analytics workspace resource Id can retrieved either Azure CLI or Powershell or Azure Portal Azure CLI @@ -50,14 +50,14 @@ The configuration change can take a few minutes to complete. 
When it finishes, y ``` sh -curl -sL https://raw.githubusercontent.com/microsoft/OMS-docker/ci_feature/docs/aksengine/kubernetes/AddMonitoringOnboardingTags.sh | bash -s +curl -sL https://raw.githubusercontent.com/microsoft/Docker-Provider/ci_prod/scripts/onboarding/aksengine/kubernetes/AddMonitoringOnboardingTags.sh | bash -s Example for AKS-Engine clusters in Azure Public cloud -curl -sL https://raw.githubusercontent.com/microsoft/OMS-docker/ci_feature/docs/aksengine/kubernetes/AddMonitoringOnboardingTags.sh | bash -s "AzureCloud" "00000000-0000-0000-0000-000000000000" "my-aks-engine-cluster-rg" "/subscriptions//resourceGroups/workspaceRg/providers/Microsoft.OperationalInsights/workspaces/workspaceName" "my-aks-engine-cluster" +curl -sL https://raw.githubusercontent.com/microsoft/Docker-Provider/ci_prod/scripts/onboarding/aksengine/kubernetes/AddMonitoringOnboardingTags.sh | bash -s "AzureCloud" "00000000-0000-0000-0000-000000000000" "my-aks-engine-cluster-rg" "/subscriptions//resourceGroups/workspaceRg/providers/Microsoft.OperationalInsights/workspaces/workspaceName" "my-aks-engine-cluster" Example for AKS-Engine clusters in Azure China cloud -curl -sL https://raw.githubusercontent.com/microsoft/OMS-docker/ci_feature/docs/aksengine/kubernetes/AddMonitoringOnboardingTags.sh | bash -s "AzureChinaCloud" "00000000-0000-0000-0000-000000000000" "my-aks-engine-cluster-rg" "/subscriptions//resourceGroups/workspaceRg/providers/Microsoft.OperationalInsights/workspaces/workspaceName" "my-aks-engine-cluster" +curl -sL https://raw.githubusercontent.com/microsoft/Docker-Provider/ci_prod/scripts/onboarding/aksengine/kubernetes/AddMonitoringOnboardingTags.sh | bash -s "AzureChinaCloud" "00000000-0000-0000-0000-000000000000" "my-aks-engine-cluster-rg" "/subscriptions//resourceGroups/workspaceRg/providers/Microsoft.OperationalInsights/workspaces/workspaceName" "my-aks-engine-cluster" ``` diff --git a/scripts/onboarding/hybrid/onboarding_azuremonitor_for_containers.sh b/scripts/onboarding/hybrid/onboarding_azuremonitor_for_containers.sh index b66dca67d..e2afa579d 100644 --- a/scripts/onboarding/hybrid/onboarding_azuremonitor_for_containers.sh +++ b/scripts/onboarding/hybrid/onboarding_azuremonitor_for_containers.sh @@ -151,7 +151,7 @@ echo "workspaceResourceId:"$workspaceResourceId echo "workspaceGuid:"$workspaceGuid echo "adding containerinsights solution to workspace" -solution=$(az group deployment create -g $defaultWorkspaceResourceGroup --template-uri https://raw.githubusercontent.com/microsoft/OMS-docker/ci_feature_prod/docs/templates/azuremonitor-containerSolution.json --parameters workspaceResourceId=$workspaceResourceId --parameters workspaceRegion=$workspaceRegion) +solution=$(az group deployment create -g $defaultWorkspaceResourceGroup --template-uri https://raw.githubusercontent.com/microsoft/Docker-Provider/ci_prod/scripts/onboarding/templates/azuremonitor-containerSolution.json --parameters workspaceResourceId=$workspaceResourceId --parameters workspaceRegion=$workspaceRegion) echo "getting workspace primaryshared key" workspaceKey=$(az rest --method post --uri $workspaceResourceId/sharedKeys?api-version=2015-11-01-preview --query primarySharedKey) diff --git a/scripts/onboarding/managed/disable-monitoring.ps1 b/scripts/onboarding/managed/disable-monitoring.ps1 index 41ba2adb0..ea66cb3a3 100644 --- a/scripts/onboarding/managed/disable-monitoring.ps1 +++ b/scripts/onboarding/managed/disable-monitoring.ps1 @@ -7,6 +7,12 @@ .PARAMETER clusterResourceId Id of the Azure Managed Cluster such as 
Azure ARC K8s, ARO v4 etc. + .PARAMETER servicePrincipalClientId + client Id of the service principal which will be used for the azure login + .PARAMETER servicePrincipalClientSecret + client secret of the service principal which will be used for the azure login + .PARAMETER tenantId + tenantId of the service principal which will be used for the azure login .PARAMETER kubeContext (optional) kube-context of the k8 cluster to install Azure Monitor for containers HELM chart @@ -22,6 +28,11 @@ param( [Parameter(mandatory = $true)] [string]$clusterResourceId, + [string]$servicePrincipalClientId, + [Parameter(mandatory = $false)] + [string]$servicePrincipalClientSecret, + [Parameter(mandatory = $false)] + [string]$tenantId, [Parameter(mandatory = $false)] [string]$kubeContext ) @@ -33,6 +44,7 @@ $helmChartName = "azuremonitor-containers" $isArcK8sCluster = $false $isAksCluster = $false $isAroV4Cluster = $false +$isUsingServicePrincipal = $false # checks the required Powershell modules exist and if not exists, request the user permission to install $azAccountModule = Get-Module -ListAvailable -Name Az.Accounts @@ -199,11 +211,24 @@ if ($clusterResourceId.ToLower().Contains("microsoft.kubernetes/connectedcluster $isAroV4Cluster = $true } +if(([string]::IsNullOrEmpty($servicePrincipalClientId) -eq $false) -and + ([string]::IsNullOrEmpty($servicePrincipalClientSecret) -eq $false) -and + ([string]::IsNullOrEmpty($tenantId) -eq $false)) { + Write-Host("Using service principal creds for the azure login since provided.") + $isUsingServicePrincipal = $true + } + $resourceParts = $clusterResourceId.Split("/") $clusterSubscriptionId = $resourceParts[2] Write-Host("Cluster SubscriptionId : '" + $clusterSubscriptionId + "' ") -ForegroundColor Green +if ($isUsingServicePrincipal) { + $spSecret = ConvertTo-SecureString -String $servicePrincipalClientSecret -AsPlainText -Force + $spCreds = New-Object -TypeName "System.Management.Automation.PSCredential" -ArgumentList $servicePrincipalClientId,$spSecret + Connect-AzAccount -Credential $spCreds -Tenant $tenantId -Subscription $clusterSubscriptionId +} + try { Write-Host("") Write-Host("Trying to get the current Az login context...") @@ -220,8 +245,15 @@ catch { if ($null -eq $account.Account) { try { - Write-Host("Please login...") - Connect-AzAccount -subscriptionid $clusterSubscriptionId + + if ($isUsingServicePrincipal) { + $spSecret = ConvertTo-SecureString -String $servicePrincipalClientSecret -AsPlainText -Force + $spCreds = New-Object -TypeName "System.Management.Automation.PSCredential" -ArgumentList $servicePrincipalClientId,$spSecret + Connect-AzAccount -Credential $spCreds -Tenant $tenantId -Subscription $clusterSubscriptionId + } else { + Write-Host("Please login...") + Connect-AzAccount -subscriptionid $clusterSubscriptionId + } } catch { Write-Host("") diff --git a/scripts/onboarding/managed/disable-monitoring.sh b/scripts/onboarding/managed/disable-monitoring.sh index f55b4e617..f20bd7d33 100644 --- a/scripts/onboarding/managed/disable-monitoring.sh +++ b/scripts/onboarding/managed/disable-monitoring.sh @@ -14,7 +14,10 @@ # 1. disable monitoring using current kube-context # bash disable_monitoring.sh --resource-id/-r -# 2. disable monitoring using specific kube-context +# 2. disable monitoring using specific kube-context using service principal creds for the azure login +# bash disable_monitoring.sh --resource-id --client-id --client-secret --tenant-id + +# 3. 
disable monitoring using specific kube-context # bash disable_monitoring.sh --resource-id/-r --kube-context/-k @@ -48,12 +51,18 @@ isAroV4Cluster=false clusterResourceId="" kubeconfigContext="" +# sp details for the login if provided +servicePrincipalClientId="" +servicePrincipalClientSecret="" +servicePrincipalTenantId="" +isUsingServicePrincipal=false + usage() { local basename=`basename $0` echo echo "Disable Azure Monitor for containers:" - echo "$basename --resource-id/-r [--kube-context/-k ]" + echo "$basename --resource-id/-r [--client-id ] [--client-secret ] [--tenant-id ] [--kube-context/-k ]" } delete_helm_release() @@ -105,8 +114,13 @@ remove_monitoring_tags() { echo "deleting monitoring tags ..." - echo "login to the azure interactively" - az login --use-device-code + if [ "$isUsingServicePrincipal" = true ] ; then + echo "login to the azure using provided service principal creds" + az login --service-principal --username $servicePrincipalClientId --password $servicePrincipalClientSecret --tenant $servicePrincipalTenantId + else + echo "login to the azure interactively" + az login --use-device-code + fi echo "set the cluster subscription id: ${clusterSubscriptionId}" az account set -s ${clusterSubscriptionId} @@ -159,6 +173,9 @@ for arg in "$@"; do case "$arg" in "--resource-id") set -- "$@" "-r" ;; "--kube-context") set -- "$@" "-k" ;; + "--client-id") set -- "$@" "-c" ;; + "--client-secret") set -- "$@" "-s" ;; + "--tenant-id") set -- "$@" "-t" ;; "--help") set -- "$@" "-h" ;; "--"*) usage ;; *) set -- "$@" "$arg" @@ -167,7 +184,7 @@ done local OPTIND opt - while getopts 'hk:r:' opt; do + while getopts 'hk:c:s:t:r:' opt; do case "$opt" in h) usage @@ -183,6 +200,21 @@ done echo "clusterResourceId is $OPTARG" ;; + c) + servicePrincipalClientId="$OPTARG" + echo "servicePrincipalClientId is $OPTARG" + ;; + + s) + servicePrincipalClientSecret="$OPTARG" + echo "clientSecret is *****" + ;; + + t) + servicePrincipalTenantId="$OPTARG" + echo "service principal tenantId is $OPTARG" + ;; + ?) usage exit 1 @@ -241,6 +273,11 @@ done exit 1 fi + if [ ! -z "$servicePrincipalClientId" -a ! -z "$servicePrincipalClientSecret" -a ! -z "$servicePrincipalTenantId" ]; then + echo "using service principal creds (clientId, secret and tenantId) for azure login since provided" + isUsingServicePrincipal=true + fi + } diff --git a/scripts/onboarding/managed/enable-monitoring.ps1 b/scripts/onboarding/managed/enable-monitoring.ps1 index 346cdc81a..b734ba347 100644 --- a/scripts/onboarding/managed/enable-monitoring.ps1 +++ b/scripts/onboarding/managed/enable-monitoring.ps1 @@ -9,6 +9,12 @@ .PARAMETER clusterResourceId Id of the Azure Managed Cluster such as Azure ARC K8s, ARO v4 etc. 
+ .PARAMETER servicePrincipalClientId + Client Id of the service principal which will be used for the azure login + .PARAMETER servicePrincipalClientSecret + Client secret of the service principal which will be used for the azure login + .PARAMETER tenantId + Azure TenantId of the service principal which will be used for the azure login .PARAMETER kubeContext (optional) kube-context of the k8 cluster to install Azure Monitor for containers HELM chart .PARAMETER workspaceResourceId (optional) @@ -34,6 +40,12 @@ param( [Parameter(mandatory = $true)] [string]$clusterResourceId, [Parameter(mandatory = $false)] + [string]$servicePrincipalClientId, + [Parameter(mandatory = $false)] + [string]$servicePrincipalClientSecret, + [Parameter(mandatory = $false)] + [string]$tenantId, + [Parameter(mandatory = $false)] [string]$kubeContext, [Parameter(mandatory = $false)] [string]$workspaceResourceId, @@ -53,6 +65,7 @@ $helmChartRepoUrl = "https://kubernetes-charts-incubator.storage.googleapis.com/ # flags to indicate the cluster types $isArcK8sCluster = $false $isAksCluster = $false +$isUsingServicePrincipal = $false if([string]::IsNullOrEmpty($helmRepoName) -eq $false){ $helmChartRepoName = $helmRepoName @@ -220,6 +233,13 @@ if (($clusterResourceId.ToLower().Contains("microsoft.kubernetes/connectedcluste exit } +if(([string]::IsNullOrEmpty($servicePrincipalClientId) -eq $false) -and + ([string]::IsNullOrEmpty($servicePrincipalClientSecret) -eq $false) -and + ([string]::IsNullOrEmpty($tenantId) -eq $false)) { + Write-Host("Using service principal creds for the azure login since these provided.") + $isUsingServicePrincipal = $true +} + if ($clusterResourceId.ToLower().Contains("microsoft.kubernetes/connectedclusters") -eq $true) { $isArcK8sCluster = $true } elseif ($clusterResourceId.ToLower().Contains("microsoft.containerservice/managedclusters") -eq $true) { @@ -231,6 +251,12 @@ $clusterSubscriptionId = $resourceParts[2] Write-Host("Cluster SubscriptionId : '" + $clusterSubscriptionId + "' ") -ForegroundColor Green +if ($isUsingServicePrincipal) { + $spSecret = ConvertTo-SecureString -String $servicePrincipalClientSecret -AsPlainText -Force + $spCreds = New-Object -TypeName "System.Management.Automation.PSCredential" -ArgumentList $servicePrincipalClientId,$spSecret + Connect-AzAccount -Credential $spCreds -Tenant $tenantId -Subscription $clusterSubscriptionId +} + try { Write-Host("") Write-Host("Trying to get the current Az login context...") @@ -247,8 +273,14 @@ catch { if ($null -eq $account.Account) { try { - Write-Host("Please login...") - Connect-AzAccount -subscriptionid $clusterSubscriptionId + if ($isUsingServicePrincipal) { + $spSecret = ConvertTo-SecureString -String $servicePrincipalClientSecret -AsPlainText -Force + $spCreds = New-Object -TypeName "System.Management.Automation.PSCredential" -ArgumentList $servicePrincipalClientId,$spSecret + Connect-AzAccount -Credential $spCreds -Tenant $tenantId -Subscription $clusterSubscriptionId + } else { + Write-Host("Please login...") + Connect-AzAccount -subscriptionid $clusterSubscriptionId + } } catch { Write-Host("") @@ -498,7 +530,7 @@ try { helm repo add $helmChartRepoName $helmChartRepoUrl Write-Host("updating helm repo to get latest version of charts") helm repo update - $helmParameters = "omsagent.secret.wsid=$workspaceGUID,omsagent.secret.key=$workspacePrimarySharedKey,omsagent.env.clusterId=$clusterResourceId" + $helmParameters = 
"omsagent.secret.wsid=$workspaceGUID,omsagent.secret.key=$workspacePrimarySharedKey,omsagent.env.clusterId=$clusterResourceId,omsagent.env.clusterRegion=$clusterRegion" if([string]::IsNullOrEmpty($proxyEndpoint) -eq $false) { Write-Host("using proxy endpoint since its provided") $helmParameters = $helmParameters + ",omsagent.proxy=$proxyEndpoint" diff --git a/scripts/onboarding/managed/enable-monitoring.sh b/scripts/onboarding/managed/enable-monitoring.sh index 5a8e7e040..17c075725 100644 --- a/scripts/onboarding/managed/enable-monitoring.sh +++ b/scripts/onboarding/managed/enable-monitoring.sh @@ -20,17 +20,19 @@ # 1. Using Default Azure Log Analytics and no-proxy with current kube config context # bash enable-monitoring.sh --resource-id -# 2. Using Default Azure Log Analytics and no-proxy +# 2. Using Default Azure Log Analytics and no-proxy with current kube config context, and using service principal creds for the azure login +# bash enable-monitoring.sh --resource-id --client-id --client-secret --tenant-id + +# 3. Using Default Azure Log Analytics and no-proxy # bash enable-monitoring.sh --resource-id --kube-context -# 3. Using Default Azure Log Analytics and with proxy endpoint configuration +# 4. Using Default Azure Log Analytics and with proxy endpoint configuration # bash enable-monitoring.sh --resource-id --kube-context --proxy - -# 4. Using Existing Azure Log Analytics and no-proxy +# 5. Using Existing Azure Log Analytics and no-proxy # bash enable-monitoring.sh --resource-id --kube-context --workspace-id -# 5. Using Existing Azure Log Analytics and proxy +# 6. Using Existing Azure Log Analytics and proxy # bash enable-monitoring.sh --resource-id --kube-context --workspace-id --proxy set -e @@ -95,12 +97,18 @@ workspaceResourceGroup="DefaultResourceGroup-"$workspaceRegionCode workspaceGuid="" workspaceKey="" +# sp details for the login if provided +servicePrincipalClientId="" +servicePrincipalClientSecret="" +servicePrincipalTenantId="" +isUsingServicePrincipal=false + usage() { local basename=`basename $0` echo echo "Enable Azure Monitor for containers:" - echo "$basename --resource-id [--kube-context ] [--workspace-id ] [--proxy ]" + echo "$basename --resource-id [--client-id ] [--client-secret ] [--tenant-id ] [--kube-context ] [--workspace-id ] [--proxy ]" } parse_args() @@ -120,8 +128,12 @@ for arg in "$@"; do "--kube-context") set -- "$@" "-k" ;; "--workspace-id") set -- "$@" "-w" ;; "--proxy") set -- "$@" "-p" ;; + "--client-id") set -- "$@" "-c" ;; + "--client-secret") set -- "$@" "-s" ;; + "--tenant-id") set -- "$@" "-t" ;; "--helm-repo-name") set -- "$@" "-n" ;; "--helm-repo-url") set -- "$@" "-u" ;; + "--container-log-volume") set -- "$@" "-v" ;; "--"*) usage ;; *) set -- "$@" "$arg" esac @@ -129,7 +141,7 @@ done local OPTIND opt -while getopts 'hk:r:w:p:n:u:' opt; do +while getopts 'hk:r:w:p:c:s:t:n:u:v:' opt; do case "$opt" in h) usage @@ -155,6 +167,21 @@ while getopts 'hk:r:w:p:n:u:' opt; do echo "proxyEndpoint is $OPTARG" ;; + c) + servicePrincipalClientId="$OPTARG" + echo "servicePrincipalClientId is $OPTARG" + ;; + + s) + servicePrincipalClientSecret="$OPTARG" + echo "clientSecret is *****" + ;; + + t) + servicePrincipalTenantId="$OPTARG" + echo "service principal tenantId is $OPTARG" + ;; + n) helmRepoName="$OPTARG" echo "helm repo name is $OPTARG" @@ -277,6 +304,11 @@ if [ ! -z "$proxyEndpoint" ]; then fi fi +if [ ! -z "$servicePrincipalClientId" -a ! -z "$servicePrincipalClientSecret" -a ! 
-z "$servicePrincipalTenantId" ]; then + echo "using service principal creds (clientId, secret and tenantId) for azure login since provided" + isUsingServicePrincipal=true +fi + } configure_to_public_cloud() @@ -309,7 +341,9 @@ create_default_log_analytics_workspace() # extract subscription from cluster resource id local subscriptionId="$(echo $clusterResourceId | cut -d'/' -f3)" - local clusterRegion=$(az resource show --ids ${clusterResourceId} --query location) + local clusterRegion=$(az resource show --ids ${clusterResourceId} --query location -o tsv) + # convert cluster region to lower case + clusterRegion=$(echo $clusterRegion | tr "[:upper:]" "[:lower:]") echo "cluster region:" $clusterRegion # mapping fors for default Azure Log Analytics workspace @@ -464,6 +498,10 @@ install_helm_chart() echo "installing Azure Monitor for containers HELM chart on to the cluster with kubecontext:${kubeconfigContext} ..." fi + echo "getting the region of the cluster" + clusterRegion=$(az resource show --ids ${clusterResourceId} --query location) + echo "cluster region is : ${clusterRegion}" + echo "adding helm repo:" $helmRepoName helm repo add $helmRepoName $helmRepoUrl @@ -474,18 +512,18 @@ install_helm_chart() echo "using proxy endpoint since proxy configuration passed in" if [ -z "$kubeconfigContext" ]; then echo "using current kube-context since --kube-context/-k parameter not passed in" - helm upgrade --install azmon-containers-release-1 --set omsagent.proxy=$proxyEndpoint,omsagent.secret.wsid=$workspaceGuid,omsagent.secret.key=$workspaceKey,omsagent.env.clusterId=$clusterResourceId $helmRepoName/$helmChartName + helm upgrade --install azmon-containers-release-1 --set omsagent.proxy=$proxyEndpoint,omsagent.secret.wsid=$workspaceGuid,omsagent.secret.key=$workspaceKey,omsagent.env.clusterId=$clusterResourceId,omsagent.env.clusterRegion=$clusterRegion $helmRepoName/$helmChartName else echo "using --kube-context:${kubeconfigContext} since passed in" - helm upgrade --install azmon-containers-release-1 --set omsagent.proxy=$proxyEndpoint,omsagent.secret.wsid=$workspaceGuid,omsagent.secret.key=$workspaceKey,omsagent.env.clusterId=$clusterResourceId $helmRepoName/$helmChartName --kube-context ${kubeconfigContext} + helm upgrade --install azmon-containers-release-1 --set omsagent.proxy=$proxyEndpoint,omsagent.secret.wsid=$workspaceGuid,omsagent.secret.key=$workspaceKey,omsagent.env.clusterId=$clusterResourceId,omsagent.env.clusterRegion=$clusterRegion $helmRepoName/$helmChartName --kube-context ${kubeconfigContext} fi else if [ -z "$kubeconfigContext" ]; then echo "using current kube-context since --kube-context/-k parameter not passed in" - helm upgrade --install azmon-containers-release-1 --set omsagent.secret.wsid=$workspaceGuid,omsagent.secret.key=$workspaceKey,omsagent.env.clusterId=$clusterResourceId $helmRepoName/$helmChartName + helm upgrade --install azmon-containers-release-1 --set omsagent.secret.wsid=$workspaceGuid,omsagent.secret.key=$workspaceKey,omsagent.env.clusterId=$clusterResourceId,omsagent.env.clusterRegion=$clusterRegion $helmRepoName/$helmChartName else echo "using --kube-context:${kubeconfigContext} since passed in" - helm upgrade --install azmon-containers-release-1 --set omsagent.secret.wsid=$workspaceGuid,omsagent.secret.key=$workspaceKey,omsagent.env.clusterId=$clusterResourceId $helmRepoName/$helmChartName --kube-context ${kubeconfigContext} + helm upgrade --install azmon-containers-release-1 --set 
omsagent.secret.wsid=$workspaceGuid,omsagent.secret.key=$workspaceKey,omsagent.env.clusterId=$clusterResourceId,omsagent.env.clusterRegion=$clusterRegion $helmRepoName/$helmChartName --kube-context ${kubeconfigContext} fi fi @@ -495,8 +533,13 @@ install_helm_chart() login_to_azure() { - echo "login to the azure interactively" - az login --use-device-code + if [ "$isUsingServicePrincipal" = true ] ; then + echo "login to the azure using provided service principal creds" + az login --service-principal --username $servicePrincipalClientId --password $servicePrincipalClientSecret --tenant $servicePrincipalTenantId + else + echo "login to the azure interactively" + az login --use-device-code + fi } set_azure_subscription() diff --git a/scripts/onboarding/solution-onboarding.md b/scripts/onboarding/solution-onboarding.md index 045738762..13e76530d 100644 --- a/scripts/onboarding/solution-onboarding.md +++ b/scripts/onboarding/solution-onboarding.md @@ -6,8 +6,8 @@ You can either use the Azure Powershell or Azure cli to deploy the solution. If you are not familiar with the concepts of deploying resources using a template with PowerShell, see [Deploy resources with Resource Manager templates and Azure PowerShell](https://review.docs.microsoft.com/en-us/azure/azure-resource-manager/resource-group-template-deploy) 1. Get the below template files to your local computer. - - Template file [azuremonitor-containerSolution.json](https://github.com/Microsoft/OMS-docker/blob/ci_feature_prod/docs/templates/azuremonitor-containerSolution.json) - - TemplateParams file [azuremonitor-containerSolutionParams.json](https://github.com/Microsoft/OMS-docker/blob/ci_feature_prod/docs/templates/azuremonitor-containerSolutionParams.json) + - Template file [azuremonitor-containerSolution.json](https://raw.githubusercontent.com/microsoft/Docker-Provider/ci_prod/scripts/onboarding/templates/azuremonitor-containerSolution.json) + - TemplateParams file [azuremonitor-containerSolutionParams.json](https://raw.githubusercontent.com/microsoft/Docker-Provider/ci_prod/scripts/onboarding/templates/azuremonitor-containerSolutionParams.json) 2. Edit the TemplateParams file in your local computer. * workspaceResourceId parameter : - Replace `` with Azure subscriptionID for your Workspace diff --git a/scripts/preview/health/HealthAgentOnboarding.ps1 b/scripts/preview/health/HealthAgentOnboarding.ps1 index 881dd2549..9ce8eca74 100644 --- a/scripts/preview/health/HealthAgentOnboarding.ps1 +++ b/scripts/preview/health/HealthAgentOnboarding.ps1 @@ -339,7 +339,7 @@ if ($false -eq $isSolutionOnboarded) { try { New-AzResourceGroupDeployment -Name $DeploymentName ` -ResourceGroupName $workspaceResourceGroupName ` - -TemplateUri https://raw.githubusercontent.com/Microsoft/OMS-docker/ci_feature/docs/templates/azuremonitor-containerSolution.json ` + -TemplateUri https://raw.githubusercontent.com/microsoft/Docker-Provider/ci_prod/scripts/onboarding/templates/azuremonitor-containerSolution.json ` -TemplateParameterObject $Parameters -ErrorAction Stop` diff --git a/scripts/troubleshoot/README.md b/scripts/troubleshoot/README.md index d4e2e9cf4..5ffa07639 100644 --- a/scripts/troubleshoot/README.md +++ b/scripts/troubleshoot/README.md @@ -7,7 +7,7 @@ The table below summarizes known issues you may face while using Azure Monitor f | ---- | --- | | Error Message `No data for selected filters` | It may take some time to establish monitoring data flow for newly created clusters. Please allow at least 10-15 minutes for data to appear for your cluster. 
|
| Error Message `Error retrieving data` | While the Azure Kubernetes Service cluster is setting up for health and performance monitoring, a connection is established between the cluster and an Azure Log Analytics workspace. The Log Analytics workspace is used to store all monitoring data for your cluster. This error may occur when your Log Analytics workspace has been deleted or lost. To find your Log Analytics workspace, go [here](https://docs.microsoft.com/en-us/azure/log-analytics/log-analytics-manage-access) and verify that your workspace is available. If the workspace is missing, you will need to re-onboard Container Health to your cluster. To re-onboard, you will need to [opt out](https://docs.microsoft.com/en-us/azure/azure-monitor/insights/container-insights-optout) of monitoring for the cluster and [onboard](https://docs.microsoft.com/en-us/azure/azure-monitor/insights/container-insights-enable-existing-clusters) again to Container Health. |
-| `Error retrieving data` after adding Container Health through az aks cli | When onboarding using az aks cli, very seldom, Container Health may not be properly onboarded. Please check whether the Container Insights Solution is onboarded. To do this, go to your [Log Analytics workspace](https://docs.microsoft.com/en-us/azure/log-analytics/log-analytics-manage-access) and see if Container Insights Solution is available by going to the "Solutions" tab under General. To resolve this issue, you will need to redeploy the Container Insights Solution. Please follow the instructions on [how to deploy Azure Monitor - container health solution to your Log Analytics workspace. ](https://github.com/Microsoft/OMS-docker/blob/ci_feature_prod/docs/solution-onboarding.md) |
+| `Error retrieving data` after adding Container Health through az aks cli | When onboarding using az aks cli, very seldom, Container Health may not be properly onboarded. Please check whether the Container Insights Solution is onboarded. To do this, go to your [Log Analytics workspace](https://docs.microsoft.com/en-us/azure/log-analytics/log-analytics-manage-access) and see if the Container Insights Solution is available by going to the "Solutions" tab under General. To resolve this issue, you will need to redeploy the Container Insights Solution. Please follow the instructions on [how to deploy the Azure Monitor - container health solution to your Log Analytics workspace.](https://github.com/microsoft/Docker-Provider/blob/ci_prod/scripts/onboarding/solution-onboarding.md) |
 | Failed to `Enable fast alerting experience on basic metrics for this Azure Kubernetes Services cluster` | The action is trying to grant the Monitoring Metrics Publisher role assignment on the cluster resource. The user initiating the process must have access to the **Microsoft.Authorization/roleAssignments/write** permission on the AKS cluster resource scope. Only members of the **Owner** and **User Access Administrator** built-in roles are granted access to this permission. If your security policies require assigning granular level permissions, we recommend you view [custom roles](https://docs.microsoft.com/en-us/azure/role-based-access-control/custom-roles) and assign them to the users who require them. |

 # Azure Red Hat OpenShift Service (ARO)
@@ -36,7 +36,7 @@ Prerequisites:

 # AKS or ARO

-You can use the troubleshooting script provided [here](https://raw.githubusercontent.com/microsoft/OMS-docker/ci_feature_prod/Troubleshoot/TroubleshootError.ps1) to diagnose the problem.
+You can use the troubleshooting script provided [here](https://raw.githubusercontent.com/microsoft/Docker-Provider/ci_prod/scripts/troubleshoot/TroubleshootError.ps1) to diagnose the problem. Steps: - Open powershell using the [cloudshell](https://docs.microsoft.com/en-us/azure/cloud-shell/overview) in the azure portal. @@ -45,8 +45,8 @@ Steps: For Mac OS, refer [install-powershell-core-on-mac](https://docs.microsoft.com/en-us/powershell/scripting/install/installing-powershell-core-on-macos?view=powershell-6) how to install powershell - Make sure that you're using powershell (selected by default) - Run the following command to change home directory - `cd ~` -- Run the following command to download the script - `curl -LO https://raw.githubusercontent.com/microsoft/OMS-docker/ci_feature/Troubleshoot/TroubleshootError.ps1` - > Note: In some versions of Powershell above CURL command may not work in such cases, you can try `curl https://raw.githubusercontent.com/microsoft/OMS-docker/ci_feature/Troubleshoot/TroubleshootError.ps1 -O TroubleshootError.ps1` +- Run the following command to download the script - `curl -LO https://raw.githubusercontent.com/microsoft/Docker-Provider/ci_prod/scripts/troubleshoot/TroubleshootError.ps1` + > Note: In some versions of Powershell above CURL command may not work in such cases, you can try `curl https://raw.githubusercontent.com/microsoft/Docker-Provider/ci_prod/scripts/troubleshoot/TroubleshootError.ps1 -O TroubleshootError.ps1` - Run the following command to execute the script - `./TroubleshootError.ps1 -ClusterResourceId ` > Note: For AKS, resourceIdoftheCluster should be in this format `/subscriptions//resourceGroups//providers/Microsoft.ContainerService/managedClusters/`.For ARO, should be in this format `/subscriptions//resourceGroups//providers/Microsoft.ContainerService/openShiftManagedClusters/`. - This script will generate a TroubleshootDump.txt which collects detailed information about container health onboarding. @@ -54,10 +54,10 @@ Steps: # Aks-Engine Kubernetes -You can use the troubleshooting script provided [here](https://raw.githubusercontent.com/microsoft/OMS-docker/ci_feature_prod/Troubleshoot/TroubleshootError_AcsEngine.ps1) to diagnose the problem. +You can use the troubleshooting script provided [here](https://raw.githubusercontent.com/microsoft/Docker-Provider/ci_prod/scripts/troubleshoot/TroubleshootError_AcsEngine.ps1) to diagnose the problem. Steps: -- Download [TroubleshootError_AcsEngine.ps1](https://raw.githubusercontent.com/microsoft/OMS-docker/ci_feature_prod/Troubleshoot/TroubleshootError_AcsEngine.ps1), [ContainerInsightsSolution.json](https://raw.githubusercontent.com/microsoft/OMS-docker/ci_feature_prod/Troubleshoot/ContainerInsightsSolution.json) +- Download [TroubleshootError_AcsEngine.ps1](https://raw.githubusercontent.com/microsoft/Docker-Provider/ci_prod/scripts/troubleshoot/TroubleshootError_AcsEngine.ps1), [ContainerInsightsSolution.json](https://raw.githubusercontent.com/microsoft/Docker-Provider/ci_prod/scripts/troubleshoot/ContainerInsightsSolution.json) - Collect Subscription ID, Resource group name of the Aks-Engine Kubernetes cluster - Use the following command to run the script : `.\TroubleshootError_AcsEngine.ps1 -SubscriptionId -ResourceGroupName `. This script will generate a TroubleshootDump.txt which collects detailed information about container health onboarding. 
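Putting the steps above together, a minimal sketch of a troubleshooting session (the subscription ID and resource group name below are placeholders, and pwsh is assumed to be installed):

```
# Hypothetical AKS-Engine troubleshooting run following the steps above.
curl -LO https://raw.githubusercontent.com/microsoft/Docker-Provider/ci_prod/scripts/troubleshoot/TroubleshootError_AcsEngine.ps1
curl -LO https://raw.githubusercontent.com/microsoft/Docker-Provider/ci_prod/scripts/troubleshoot/ContainerInsightsSolution.json
pwsh -Command "./TroubleshootError_AcsEngine.ps1 -SubscriptionId 00000000-0000-0000-0000-000000000000 -ResourceGroupName my-aks-engine-rg"
cat TroubleshootDump.txt    # review the collected onboarding details
```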
diff --git a/scripts/troubleshoot/TroubleshootError.ps1 b/scripts/troubleshoot/TroubleshootError.ps1 index 7f857caa3..754a43e74 100644 --- a/scripts/troubleshoot/TroubleshootError.ps1 +++ b/scripts/troubleshoot/TroubleshootError.ps1 @@ -671,7 +671,7 @@ else { try { New-AzResourceGroupDeployment -Name $DeploymentName ` -ResourceGroupName $workspaceResourceGroupName ` - -TemplateUri https://raw.githubusercontent.com/Microsoft/OMS-docker/ci_feature/docs/templates/azuremonitor-containerSolution.json ` + -TemplateUri https://raw.githubusercontent.com/microsoft/Docker-Provider/ci_prod/scripts/onboarding/templates/azuremonitor-containerSolution.json ` -TemplateParameterObject $Parameters -ErrorAction Stop` Write-Host("") diff --git a/scripts/troubleshoot/TroubleshootError_nonAzureK8s.ps1 b/scripts/troubleshoot/TroubleshootError_nonAzureK8s.ps1 index c7509a940..14b080b23 100644 --- a/scripts/troubleshoot/TroubleshootError_nonAzureK8s.ps1 +++ b/scripts/troubleshoot/TroubleshootError_nonAzureK8s.ps1 @@ -345,7 +345,7 @@ else { try { New-AzResourceGroupDeployment -Name $DeploymentName ` -ResourceGroupName $defaultWorkspaceResourceGroup ` - -TemplateUri https://raw.githubusercontent.com/Microsoft/OMS-docker/ci_feature/docs/templates/azuremonitor-containerSolution.json ` + -TemplateUri https://raw.githubusercontent.com/microsoft/Docker-Provider/ci_prod/scripts/onboarding/templates/azuremonitor-containerSolution.json ` -TemplateParameterObject $Parameters -ErrorAction Stop` Write-Host("") From 2d56087e528a145aeb06b5beb6a60092dfa41e15 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 14 Sep 2020 12:34:33 -0700 Subject: [PATCH 017/194] add missing serviceprincipal in ps scripts (#435) --- scripts/onboarding/managed/disable-monitoring.ps1 | 4 ++-- scripts/onboarding/managed/enable-monitoring.ps1 | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/onboarding/managed/disable-monitoring.ps1 b/scripts/onboarding/managed/disable-monitoring.ps1 index ea66cb3a3..8945f90b6 100644 --- a/scripts/onboarding/managed/disable-monitoring.ps1 +++ b/scripts/onboarding/managed/disable-monitoring.ps1 @@ -226,7 +226,7 @@ Write-Host("Cluster SubscriptionId : '" + $clusterSubscriptionId + "' ") -Foregr if ($isUsingServicePrincipal) { $spSecret = ConvertTo-SecureString -String $servicePrincipalClientSecret -AsPlainText -Force $spCreds = New-Object -TypeName "System.Management.Automation.PSCredential" -ArgumentList $servicePrincipalClientId,$spSecret - Connect-AzAccount -Credential $spCreds -Tenant $tenantId -Subscription $clusterSubscriptionId + Connect-AzAccount -ServicePrincipal -Credential $spCreds -Tenant $tenantId -Subscription $clusterSubscriptionId } try { @@ -249,7 +249,7 @@ if ($null -eq $account.Account) { if ($isUsingServicePrincipal) { $spSecret = ConvertTo-SecureString -String $servicePrincipalClientSecret -AsPlainText -Force $spCreds = New-Object -TypeName "System.Management.Automation.PSCredential" -ArgumentList $servicePrincipalClientId,$spSecret - Connect-AzAccount -Credential $spCreds -Tenant $tenantId -Subscription $clusterSubscriptionId + Connect-AzAccount -ServicePrincipal -Credential $spCreds -Tenant $tenantId -Subscription $clusterSubscriptionId } else { Write-Host("Please login...") Connect-AzAccount -subscriptionid $clusterSubscriptionId diff --git a/scripts/onboarding/managed/enable-monitoring.ps1 b/scripts/onboarding/managed/enable-monitoring.ps1 index b734ba347..338de6cbc 100644 --- a/scripts/onboarding/managed/enable-monitoring.ps1 +++ 
b/scripts/onboarding/managed/enable-monitoring.ps1 @@ -254,7 +254,7 @@ Write-Host("Cluster SubscriptionId : '" + $clusterSubscriptionId + "' ") -Foregr if ($isUsingServicePrincipal) { $spSecret = ConvertTo-SecureString -String $servicePrincipalClientSecret -AsPlainText -Force $spCreds = New-Object -TypeName "System.Management.Automation.PSCredential" -ArgumentList $servicePrincipalClientId,$spSecret - Connect-AzAccount -Credential $spCreds -Tenant $tenantId -Subscription $clusterSubscriptionId + Connect-AzAccount -ServicePrincipal -Credential $spCreds -Tenant $tenantId -Subscription $clusterSubscriptionId } try { @@ -276,7 +276,7 @@ if ($null -eq $account.Account) { if ($isUsingServicePrincipal) { $spSecret = ConvertTo-SecureString -String $servicePrincipalClientSecret -AsPlainText -Force $spCreds = New-Object -TypeName "System.Management.Automation.PSCredential" -ArgumentList $servicePrincipalClientId,$spSecret - Connect-AzAccount -Credential $spCreds -Tenant $tenantId -Subscription $clusterSubscriptionId + Connect-AzAccount -ServicePrincipal -Credential $spCreds -Tenant $tenantId -Subscription $clusterSubscriptionId } else { Write-Host("Please login...") Connect-AzAccount -subscriptionid $clusterSubscriptionId From a28aaf025f91957f193121e66fbfb1c1f9d6abe4 Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Tue, 15 Sep 2020 11:46:14 -0700 Subject: [PATCH 018/194] fix telemetry bug (#436) --- source/plugins/ruby/out_mdm.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/plugins/ruby/out_mdm.rb b/source/plugins/ruby/out_mdm.rb index b28c17034..c4cc46dd7 100644 --- a/source/plugins/ruby/out_mdm.rb +++ b/source/plugins/ruby/out_mdm.rb @@ -272,7 +272,7 @@ def send_to_mdm(post_body) @last_telemetry_sent_time = Time.now end rescue Net::HTTPServerException => e - if !response.nil && !response.body.nil? #body will have actual error + if !response.nil? && !response.body.nil? #body will have actual error @log.info "Failed to Post Metrics to MDM : #{e} Response.body: #{response.body}" else @log.info "Failed to Post Metrics to MDM : #{e} Response: #{response}" From 0062b32da17eece46f6e754c0f8a35ac57c75c92 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Wed, 16 Sep 2020 10:59:03 -0700 Subject: [PATCH 019/194] Gangams/readmeupdates non aks 09162020 (#437) * changes for ciprod09162020 non-aks release * fix script to handle cross sub scenario * fix minor comment * fix date in version file * fix pr comments --- ReleaseNotes.md | 12 +++++++++++- build/version | 4 ++-- charts/azuremonitor-containers/values.yaml | 6 +++--- kubernetes/linux/Dockerfile | 2 +- kubernetes/omsagent.yaml | 12 ++++++------ kubernetes/windows/Dockerfile | 2 +- scripts/onboarding/managed/enable-monitoring.sh | 2 +- 7 files changed, 25 insertions(+), 15 deletions(-) diff --git a/ReleaseNotes.md b/ReleaseNotes.md index 0f1d932a8..547d00573 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -11,6 +11,17 @@ additional questions or comments. 
Note : The agent version(s) below have dates (ciprod), which indicate the agent build dates (not release dates)

+### 09/16/2020 -
+> Note: This agent release is targeted ONLY at non-AKS clusters, via the Azure Monitor for containers HELM chart update
+##### Version microsoft/oms:ciprod09162020 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod09162020 (linux)
+##### Version microsoft/oms:win-ciprod09162020 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod09162020 (windows)
+##### Code change log
+- Collection of Azure Network Policy Manager Basic and Advanced metrics
+- Add support in the Windows Agent for container log collection from CRI runtimes such as ContainerD
+- Alertable metrics support for Arc K8s clusters, for parity with AKS
+- Support for multiple container log mount paths when docker is updated through knode
+- Bug fix related to MDM telemetry
+
 ### 08/07/2020 -
 ##### Version microsoft/oms:ciprod08072020 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod08072020 (linux)
 ##### Version microsoft/oms:win-ciprod08072020 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod08072020 (windows)
@@ -26,7 +37,6 @@ Note : The agent version(s) below has dates (ciprod), which indicate t
 - Add region check before sending alertable metrics to MDM
 - Telemetry fix for agent telemetry for sov. clouds
-
 ### 07/15/2020 -
 ##### Version microsoft/oms:ciprod07152020 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod07152020 (linux)
 ##### Version microsoft/oms:win-ciprod05262020-2 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod05262020-2 (windows)
diff --git a/build/version b/build/version
index f26973116..b53b0dcfb 100644
--- a/build/version
+++ b/build/version
@@ -5,8 +5,8 @@ CONTAINER_BUILDVERSION_MAJOR=10
 CONTAINER_BUILDVERSION_MINOR=0
 CONTAINER_BUILDVERSION_PATCH=0
-CONTAINER_BUILDVERSION_BUILDNR=4
-CONTAINER_BUILDVERSION_DATE=20200805
+CONTAINER_BUILDVERSION_BUILDNR=5
+CONTAINER_BUILDVERSION_DATE=20200916
 CONTAINER_BUILDVERSION_STATUS=Developer_Build

#-------------------------------- End of File -----------------------------------
diff --git a/charts/azuremonitor-containers/values.yaml b/charts/azuremonitor-containers/values.yaml
index 4d0d7f8f2..9c48cf9fb 100644
--- a/charts/azuremonitor-containers/values.yaml
+++ b/charts/azuremonitor-containers/values.yaml
@@ -12,10 +12,10 @@ Azure:
 omsagent:
   image:
     repo: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod"
-    tag: "ciprod08072020"
-    tagWindows: "win-ciprod08072020"
+    tag: "ciprod09162020"
+    tagWindows: "win-ciprod09162020"
     pullPolicy: IfNotPresent
-    dockerProviderVersion: "10.0.0-4"
+    dockerProviderVersion: "10.0.0-5"
     agentVersion: "1.10.0.1"
 ## To get your workspace id and key do the following
 ## You can create an Azure Log Analytics workspace from portal.azure.com and get its ID & PRIMARY KEY from the 'Advanced Settings' tab in the Ux.
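The `Azure.Cluster.ResourceId` and `Azure.Cluster.Region` values introduced above are the fallbacks that the daemonset, deployment, secret, and configmap templates in the earlier hunks check when `omsagent.env.clusterId` is empty; the Azure Arc K8s RP populates them during extension install, but they can also be supplied by hand. A minimal sketch of such a manual install, assuming the `incubator` repo alias has been added and `$workspaceGuid` and `$workspaceKey` already hold the workspace GUID and primary key (the resource ID and region shown are placeholders):

```
# Hypothetical manual chart install passing the new Azure.Cluster values explicitly.
helm upgrade --install azmon-containers-release-1 \
  --set omsagent.secret.wsid=$workspaceGuid,omsagent.secret.key=$workspaceKey \
  --set Azure.Cluster.ResourceId="/subscriptions/00000000-0000-0000-0000-000000000000/resourceGroups/my-rg/providers/Microsoft.Kubernetes/connectedClusters/my-arc-cluster" \
  --set Azure.Cluster.Region=eastus2 \
  incubator/azuremonitor-containers
```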
diff --git a/kubernetes/linux/Dockerfile b/kubernetes/linux/Dockerfile index bc27a5384..ee35cd556 100644 --- a/kubernetes/linux/Dockerfile +++ b/kubernetes/linux/Dockerfile @@ -2,7 +2,7 @@ FROM ubuntu:18.04 MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="Azure Monitor for containers" -ARG IMAGE_TAG=ciprod08072020 +ARG IMAGE_TAG=ciprod09162020 ENV AGENT_VERSION ${IMAGE_TAG} ENV tmpdir /opt ENV APPLICATIONINSIGHTS_AUTH NzAwZGM5OGYtYTdhZC00NThkLWI5NWMtMjA3ZjM3NmM3YmRi diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 947620ebc..b71a95227 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -337,13 +337,13 @@ spec: tier: node annotations: agentVersion: "1.10.0.1" - dockerProviderVersion: "10.0.0-4" + dockerProviderVersion: "10.0.0-5" schema-versions: "v1" spec: serviceAccountName: omsagent containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod08072020" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod09162020" imagePullPolicy: IfNotPresent resources: limits: @@ -493,13 +493,13 @@ spec: rsName: "omsagent-rs" annotations: agentVersion: "1.10.0.1" - dockerProviderVersion: "10.0.0-4" + dockerProviderVersion: "10.0.0-5" schema-versions: "v1" spec: serviceAccountName: omsagent containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod08072020" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod09162020" imagePullPolicy: IfNotPresent resources: limits: @@ -639,13 +639,13 @@ spec: tier: node-win annotations: agentVersion: "1.10.0.1" - dockerProviderVersion: "10.0.0-4" + dockerProviderVersion: "10.0.0-5" schema-versions: "v1" spec: serviceAccountName: omsagent containers: - name: omsagent-win - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod08072020" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod09162020" imagePullPolicy: IfNotPresent resources: limits: diff --git a/kubernetes/windows/Dockerfile b/kubernetes/windows/Dockerfile index 70a5f6045..ca89d1c80 100644 --- a/kubernetes/windows/Dockerfile +++ b/kubernetes/windows/Dockerfile @@ -3,7 +3,7 @@ MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="Azure Monitor for containers" -ARG IMAGE_TAG=win-ciprod08072020 +ARG IMAGE_TAG=win-ciprod09162020 # Do not split this into multiple RUN! 
# Docker creates a layer for every RUN-Statement diff --git a/scripts/onboarding/managed/enable-monitoring.sh b/scripts/onboarding/managed/enable-monitoring.sh index 17c075725..4142dbf6c 100644 --- a/scripts/onboarding/managed/enable-monitoring.sh +++ b/scripts/onboarding/managed/enable-monitoring.sh @@ -620,7 +620,7 @@ add_container_insights_solution $workspaceResourceId # get workspace guid and key get_workspace_guid_and_key $workspaceResourceId -if [ "$isClusterAndWorkspaceInSameSubscription" = true ] ; then +if [ "$isClusterAndWorkspaceInSameSubscription" = false ] ; then echo "switch to cluster subscription id as active subscription for cli: ${clusterSubscriptionId}" set_azure_subscription $clusterSubscriptionId fi From 1a7ef1cfbfe611e8d14218167c393a2becafc8f9 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Wed, 16 Sep 2020 14:53:21 -0700 Subject: [PATCH 020/194] Gangams/fix weird conflicts (#439) * separate build yamls for ci_prod branch (#415) (#416) * [Merge] dev to prod for ciprod08072020 release (#424) * separate build yamls for ci_prod branch (#415) * re-enable adx path (#420) * Gangams/release changes (#419) * updates related to release * updates related to release * fix the incorrect version * fix pr feedback * fix some typos in the release notes * fix for zero filled metrics (#423) * consolidate windows agent image docker files (#422) * consolidate windows agent image docker files * revert docker file consolidation * revert readme updates * merge back windows dockerfiles * image tag update Co-authored-by: Vishwanath Co-authored-by: rashmichandrashekar Co-authored-by: Vishwanath Co-authored-by: rashmichandrashekar --- ReleaseNotes.md | 1 + 1 file changed, 1 insertion(+) diff --git a/ReleaseNotes.md b/ReleaseNotes.md index 547d00573..499c99f02 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -37,6 +37,7 @@ Note : The agent version(s) below has dates (ciprod), which indicate t - Add region check before sending alertable metrics to MDM - Telemetry fix for agent telemetry for sov. 
clouds + ### 07/15/2020 - ##### Version microsoft/oms:ciprod07152020 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod07152020 (linux) ##### Version microsoft/oms:win-ciprod05262020-2 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod05262020-2 (windows) From bf75bf04ac28f1462ea358ea4762610b0cf70553 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 21 Sep 2020 10:07:52 -0700 Subject: [PATCH 021/194] fix quote issue for the region (#441) --- scripts/onboarding/managed/enable-monitoring.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/onboarding/managed/enable-monitoring.sh b/scripts/onboarding/managed/enable-monitoring.sh index 4142dbf6c..226fd978b 100644 --- a/scripts/onboarding/managed/enable-monitoring.sh +++ b/scripts/onboarding/managed/enable-monitoring.sh @@ -499,7 +499,7 @@ install_helm_chart() fi echo "getting the region of the cluster" - clusterRegion=$(az resource show --ids ${clusterResourceId} --query location) + clusterRegion=$(az resource show --ids ${clusterResourceId} --query location -o tsv) echo "cluster region is : ${clusterRegion}" echo "adding helm repo:" $helmRepoName From 6287724c89ae6e8d0ac74789e472c99fed28bb48 Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Mon, 21 Sep 2020 14:16:21 -0700 Subject: [PATCH 022/194] fix cpucapacity/limit bug (#442) --- source/plugins/ruby/KubernetesApiClient.rb | 3 +++ 1 file changed, 3 insertions(+) diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index 36dcdd8c6..073eb0417 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -719,6 +719,9 @@ def getMetricNumericValue(metricName, metricVal) if (metricValue.end_with?("m")) metricValue.chomp!("m") metricValue = Float(metricValue) * 1000.0 ** 2 + elsif (metricValue.end_with?("k")) + metricValue.chomp!("k") + metricValue = Float(metricValue) * 1000.0 else #assuming no units specified, it is cores that we are converting to nanocores (the below conversion will fail for other unsupported 'units') metricValue = Float(metricValue) * 1000.0 ** 3 end From bd30a47ecb9b6ea5867fbd9ceff4810d3b5d4431 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Wed, 23 Sep 2020 09:01:24 -0700 Subject: [PATCH 023/194] grwehner/pv-usage-metrics (#431) - Send persistent volume usage and capacity metrics to LA for PVs with PVCs at the pod level; config to include or exclude kube-system namespace. - Send PV usage percentage to MDM if over the configurable threshold. - Add PV usage recommended alert template. 
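The MDM half of this change only emits a time series once usage crosses the configurable threshold (60% by default, per the constants added below); under the threshold the filter returns an empty record set and no pvUsageExceededPercentage series is created, while the Log Analytics path always receives pvUsedBytes. A worked sketch of the gating arithmetic implemented in filter_cadvisor2mdm.rb, with illustrative byte values:

    # hypothetical volume: 7 GiB used on a 10 GiB PV
    usedBytes=7516192768; capacityBytes=10737418240; threshold=60.0
    pct=$(awk -v u="$usedBytes" -v c="$capacityBytes" 'BEGIN { printf "%.2f", (u * 100.0) / c }')
    # pct is 70.00, which meets the >= 60.0 check, so a pvUsageExceededPercentage sample is sent
    awk -v p="$pct" -v t="$threshold" 'BEGIN { exit !(p >= t) }' && echo "emit pvUsageExceededPercentage=$pct"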
--- .../PVUsagePercentage.json | 174 ++++++++++++++++++ build/linux/installer/conf/container.conf | 2 +- build/linux/installer/conf/kube.conf | 2 +- .../installer/datafiles/base_container.data | 1 + .../scripts/tomlparser-mdm-metrics-config.rb | 32 +++- .../tomlparser-metric-collection-config.rb | 71 +++++++ kubernetes/container-azm-ms-agentconfig.yaml | 15 ++ kubernetes/linux/main.sh | 8 + kubernetes/omsagent.yaml | 2 +- .../plugins/ruby/CAdvisorMetricsAPIClient.rb | 77 ++++++++ source/plugins/ruby/MdmAlertTemplates.rb | 32 ++++ source/plugins/ruby/MdmMetricsGenerator.rb | 36 ++++ source/plugins/ruby/constants.rb | 11 ++ source/plugins/ruby/filter_cadvisor2mdm.rb | 76 +++++++- source/plugins/ruby/in_cadvisor_perf.rb | 1 + source/plugins/ruby/in_win_cadvisor_perf.rb | 1 + 16 files changed, 533 insertions(+), 8 deletions(-) create mode 100644 alerts/recommended_alerts_ARM/PVUsagePercentage.json create mode 100644 build/linux/installer/scripts/tomlparser-metric-collection-config.rb diff --git a/alerts/recommended_alerts_ARM/PVUsagePercentage.json b/alerts/recommended_alerts_ARM/PVUsagePercentage.json new file mode 100644 index 000000000..e6cdbee15 --- /dev/null +++ b/alerts/recommended_alerts_ARM/PVUsagePercentage.json @@ -0,0 +1,174 @@ +{ + "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "parameters": { + "alertName": { + "type": "string", + "minLength": 1, + "metadata": { + "description": "Name of the alert" + } + }, + "alertDescription": { + "type": "string", + "defaultValue": "This is a metric alert", + "metadata": { + "description": "Description of alert" + } + }, + "alertSeverity": { + "type": "int", + "defaultValue": 3, + "allowedValues": [ + 0, + 1, + 2, + 3, + 4 + ], + "metadata": { + "description": "Severity of alert {0,1,2,3,4}" + } + }, + "isEnabled": { + "type": "bool", + "defaultValue": true, + "metadata": { + "description": "Specifies whether the alert is enabled" + } + }, + "clusterResourceId": { + "type": "string", + "minLength": 1, + "metadata": { + "description": "Full Resource ID of the kubernetes cluster emitting the metric that will be used for the comparison. For example /subscriptions/00000000-0000-0000-0000-0000-00000000/resourceGroups/ResourceGroupName/providers/Microsoft.ContainerService/managedClusters/cluster-xyz" + } + }, + "operator": { + "type": "string", + "defaultValue": "GreaterThan", + "allowedValues": [ + "Equals", + "NotEquals", + "GreaterThan", + "GreaterThanOrEqual", + "LessThan", + "LessThanOrEqual" + ], + "metadata": { + "description": "Operator comparing the current value with the threshold value." + } + }, + "threshold": { + "type": "int", + "defaultValue": 80, + "metadata": { + "description": "The threshold value at which the alert is activated." + }, + "minValue": 1, + "maxValue": 100 + }, + "timeAggregation": { + "type": "string", + "defaultValue": "Average", + "allowedValues": [ + "Average", + "Minimum", + "Maximum", + "Count" + ], + "metadata": { + "description": "How the data that is collected should be combined over time." + } + }, + "windowSize": { + "type": "string", + "defaultValue": "PT5M", + "allowedValues": [ + "PT1M", + "PT5M", + "PT15M", + "PT30M", + "PT1H", + "PT6H", + "PT12H", + "PT24H" + ], + "metadata": { + "description": "Period of time used to monitor alert activity based on the threshold. Must be between one minute and one day. ISO 8601 duration format." 
+ } + }, + "evaluationFrequency": { + "type": "string", + "defaultValue": "PT1M", + "allowedValues": [ + "PT1M", + "PT5M", + "PT15M", + "PT30M", + "PT1H" + ], + "metadata": { + "description": "how often the metric alert is evaluated represented in ISO 8601 duration format" + } + }, + "actionGroupId": { + "type": "string", + "defaultValue": "", + "metadata": { + "description": "The ID of the action group that is triggered when the alert is activated or deactivated" + } + } + }, + "variables": {}, + "resources": [ + { + "name": "[parameters('alertName')]", + "type": "Microsoft.Insights/metricAlerts", + "location": "global", + "apiVersion": "2018-03-01", + "tags": {}, + "properties": { + "description": "[parameters('alertDescription')]", + "severity": "[parameters('alertSeverity')]", + "enabled": "[parameters('isEnabled')]", + "scopes": [ + "[parameters('clusterResourceId')]" + ], + "evaluationFrequency": "[parameters('evaluationFrequency')]", + "windowSize": "[parameters('windowSize')]", + "criteria": { + "odata.type": "Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria", + "allOf": [ + { + "name": "1st criterion", + "metricName": "pvUsageExceededPercentage", + "metricNamespace": "Insights.Container/persistentvolumes", + "dimensions": [ + { + "name": "kubernetesNamespace", + "operator": "Include", + "values": [ + "*" + ] + }, + { + "name": "podName", + "operator": "Include", + "values": [ + "*" + ] + } + ], + "operator": "[parameters('operator')]", + "threshold": "[parameters('threshold')]", + "timeAggregation": "[parameters('timeAggregation')]", + "skipMetricValidation": true + } + ] + }, + "actions": "[if(empty(parameters('actionGroupId')), json('null'), json(concat('[{\"actionGroupId\": \"',parameters('actionGroupId'),'\"}]')))]" + } + } + ] +} diff --git a/build/linux/installer/conf/container.conf b/build/linux/installer/conf/container.conf index f02ec0131..e55c62fbc 100644 --- a/build/linux/installer/conf/container.conf +++ b/build/linux/installer/conf/container.conf @@ -46,7 +46,7 @@ type filter_cadvisor2mdm custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast - metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,memoryRssBytes + metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,memoryRssBytes,pvUsedBytes log_level info diff --git a/build/linux/installer/conf/kube.conf b/build/linux/installer/conf/kube.conf index 9ada8425f..ba40b7a35 100644 --- a/build/linux/installer/conf/kube.conf +++ b/build/linux/installer/conf/kube.conf @@ -74,7 +74,7 @@ type filter_cadvisor2mdm custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast - metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes + metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,pvUsedBytes log_level info diff --git a/build/linux/installer/datafiles/base_container.data b/build/linux/installer/datafiles/base_container.data index 87b89b14c..ca2538b79 100644 --- a/build/linux/installer/datafiles/base_container.data +++ b/build/linux/installer/datafiles/base_container.data @@ -120,6 +120,7 @@ MAINTAINER: 'Microsoft Corporation' /opt/livenessprobe.sh; build/linux/installer/scripts/livenessprobe.sh; 755; root; 
root /opt/tomlparser-prom-customconfig.rb; build/linux/installer/scripts/tomlparser-prom-customconfig.rb; 755; root; root /opt/tomlparser-mdm-metrics-config.rb; build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb; 755; root; root +/opt/tomlparser-metric-collection-config.rb; build/linux/installer/scripts/tomlparser-metric-collection-config.rb; 755; root; root /opt/tomlparser-health-config.rb; build/linux/installer/scripts/tomlparser-health-config.rb; 755; root; root /opt/tomlparser.rb; build/common/installer/scripts/tomlparser.rb; 755; root; root diff --git a/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb b/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb index 1c01dd8c6..345c51633 100644 --- a/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb +++ b/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb @@ -12,6 +12,7 @@ @percentageCpuUsageThreshold = Constants::DEFAULT_MDM_CPU_UTILIZATION_THRESHOLD @percentageMemoryRssThreshold = Constants::DEFAULT_MDM_MEMORY_RSS_THRESHOLD @percentageMemoryWorkingSetThreshold = Constants::DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD +@percentagePVUsageThreshold = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD # Use parser to parse the configmap toml file to a ruby structure def parseConfigMap @@ -35,7 +36,7 @@ def parseConfigMap # Use the ruby structure created after config parsing to set the right values to be used for MDM metric configuration settings def populateSettingValuesFromConfigMap(parsedConfig) if !parsedConfig.nil? && !parsedConfig[:alertable_metrics_configuration_settings].nil? - # Get mdm metrics config settings for resource utilization + # Get mdm metrics config settings for container resource utilization begin resourceUtilization = parsedConfig[:alertable_metrics_configuration_settings][:container_resource_utilization_thresholds] if !resourceUtilization.nil? @@ -66,7 +67,7 @@ def populateSettingValuesFromConfigMap(parsedConfig) puts "config::Non floating point value or value not convertible to float specified for Memory Working Set threshold, using default " @percentageMemoryWorkingSetThreshold = Constants::DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD end - puts "config::Using config map settings for MDM metric configuration settings for resource utilization" + puts "config::Using config map settings for MDM metric configuration settings for container resource utilization" end rescue => errorStr ConfigParseErrorLogger.logError("Exception while reading config map settings for MDM metric configuration settings for resource utilization - #{errorStr}, using defaults, please check config map for errors") @@ -74,6 +75,32 @@ def populateSettingValuesFromConfigMap(parsedConfig) @percentageMemoryRssThreshold = Constants::DEFAULT_MDM_MEMORY_RSS_THRESHOLD @percentageMemoryWorkingSetThreshold = Constants::DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD end + + # Get mdm metrics config settings for PV utilization + begin + isUsingPVThresholdConfig = false + pvUtilizationThresholds = parsedConfig[:alertable_metrics_configuration_settings][:pv_utilization_thresholds] + if !pvUtilizationThresholds.nil? + pvUsageThreshold = pvUtilizationThresholds[:pv_usage_threshold_percentage] + if !pvUsageThreshold.nil? + pvUsageThresholdFloat = pvUsageThreshold.to_f + if pvUsageThresholdFloat.kind_of? 
Float + @percentagePVUsageThreshold = pvUsageThresholdFloat + isUsingPVThresholdConfig = true + end + end + end + + if isUsingPVThresholdConfig + puts "config::Using config map settings for MDM metric configuration settings for PV utilization" + else + puts "config::Non floating point value or value not convertible to float specified for PV threshold, using default " + @percentagePVUsageThreshold = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD + end + rescue => errorStr + ConfigParseErrorLogger.logError("Exception while reading config map settings for MDM metric configuration settings for PV utilization - #{errorStr}, using defaults, please check config map for errors") + @percentagePVUsageThreshold = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD + end end end @@ -97,6 +124,7 @@ def populateSettingValuesFromConfigMap(parsedConfig) file.write("export AZMON_ALERT_CONTAINER_CPU_THRESHOLD=#{@percentageCpuUsageThreshold}\n") file.write("export AZMON_ALERT_CONTAINER_MEMORY_RSS_THRESHOLD=#{@percentageMemoryRssThreshold}\n") file.write("export AZMON_ALERT_CONTAINER_MEMORY_WORKING_SET_THRESHOLD=\"#{@percentageMemoryWorkingSetThreshold}\"\n") + file.write("export AZMON_ALERT_PV_USAGE_THRESHOLD=#{@percentagePVUsageThreshold}\n") # Close file after writing all MDM setting environment variables file.close puts "****************End MDM Metrics Config Processing********************" diff --git a/build/linux/installer/scripts/tomlparser-metric-collection-config.rb b/build/linux/installer/scripts/tomlparser-metric-collection-config.rb new file mode 100644 index 000000000..40d87b7f1 --- /dev/null +++ b/build/linux/installer/scripts/tomlparser-metric-collection-config.rb @@ -0,0 +1,71 @@ +#!/usr/local/bin/ruby +# frozen_string_literal: true + +require_relative "tomlrb" +require_relative "ConfigParseErrorLogger" +require_relative "microsoft/omsagent/plugin/constants" + +@configMapMountPath = "/etc/config/settings/metric_collection_settings" +@configVersion = "" +@configSchemaVersion = "" + +# Setting default values which will be used in case they are not set in the configmap or if configmap doesn't exist +@collectPVKubeSystemMetrics = false + +# Use parser to parse the configmap toml file to a ruby structure +def parseConfigMap + begin + # Check to see if config map is created + if (File.file?(@configMapMountPath)) + puts "config::configmap container-azm-ms-agentconfig for metric collection settings mounted, parsing values" + parsedConfig = Tomlrb.load_file(@configMapMountPath, symbolize_keys: true) + puts "config::Successfully parsed mounted config map" + return parsedConfig + else + puts "config::configmap container-azm-ms-agentconfig for metric collection settings not mounted, using defaults" + return nil + end + rescue => errorStr + ConfigParseErrorLogger.logError("Exception while parsing config map for metric collection settings: #{errorStr}, using defaults, please check config map for errors") + return nil + end +end + +# Use the ruby structure created after config parsing to set the right values to be used for metric collection settings +def populateSettingValuesFromConfigMap(parsedConfig) + # Get metric collection settings for including or excluding kube-system namespace in PV metrics + begin + if !parsedConfig.nil? && !parsedConfig[:metric_collection_settings][:collect_kube_system_pv_metrics].nil? && !parsedConfig[:metric_collection_settings][:collect_kube_system_pv_metrics][:enabled].nil?
+ @collectPVKubeSystemMetrics = parsedConfig[:metric_collection_settings][:collect_kube_system_pv_metrics][:enabled] + puts "config::Using config map setting for PV kube-system collection" + end + rescue => errorStr + ConfigParseErrorLogger.logError("Exception while reading config map settings for PV kube-system collection - #{errorStr}, using defaults, please check config map for errors") + end +end + +@configSchemaVersion = ENV["AZMON_AGENT_CFG_SCHEMA_VERSION"] +puts "****************Start Metric Collection Settings Processing********************" +if !@configSchemaVersion.nil? && !@configSchemaVersion.empty? && @configSchemaVersion.strip.casecmp("v1") == 0 #note v1 is the only supported schema version, so hardcoding it + configMapSettings = parseConfigMap + if !configMapSettings.nil? + populateSettingValuesFromConfigMap(configMapSettings) + end +else + if (File.file?(@configMapMountPath)) + ConfigParseErrorLogger.logError("config::unsupported/missing config schema version - '#{@configSchemaVersion}' , using defaults, please use supported schema version") + end +end + +# Write the settings to file, so that they can be set as environment variables +file = File.open("config_metric_collection_env_var", "w") + +if !file.nil? + file.write("export AZMON_PV_COLLECT_KUBE_SYSTEM_METRICS=#{@collectPVKubeSystemMetrics}\n") + # Close file after writing all metric collection setting environment variables + file.close + puts "****************End Metric Collection Settings Processing********************" +else + puts "Exception while opening file for writing metric collection config environment variables" + puts "****************End Metric Collection Settings Processing********************" +end diff --git a/kubernetes/container-azm-ms-agentconfig.yaml b/kubernetes/container-azm-ms-agentconfig.yaml index 58e09f041..aec1bb456 100644 --- a/kubernetes/container-azm-ms-agentconfig.yaml +++ b/kubernetes/container-azm-ms-agentconfig.yaml @@ -42,6 +42,7 @@ data: # When the setting is set to false, only the kube events with !normal event type will be collected enabled = false # When this is enabled (enabled = true), all kube events including normal events will be collected + prometheus-data-collection-settings: |- # Custom Prometheus metrics data collection settings [prometheus_data_collection_settings.cluster] @@ -90,6 +91,15 @@ data: #fieldpass = ["metric_to_pass1", "metric_to_pass12"] #fielddrop = ["metric_to_drop"] + + metric_collection_settings: |- + # Metrics collection settings for metrics sent to Log Analytics and MDM + [metric_collection_settings.collect_kube_system_pv_metrics] + # In the absence of this configmap, default value for collect_kube_system_pv_metrics is false + # When the setting is set to false, only the persistent volume metrics outside the kube-system namespace will be collected + enabled = false + # When this is enabled (enabled = true), persistent volume metrics including those in the kube-system namespace will be collected + alertable-metrics-configuration-settings: |- # Alertable metrics configuration settings for container resource utilization [alertable_metrics_configuration_settings.container_resource_utilization_thresholds] @@ -100,6 +110,11 @@ data: container_memory_rss_threshold_percentage = 95.0 # Threshold for container memoryWorkingSet, metric will be sent only when memory working set exceeds or becomes equal to the following percentage container_memory_working_set_threshold_percentage = 95.0 + + # Alertable metrics configuration settings for persistent volume utilization +
[alertable_metrics_configuration_settings.pv_utilization_thresholds] + # Threshold for persistent volume usage bytes, metric will be sent only when persistent volume utilization exceeds or becomes equal to the following percentage + pv_usage_threshold_percentage = 60.0 integrations: |- [integrations.azure_network_policy_manager] collect_basic_metrics = false diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index 311470660..d9fdc42e9 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -236,6 +236,14 @@ cat config_mdm_metrics_env_var | while read line; do done source config_mdm_metrics_env_var +#Parse the configmap to set the right environment variables for metric collection settings +/opt/microsoft/omsagent/ruby/bin/ruby tomlparser-metric-collection-config.rb + +cat config_metric_collection_env_var | while read line; do + echo $line >> ~/.bashrc +done +source config_metric_collection_env_var + #Setting environment variable for CAdvisor metrics to use port 10255/10250 based on curl request echo "Making wget request to cadvisor endpoint with port 10250" #Defaults to use port 10255 diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index b71a95227..5cda4dcb3 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -125,7 +125,7 @@ data: type filter_cadvisor2mdm custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast - metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes + metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,pvUsedBytes log_level info diff --git a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb index 13796cd1e..7661bb7a1 100644 --- a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb +++ b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb @@ -20,6 +20,7 @@ class CAdvisorMetricsAPIClient @clusterEnvVarCollectionEnabled = ENV["AZMON_CLUSTER_COLLECT_ENV_VAR"] @clusterStdErrLogCollectionEnabled = ENV["AZMON_COLLECT_STDERR_LOGS"] @clusterStdOutLogCollectionEnabled = ENV["AZMON_COLLECT_STDOUT_LOGS"] + @pvKubeSystemCollectionMetricsEnabled = ENV["AZMON_PV_COLLECT_KUBE_SYSTEM_METRICS"] @clusterLogTailExcludPath = ENV["AZMON_CLUSTER_LOG_TAIL_EXCLUDE_PATH"] @clusterLogTailPath = ENV["AZMON_LOG_TAIL_PATH"] @clusterAgentSchemaVersion = ENV["AZMON_AGENT_CFG_SCHEMA_VERSION"] @@ -53,6 +54,7 @@ class CAdvisorMetricsAPIClient @@winNodePrevMetricRate = {} @@telemetryCpuMetricTimeTracker = DateTime.now.to_time.to_i @@telemetryMemoryMetricTimeTracker = DateTime.now.to_time.to_i + @@telemetryPVKubeSystemMetricsTimeTracker = DateTime.now.to_time.to_i #Containers a hash of node name and the last time telemetry was sent for this node @@nodeTelemetryTimeTracker = {} @@ -301,6 +303,8 @@ def getInsightsMetrics(winNode: nil, metricTime: Time.now.utc.iso8601) metricDataItems.concat(getContainerGpuMetricsAsInsightsMetrics(metricInfo, hostName, "memoryTotal", "containerGpumemoryTotalBytes", metricTime)) metricDataItems.concat(getContainerGpuMetricsAsInsightsMetrics(metricInfo, hostName, "memoryUsed","containerGpumemoryUsedBytes", metricTime)) metricDataItems.concat(getContainerGpuMetricsAsInsightsMetrics(metricInfo, hostName, "dutyCycle","containerGpuDutyCycle", metricTime)) + + metricDataItems.concat(getPersistentVolumeMetrics(metricInfo, hostName, "usedBytes", Constants::PV_USED_BYTES, 
metricTime)) else @Log.warn("Couldn't get Insights metrics information for host: #{hostName} os:#{operatingSystem}") end @@ -311,6 +315,79 @@ def getInsightsMetrics(winNode: nil, metricTime: Time.now.utc.iso8601) return metricDataItems end + def getPersistentVolumeMetrics(metricJSON, hostName, metricNameToCollect, metricNameToReturn, metricPollTime) + telemetryTimeDifference = (DateTime.now.to_time.to_i - @@telemetryPVKubeSystemMetricsTimeTracker).abs + telemetryTimeDifferenceInMinutes = telemetryTimeDifference / 60 + + metricItems = [] + clusterId = KubernetesApiClient.getClusterId + clusterName = KubernetesApiClient.getClusterName + begin + metricInfo = metricJSON + metricInfo["pods"].each do |pod| + + podNamespace = pod["podRef"]["namespace"] + excludeNamespace = false + if (podNamespace.downcase == "kube-system") && @pvKubeSystemCollectionMetricsEnabled == "false" + excludeNamespace = true + end + + if (!excludeNamespace && !pod["volume"].nil?) + pod["volume"].each do |volume| + if (!volume["pvcRef"].nil?) + pvcRef = volume["pvcRef"] + if (!pvcRef["name"].nil?) + + # A PVC exists on this volume + podUid = pod["podRef"]["uid"] + podName = pod["podRef"]["name"] + pvcName = pvcRef["name"] + pvcNamespace = pvcRef["namespace"] + + metricItem = {} + metricItem["CollectionTime"] = metricPollTime + metricItem["Computer"] = hostName + metricItem["Name"] = metricNameToReturn + metricItem["Value"] = volume[metricNameToCollect] + metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN + metricItem["Namespace"] = Constants::INSIGTHTSMETRICS_TAGS_PV_NAMESPACE + + metricTags = {} + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID ] = clusterId + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = clusterName + metricTags[Constants::INSIGHTSMETRICS_TAGS_POD_UID] = podUid + metricTags[Constants::INSIGHTSMETRICS_TAGS_POD_NAME] = podName + metricTags[Constants::INSIGHTSMETRICS_TAGS_PVC_NAME] = pvcName + metricTags[Constants::INSIGHTSMETRICS_TAGS_PVC_NAMESPACE] = pvcNamespace + metricTags[Constants::INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES] = volume["capacityBytes"] + + metricItem["Tags"] = metricTags + + metricItems.push(metricItem) + end + end + end + end + end + rescue => errorStr + @Log.warn("getPersistentVolumeMetrics failed: #{errorStr} for metric #{metricNameToCollect}") + return metricItems + end + + # If kube-system metrics collection enabled, send telemetry + begin + if telemetryTimeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES && @pvKubeSystemCollectionMetricsEnabled == "true" + ApplicationInsightsUtility.sendCustomEvent(Constants::PV_KUBE_SYSTEM_METRICS_ENABLED_EVENT, {}) + @@telemetryPVKubeSystemMetricsTimeTracker = DateTime.now.to_time.to_i + end + rescue => errorStr + @Log.warn("getPersistentVolumeMetrics kube-system metrics enabled telemetry failed: #{errorStr}") + end + + return metricItems + end + + def getContainerGpuMetricsAsInsightsMetrics(metricJSON, hostName, metricNameToCollect, metricNametoReturn, metricPollTime) metricItems = [] clusterId = KubernetesApiClient.getClusterId diff --git a/source/plugins/ruby/MdmAlertTemplates.rb b/source/plugins/ruby/MdmAlertTemplates.rb index 2e516a99d..d5107fea1 100644 --- a/source/plugins/ruby/MdmAlertTemplates.rb +++ b/source/plugins/ruby/MdmAlertTemplates.rb @@ -90,6 +90,38 @@ class MdmAlertTemplates } }' + PV_resource_utilization_template = ' + { + "time": "%{timestamp}", + "data": { + "baseData": { + "metric": "%{metricName}", + "namespace": "insights.container/persistentvolumes", + "dimNames": [ + 
"podName", + "node", + "kubernetesNamespace", + "thresholdPercentage" + ], + "series": [ + { + "dimValues": [ + "%{podNameDimValue}", + "%{computerNameDimValue}", + "%{namespaceDimValue}", + "%{thresholdPercentageDimValue}" + ], + "min": %{pvResourceUtilizationPercentage}, + "max": %{pvResourceUtilizationPercentage}, + "sum": %{pvResourceUtilizationPercentage}, + "count": 1 + } + ] + } + } + }' + + Node_resource_metrics_template = ' { "time": "%{timestamp}", diff --git a/source/plugins/ruby/MdmMetricsGenerator.rb b/source/plugins/ruby/MdmMetricsGenerator.rb index 3d75dc6f4..1e7db37cc 100644 --- a/source/plugins/ruby/MdmMetricsGenerator.rb +++ b/source/plugins/ruby/MdmMetricsGenerator.rb @@ -37,6 +37,10 @@ class MdmMetricsGenerator Constants::MEMORY_WORKING_SET_BYTES => Constants::MDM_CONTAINER_MEMORY_WORKING_SET_UTILIZATION_METRIC, } + @@pod_metric_name_metric_percentage_name_hash = { + Constants::PV_USED_BYTES => Constants::MDM_PV_UTILIZATION_METRIC + } + # Setting this to true since we need to send zero filled metrics at startup. If metrics are absent alert creation fails @sendZeroFilledMetrics = true @@ -259,6 +263,31 @@ def getContainerResourceUtilMetricRecords(recordTimeStamp, metricName, percentag return records end + def getPVResourceUtilMetricRecords(recordTimeStamp, metricName, computer, percentageMetricValue, dims, thresholdPercentage) + records = [] + begin + containerName = dims[Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] + pvcNamespace = dims[Constants::INSIGHTSMETRICS_TAGS_PVC_NAMESPACE] + podName = dims[Constants::INSIGHTSMETRICS_TAGS_POD_NAME] + podUid = dims[Constants::INSIGHTSMETRICS_TAGS_POD_UID] + + resourceUtilRecord = MdmAlertTemplates::PV_resource_utilization_template % { + timestamp: recordTimeStamp, + metricName: @@pod_metric_name_metric_percentage_name_hash[metricName], + podNameDimValue: podName, + computerNameDimValue: computer, + namespaceDimValue: pvcNamespace, + pvResourceUtilizationPercentage: percentageMetricValue, + thresholdPercentageDimValue: thresholdPercentage, + } + records.push(Yajl::Parser.parse(StringIO.new(resourceUtilRecord))) + rescue => errorStr + @log.info "Error in getPVResourceUtilMetricRecords: #{errorStr}" + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + return records + end + def getDiskUsageMetricRecords(record) records = [] usedPercent = nil @@ -356,6 +385,7 @@ def getContainerResourceUtilizationThresholds metric_threshold_hash[Constants::CPU_USAGE_NANO_CORES] = Constants::DEFAULT_MDM_CPU_UTILIZATION_THRESHOLD metric_threshold_hash[Constants::MEMORY_RSS_BYTES] = Constants::DEFAULT_MDM_MEMORY_RSS_THRESHOLD metric_threshold_hash[Constants::MEMORY_WORKING_SET_BYTES] = Constants::DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD + metric_threshold_hash[Constants::PV_USED_BYTES] = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD cpuThreshold = ENV["AZMON_ALERT_CONTAINER_CPU_THRESHOLD"] if !cpuThreshold.nil? && !cpuThreshold.empty? @@ -375,6 +405,12 @@ def getContainerResourceUtilizationThresholds memoryWorkingSetThresholdFloat = (memoryWorkingSetThreshold.to_f).round(2) metric_threshold_hash[Constants::MEMORY_WORKING_SET_BYTES] = memoryWorkingSetThresholdFloat end + + pvUsagePercentageThreshold = ENV["AZMON_ALERT_PV_USAGE_THRESHOLD"] + if !pvUsagePercentageThreshold.nil? && !pvUsagePercentageThreshold.empty? 
+ pvUsagePercentageThresholdFloat = (pvUsagePercentageThreshold.to_f).round(2) + metric_threshold_hash[Constants::PV_USED_BYTES] = pvUsagePercentageThresholdFloat + end rescue => errorStr @log.info "Error in getContainerResourceUtilizationThresholds: #{errorStr}" ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index dd1ba24b3..82a6e8814 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -13,6 +13,12 @@ class Constants INSIGHTSMETRICS_TAGS_K8SNAMESPACE = "k8sNamespace" INSIGHTSMETRICS_TAGS_CONTROLLER_NAME = "controllerName" INSIGHTSMETRICS_TAGS_CONTROLLER_KIND = "controllerKind" + INSIGHTSMETRICS_TAGS_POD_UID = "podUid" + INSIGTHTSMETRICS_TAGS_PV_NAMESPACE = "container.azm.ms/pv" + INSIGHTSMETRICS_TAGS_PVC_NAME = "pvcName" + INSIGHTSMETRICS_TAGS_PVC_NAMESPACE = "pvcNamespace" + INSIGHTSMETRICS_TAGS_POD_NAME = "podName" + INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES = "pvCapacityBytes" INSIGHTSMETRICS_FLUENT_TAG = "oms.api.InsightsMetrics" REASON_OOM_KILLED = "oomkilled" #Kubestate (common) @@ -45,6 +51,7 @@ class Constants MDM_CONTAINER_CPU_UTILIZATION_METRIC = "cpuExceededPercentage" MDM_CONTAINER_MEMORY_RSS_UTILIZATION_METRIC = "memoryRssExceededPercentage" MDM_CONTAINER_MEMORY_WORKING_SET_UTILIZATION_METRIC = "memoryWorkingSetExceededPercentage" + MDM_PV_UTILIZATION_METRIC = "pvUsageExceededPercentage" MDM_NODE_CPU_USAGE_PERCENTAGE = "cpuUsagePercentage" MDM_NODE_MEMORY_RSS_PERCENTAGE = "memoryRssPercentage" MDM_NODE_MEMORY_WORKING_SET_PERCENTAGE = "memoryWorkingSetPercentage" @@ -56,9 +63,11 @@ class Constants CPU_USAGE_MILLI_CORES = "cpuUsageMillicores" MEMORY_WORKING_SET_BYTES= "memoryWorkingSetBytes" MEMORY_RSS_BYTES = "memoryRssBytes" + PV_USED_BYTES = "pvUsedBytes" DEFAULT_MDM_CPU_UTILIZATION_THRESHOLD = 95.0 DEFAULT_MDM_MEMORY_RSS_THRESHOLD = 95.0 DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD = 95.0 + DEFAULT_MDM_PV_UTILIZATION_THRESHOLD = 60.0 CONTROLLER_KIND_JOB = "job" CONTAINER_TERMINATION_REASON_COMPLETED = "completed" CONTAINER_STATE_TERMINATED = "terminated" @@ -71,6 +80,8 @@ class Constants CONTAINER_METRICS_HEART_BEAT_EVENT = "ContainerMetricsMdmHeartBeatEvent" POD_READY_PERCENTAGE_HEART_BEAT_EVENT = "PodReadyPercentageMdmHeartBeatEvent" CONTAINER_RESOURCE_UTIL_HEART_BEAT_EVENT = "ContainerResourceUtilMdmHeartBeatEvent" + PV_USAGE_HEART_BEAT_EVENT = "PVUsageMdmHeartBeatEvent" + PV_KUBE_SYSTEM_METRICS_ENABLED_EVENT = "CollectPVKubeSystemMetricsEnabled" TELEMETRY_FLUSH_INTERVAL_IN_MINUTES = 10 KUBE_STATE_TELEMETRY_FLUSH_INTERVAL_IN_MINUTES = 15 MDM_TIME_SERIES_FLUSHED_IN_LAST_HOUR = "MdmTimeSeriesFlushedInLastHour" diff --git a/source/plugins/ruby/filter_cadvisor2mdm.rb b/source/plugins/ruby/filter_cadvisor2mdm.rb index fd43ef98b..3bc674ea8 100644 --- a/source/plugins/ruby/filter_cadvisor2mdm.rb +++ b/source/plugins/ruby/filter_cadvisor2mdm.rb @@ -16,7 +16,7 @@ class CAdvisor2MdmFilter < Filter config_param :enable_log, :integer, :default => 0 config_param :log_path, :string, :default => "/var/opt/microsoft/docker-cimprov/log/filter_cadvisor2mdm.log" config_param :custom_metrics_azure_regions, :string - config_param :metrics_to_collect, :string, :default => "Constants::CPU_USAGE_NANO_CORES,Constants::MEMORY_WORKING_SET_BYTES,Constants::MEMORY_RSS_BYTES" + config_param :metrics_to_collect, :string, :default => "Constants::CPU_USAGE_NANO_CORES,Constants::MEMORY_WORKING_SET_BYTES,Constants::MEMORY_RSS_BYTES,Constants::PV_USED_BYTES" @@hostName = 
(OMS::Common.get_hostname) @@ -46,11 +46,13 @@ def start @metrics_to_collect_hash = build_metrics_hash @log.debug "After check_custom_metrics_availability process_incoming_stream #{@process_incoming_stream}" @@containerResourceUtilTelemetryTimeTracker = DateTime.now.to_time.to_i + @@pvUsageTelemetryTimeTracker = DateTime.now.to_time.to_i # These variables keep track if any resource utilization threshold exceeded in the last 10 minutes @containersExceededCpuThreshold = false @containersExceededMemRssThreshold = false @containersExceededMemWorkingSetThreshold = false + @pvExceededUsageThreshold = false # initialize cpu and memory limit if @process_incoming_stream @@ -60,6 +62,7 @@ def start @containerCpuLimitHash = {} @containerMemoryLimitHash = {} @containerResourceDimensionHash = {} + @pvUsageHash = {} @@metric_threshold_hash = MdmMetricsGenerator.getContainerResourceUtilizationThresholds end rescue => e @@ -87,6 +90,8 @@ def setThresholdExceededTelemetry(metricName) @containersExceededMemRssThreshold = true elsif metricName == Constants::MEMORY_WORKING_SET_BYTES @containersExceededMemWorkingSetThreshold = true + elsif metricName == Constants::PV_USED_BYTES + @pvExceededUsageThreshold = true end rescue => errorStr @log.info "Error in setThresholdExceededTelemetry: #{errorStr}" @@ -109,13 +114,30 @@ def flushMetricTelemetry properties["MemRssThresholdExceededInLastFlushInterval"] = @containersExceededMemRssThreshold properties["MemWSetThresholdExceededInLastFlushInterval"] = @containersExceededMemWorkingSetThreshold ApplicationInsightsUtility.sendCustomEvent(Constants::CONTAINER_RESOURCE_UTIL_HEART_BEAT_EVENT, properties) - @@containerResourceUtilTelemetryTimeTracker = DateTime.now.to_time.to_i @containersExceededCpuThreshold = false @containersExceededMemRssThreshold = false @containersExceededMemWorkingSetThreshold = false + @@containerResourceUtilTelemetryTimeTracker = DateTime.now.to_time.to_i + end + rescue => errorStr + @log.info "Error in flushMetricTelemetry: #{errorStr} for container resource util telemetry" + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + + # Also send for PV usage metrics + begin + pvTimeDifference = (DateTime.now.to_time.to_i - @@pvUsageTelemetryTimeTracker).abs + pvTimeDifferenceInMinutes = pvTimeDifference / 60 + if (pvTimeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) + pvProperties = {} + pvProperties["PVUsageThresholdPercentage"] = @@metric_threshold_hash[Constants::PV_USED_BYTES] + pvProperties["PVUsageThresholdExceededInLastFlushInterval"] = @pvExceededUsageThreshold + ApplicationInsightsUtility.sendCustomEvent(Constants::PV_USAGE_HEART_BEAT_EVENT, pvProperties) + @pvExceededUsageThreshold = false + @@pvUsageTelemetryTimeTracker = DateTime.now.to_time.to_i end rescue => errorStr - @log.info "Error in flushMetricTelemetry: #{errorStr}" + @log.info "Error in flushMetricTelemetry: #{errorStr} for PV usage telemetry" ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) end end @@ -123,6 +145,13 @@ def flushMetricTelemetry def filter(tag, time, record) begin if @process_incoming_stream + + # Check if insights metrics for PV metrics + data_type = record["DataType"] + if data_type == "INSIGHTS_METRICS_BLOB" + return filterPVInsightsMetrics(record) + end + object_name = record["DataItems"][0]["ObjectName"] counter_name = record["DataItems"][0]["Collections"][0]["CounterName"] percentage_metric_value = 0.0 @@ -204,6 +233,47 @@ def filter(tag, time, record) end end + def filterPVInsightsMetrics(record) + begin + 
mdmMetrics = [] + record["DataItems"].each do |dataItem| + + if dataItem["Name"] == Constants::PV_USED_BYTES && @metrics_to_collect_hash.key?(dataItem["Name"].downcase) + metricName = dataItem["Name"] + usage = dataItem["Value"] + capacity = dataItem["Tags"][Constants::INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES] + if capacity != 0 + percentage_metric_value = (usage * 100.0) / capacity + end + @log.info "percentage_metric_value for metric: #{metricName} percentage: #{percentage_metric_value}" + @log.info "@@metric_threshold_hash for #{metricName}: #{@@metric_threshold_hash[metricName]}" + + computer = dataItem["Computer"] + resourceDimensions = dataItem["Tags"] + thresholdPercentage = @@metric_threshold_hash[metricName] + + flushMetricTelemetry + if percentage_metric_value >= thresholdPercentage + setThresholdExceededTelemetry(metricName) + return MdmMetricsGenerator.getPVResourceUtilMetricRecords(dataItem["CollectionTime"], + metricName, + computer, + percentage_metric_value, + resourceDimensions, + thresholdPercentage) + else + return [] + end # end if block for percentage metric > configured threshold % check + end # end if block for dataItem name check + end # end for block of looping through data items + return [] + rescue Exception => e + @log.info "Error processing cadvisor insights metrics record Exception: #{e.class} Message: #{e.message}" + ApplicationInsightsUtility.sendExceptionTelemetry(e.backtrace) + return [] #return empty array if we ran into any errors + end + end + def ensure_cpu_memory_capacity_set if @cpu_capacity != 0.0 && @memory_capacity != 0.0 @log.info "CPU And Memory Capacity are already set" diff --git a/source/plugins/ruby/in_cadvisor_perf.rb b/source/plugins/ruby/in_cadvisor_perf.rb index a44365e9d..b706ff00a 100644 --- a/source/plugins/ruby/in_cadvisor_perf.rb +++ b/source/plugins/ruby/in_cadvisor_perf.rb @@ -88,6 +88,7 @@ def enumerate() end router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream + router.emit_stream(@mdmtag, insightsMetricsEventStream) if insightsMetricsEventStream if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0) $log.info("cAdvisorInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") diff --git a/source/plugins/ruby/in_win_cadvisor_perf.rb b/source/plugins/ruby/in_win_cadvisor_perf.rb index 38868f2f5..4e90195e5 100644 --- a/source/plugins/ruby/in_win_cadvisor_perf.rb +++ b/source/plugins/ruby/in_win_cadvisor_perf.rb @@ -101,6 +101,7 @@ def enumerate() end router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream + router.emit_stream(@mdmtag, insightsMetricsEventStream) if insightsMetricsEventStream if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0) $log.info("winCAdvisorInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") end From 7304a6b32652a870087ac39f49b640bca85da1c1 Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Wed, 23 Sep 2020 13:00:03 -0700 Subject: [PATCH 024/194] add new custom metric regions (#444) * add new custom metric regions * fix commas --- build/linux/installer/conf/container.conf | 4 ++-- build/linux/installer/conf/kube.conf | 6 +++--- .../templates/omsagent-rs-configmap.yaml | 6 +++--- kubernetes/omsagent.yaml | 6 +++--- scripts/troubleshoot/TroubleshootError.ps1 | 12 +++++++++++- 5 files changed, 22 insertions(+), 12 deletions(-) diff --git a/build/linux/installer/conf/container.conf b/build/linux/installer/conf/container.conf index e55c62fbc..f7e6e1da9 100644 --- a/build/linux/installer/conf/container.conf +++ b/build/linux/installer/conf/container.conf @@ -45,14 +45,14 @@ #custom_metrics_mdm filter plugin type filter_cadvisor2mdm - custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast + custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast,eastus2,westus,australiasoutheast,brazilsouth,germanywestcentral,northcentralus,switzerlandnorth metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,memoryRssBytes,pvUsedBytes log_level info type filter_telegraf2mdm - custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast + custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast,eastus2,westus,australiasoutheast,brazilsouth,germanywestcentral,northcentralus,switzerlandnorth log_level debug diff --git a/build/linux/installer/conf/kube.conf b/build/linux/installer/conf/kube.conf index ba40b7a35..dbb4db0da 100644 --- a/build/linux/installer/conf/kube.conf +++ b/build/linux/installer/conf/kube.conf @@ -13,7 +13,7 @@ tag oms.containerinsights.KubePodInventory run_interval 60 log_level debug - custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast + custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast,eastus2,westus,australiasoutheast,brazilsouth,germanywestcentral,northcentralus,switzerlandnorth #Kubernetes events @@ -66,14 +66,14 @@ type filter_inventory2mdm - custom_metrics_azure_regions 
eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast + custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast,eastus2,westus,australiasoutheast,brazilsouth,germanywestcentral,northcentralus,switzerlandnorth log_level info #custom_metrics_mdm filter plugin for perf data from windows nodes type filter_cadvisor2mdm - custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast + custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast,eastus2,westus,australiasoutheast,brazilsouth,germanywestcentral,northcentralus,switzerlandnorth metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,pvUsedBytes log_level info diff --git a/charts/azuremonitor-containers/templates/omsagent-rs-configmap.yaml b/charts/azuremonitor-containers/templates/omsagent-rs-configmap.yaml index ee0664495..475b17a46 100644 --- a/charts/azuremonitor-containers/templates/omsagent-rs-configmap.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-rs-configmap.yaml @@ -18,7 +18,7 @@ data: tag oms.containerinsights.KubePodInventory run_interval 60 log_level debug - custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast + custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast,eastus2,westus,australiasoutheast,brazilsouth,germanywestcentral,northcentralus,switzerlandnorth #Kubernetes events @@ -70,14 +70,14 @@ data: type filter_inventory2mdm - custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast + custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast,eastus2,westus,australiasoutheast,brazilsouth,germanywestcentral,northcentralus,switzerlandnorth log_level info # custom_metrics_mdm filter plugin for perf data from windows nodes type filter_cadvisor2mdm - custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast + custom_metrics_azure_regions 
eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast,eastus2,westus,australiasoutheast,brazilsouth,germanywestcentral,northcentralus,switzerlandnorth metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes log_level info diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 5cda4dcb3..9c8f9de14 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -64,7 +64,7 @@ data: tag oms.containerinsights.KubePodInventory run_interval 60 log_level debug - custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast + custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast,eastus2,westus,australiasoutheast,brazilsouth,germanywestcentral,northcentralus,switzerlandnorth #Kubernetes events @@ -117,14 +117,14 @@ data: type filter_inventory2mdm - custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast + custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast,eastus2,westus,australiasoutheast,brazilsouth,germanywestcentral,northcentralus,switzerlandnorth log_level info #custom_metrics_mdm filter plugin for perf data from windows nodes type filter_cadvisor2mdm - custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast + custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast,eastus2,westus,australiasoutheast,brazilsouth,germanywestcentral,northcentralus,switzerlandnorth metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,pvUsedBytes log_level info diff --git a/scripts/troubleshoot/TroubleshootError.ps1 b/scripts/troubleshoot/TroubleshootError.ps1 index 754a43e74..4c2d95ac6 100644 --- a/scripts/troubleshoot/TroubleshootError.ps1 +++ b/scripts/troubleshoot/TroubleshootError.ps1 @@ -234,7 +234,17 @@ $MdmCustomMetricAvailabilityLocations = ( 'eastasia', 'centralindia', 'uksouth', - 'canadacentral' + 'canadacentral', + 'francecentral', + 'japaneast', + 'australiaeast', + 'eastus2', + 'westus', + 'australiasoutheast', + 'brazilsouth', + 'germanywestcentral', + 'northcentralus', + 'switzerlandnorth' ); try { From 2d8c03fec9edc15da7df5a14b9b5d561b4e85add Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Wed, 23 Sep 2020 13:01:07 -0700 Subject: [PATCH 025/194] add 'Terminating' state (#443) --- source/plugins/ruby/constants.rb | 3 +++ 
source/plugins/ruby/in_kube_podinventory.rb | 3 +++ 2 files changed, 6 insertions(+) diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index 82a6e8814..a64a4c97c 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -85,4 +85,7 @@ class Constants TELEMETRY_FLUSH_INTERVAL_IN_MINUTES = 10 KUBE_STATE_TELEMETRY_FLUSH_INTERVAL_IN_MINUTES = 15 MDM_TIME_SERIES_FLUSHED_IN_LAST_HOUR = "MdmTimeSeriesFlushedInLastHour" + + #Pod Statuses + POD_STATUS_TERMINATING = "Terminating" end \ No newline at end of file diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index bffa725ee..4880d80e7 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -194,6 +194,9 @@ def parse_and_emit_records(podInventory, serviceList, continuationToken, batchTi if podReadyCondition == false record["PodStatus"] = "Unknown" + # ICM - https://portal.microsofticm.com/imp/v3/incidents/details/187091803/home + elsif !items["metadata"]["deletionTimestamp"].nil? && !items["metadata"]["deletionTimestamp"].empty? + record["PodStatus"] = Constants::POD_STATUS_TERMINATING else record["PodStatus"] = items["status"]["phase"] end From da06d760ccb324e034a84187a3766c89d6bffb02 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Fri, 25 Sep 2020 12:36:27 -0700 Subject: [PATCH 026/194] Gangams/sept agent release tasks (#445) * turnoff mdm nonsupported cluster types * enable validation of server cert for ai ruby http client * add kubelet operations total and total error metrics * node selector label change * label update * wip * wip * wip * revert quotes --- build/linux/installer/conf/telegraf.conf | 9 +++---- .../templates/omsagent-daemonset-windows.yaml | 5 ++++ charts/azuremonitor-containers/values.yaml | 26 +++++++++++++++++++ kubernetes/linux/main.sh | 9 +++---- kubernetes/omsagent.yaml | 3 ++- .../channel/sender_base.rb | 4 +-- source/plugins/ruby/out_mdm.rb | 9 +++++-- 7 files changed, 50 insertions(+), 15 deletions(-) diff --git a/build/linux/installer/conf/telegraf.conf b/build/linux/installer/conf/telegraf.conf index 013aa1af2..202ac9741 100644 --- a/build/linux/installer/conf/telegraf.conf +++ b/build/linux/installer/conf/telegraf.conf @@ -632,8 +632,7 @@ name_prefix="container.azm.ms/" ## An array of urls to scrape metrics from. urls = ["$CADVISOR_METRICS_URL"] - ## Include "$KUBELET_RUNTIME_OPERATIONS_TOTAL_METRIC", "$KUBELET_RUNTIME_OPERATIONS_ERRORS_TOTAL_METRIC" when we add for support for 1.18 - fieldpass = ["$KUBELET_RUNTIME_OPERATIONS_METRIC", "$KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC"] + fieldpass = ["$KUBELET_RUNTIME_OPERATIONS_METRIC", "$KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC", "$KUBELET_RUNTIME_OPERATIONS_TOTAL_METRIC", "$KUBELET_RUNTIME_OPERATIONS_ERRORS_TOTAL_METRIC"] metric_version = 2 url_tag = "scrapeUrl" @@ -675,7 +674,7 @@ name_prefix="container.azm.ms/" ## An array of urls to scrape metrics from. urls = ["$CADVISOR_METRICS_URL"] - + fieldpass = ["kubelet_running_pod_count","volume_manager_total_volumes", "kubelet_node_config_error", "process_resident_memory_bytes", "process_cpu_seconds_total"] metric_version = 2 @@ -690,7 +689,7 @@ ## Optional TLS Config tls_ca = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" insecure_skip_verify = true - + ## prometheus custom metrics [[inputs.prometheus]] @@ -731,7 +730,7 @@ #name_prefix="container.azm.ms/" ## An array of urls to scrape metrics from. 
urls = $AZMON_INTEGRATION_NPM_METRICS_URL_LIST_NODE
-
+
   metric_version = 2
   url_tag = "scrapeUrl"
diff --git a/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml b/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml
index 7acd46c37..72b09f6c1 100644
--- a/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml
+++ b/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml
@@ -25,8 +25,13 @@ spec:
         dockerProviderVersion: {{ .Values.omsagent.image.dockerProviderVersion }}
         schema-versions: "v1"
     spec:
+{{- if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion }}
+      nodeSelector:
+        kubernetes.io/os: windows
+{{- else }}
       nodeSelector:
         beta.kubernetes.io/os: windows
+{{- end }}
 {{- if .Values.omsagent.rbac }}
       serviceAccountName: omsagent
 {{- end }}
diff --git a/charts/azuremonitor-containers/values.yaml b/charts/azuremonitor-containers/values.yaml
index 9c48cf9fb..1804d1197 100644
--- a/charts/azuremonitor-containers/values.yaml
+++ b/charts/azuremonitor-containers/values.yaml
@@ -56,6 +56,17 @@ omsagent:
     affinity:
       nodeAffinity:
         requiredDuringSchedulingIgnoredDuringExecution:
+          nodeSelectorTerms:
+            - labelSelector:
+                matchExpressions:
+                  - key: kubernetes.io/os
+                    operator: In
+                    values:
+                      - linux
+                  - key: type
+                    operator: NotIn
+                    values:
+                      - virtual-kubelet
           nodeSelectorTerms:
             - labelSelector:
                 matchExpressions:
@@ -71,6 +82,21 @@ omsagent:
     affinity:
       nodeAffinity:
         requiredDuringSchedulingIgnoredDuringExecution:
+          nodeSelectorTerms:
+            - labelSelector:
+                matchExpressions:
+                  - key: kubernetes.io/os
+                    operator: In
+                    values:
+                      - linux
+                  - key: type
+                    operator: NotIn
+                    values:
+                      - virtual-kubelet
+                  - key: kubernetes.io/role
+                    operator: NotIn
+                    values:
+                      - master
           nodeSelectorTerms:
             - labelSelector:
                 matchExpressions:
diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh
index d9fdc42e9..11972f0f4 100644
--- a/kubernetes/linux/main.sh
+++ b/kubernetes/linux/main.sh
@@ -300,11 +300,10 @@ fi
 echo "configured container runtime on kubelet is : "$CONTAINER_RUNTIME
 echo "export CONTAINER_RUNTIME="$CONTAINER_RUNTIME >> ~/.bashrc

-# enable these metrics in next agent release
-# export KUBELET_RUNTIME_OPERATIONS_TOTAL_METRIC="kubelet_runtime_operations_total"
-# echo "export KUBELET_RUNTIME_OPERATIONS_TOTAL_METRIC="$KUBELET_RUNTIME_OPERATIONS_TOTAL_METRIC >> ~/.bashrc
-# export KUBELET_RUNTIME_OPERATIONS_ERRORS_TOTAL_METRIC="kubelet_runtime_operations_errors_total"
-# echo "export KUBELET_RUNTIME_OPERATIONS_ERRORS_TOTAL_METRIC="$KUBELET_RUNTIME_OPERATIONS_ERRORS_TOTAL_METRIC >> ~/.bashrc
+export KUBELET_RUNTIME_OPERATIONS_TOTAL_METRIC="kubelet_runtime_operations_total"
+echo "export KUBELET_RUNTIME_OPERATIONS_TOTAL_METRIC="$KUBELET_RUNTIME_OPERATIONS_TOTAL_METRIC >> ~/.bashrc
+export KUBELET_RUNTIME_OPERATIONS_ERRORS_TOTAL_METRIC="kubelet_runtime_operations_errors_total"
+echo "export KUBELET_RUNTIME_OPERATIONS_ERRORS_TOTAL_METRIC="$KUBELET_RUNTIME_OPERATIONS_ERRORS_TOTAL_METRIC >> ~/.bashrc

 # default to docker metrics
 export KUBELET_RUNTIME_OPERATIONS_METRIC="kubelet_docker_operations"
diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml
index 9c8f9de14..09e50b5a4 100644
--- a/kubernetes/omsagent.yaml
+++ b/kubernetes/omsagent.yaml
@@ -419,7 +419,8 @@ spec:
             nodeSelectorTerms:
               - labelSelector:
                   matchExpressions:
-                    - key: beta.kubernetes.io/os
+                    # kubernetes.io/os label doesn't exist in k8s versions < 1.14, so make sure to choose the label based on the k8s version in the AKS yaml
+                    - key: kubernetes.io/os
                       operator: In
                       values:
                         - linux
diff --git a/source/plugins/ruby/lib/application_insights/channel/sender_base.rb b/source/plugins/ruby/lib/application_insights/channel/sender_base.rb
index 33ac49286..bedbae4ee 100644
--- a/source/plugins/ruby/lib/application_insights/channel/sender_base.rb
+++ b/source/plugins/ruby/lib/application_insights/channel/sender_base.rb
@@ -66,12 +66,12 @@ def send(data_to_send)
         request.body = compressed_data
         if @proxy.nil? || @proxy.empty?
           http = Net::HTTP.new uri.hostname, uri.port
-        else
+        else
           http = Net::HTTP.new(uri.hostname, uri.port, @proxy[:addr], @proxy[:port], @proxy[:user], @proxy[:pass])
         end
         if uri.scheme.downcase == 'https'
           http.use_ssl = true
-          http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+          http.verify_mode = OpenSSL::SSL::VERIFY_PEER
         end
         response = http.request(request)
diff --git a/source/plugins/ruby/out_mdm.rb b/source/plugins/ruby/out_mdm.rb
index c4cc46dd7..1c805255a 100644
--- a/source/plugins/ruby/out_mdm.rb
+++ b/source/plugins/ruby/out_mdm.rb
@@ -61,15 +61,17 @@ def configure(conf)
   def start
     super
     begin
-      file = File.read(@@azure_json_path)
-      @data_hash = JSON.parse(file)
       aks_resource_id = ENV["AKS_RESOURCE_ID"]
       aks_region = ENV["AKS_REGION"]

       if aks_resource_id.to_s.empty?
         @log.info "Environment Variable AKS_RESOURCE_ID is not set.. "
         @can_send_data_to_mdm = false
+      elsif !aks_resource_id.downcase.include?("/microsoft.containerservice/managedclusters/") && !aks_resource_id.downcase.include?("/microsoft.kubernetes/connectedclusters/")
+        @log.info "MDM Metrics not supported for this cluster type resource: #{aks_resource_id}"
+        @can_send_data_to_mdm = false
       end
+
       if aks_region.to_s.empty?
         @log.info "Environment Variable AKS_REGION is not set.. "
         @can_send_data_to_mdm = false
@@ -106,6 +108,9 @@ def start
         @cluster_identity = ArcK8sClusterIdentity.new
         @cached_access_token = @cluster_identity.get_cluster_identity_token
       else
+        # azure json file is only used for AKS and doesn't exist in non-azure envs
+        file = File.read(@@azure_json_path)
+        @data_hash = JSON.parse(file)
         # Check to see if SP exists, if it does use SP. Else, use msi
         sp_client_id = @data_hash["aadClientId"]
         sp_client_secret = @data_hash["aadClientSecret"]
From 545305438d54d44c5d3b02cd075019eb57617a48 Mon Sep 17 00:00:00 2001
From: Grace Wehner
Date: Mon, 28 Sep 2020 11:36:38 -0700
Subject: [PATCH 027/194] grwehner/pv-collect-volume-name (#448)

Collect and send the volume name as another tag for pvUsedBytes in InsightsMetrics, so that it can be displayed in the workload workbook.
Does not affect the PV MDM metric --- source/plugins/ruby/CAdvisorMetricsAPIClient.rb | 1 + source/plugins/ruby/constants.rb | 1 + 2 files changed, 2 insertions(+) diff --git a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb index 7661bb7a1..9e0935480 100644 --- a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb +++ b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb @@ -359,6 +359,7 @@ def getPersistentVolumeMetrics(metricJSON, hostName, metricNameToCollect, metric metricTags[Constants::INSIGHTSMETRICS_TAGS_POD_NAME] = podName metricTags[Constants::INSIGHTSMETRICS_TAGS_PVC_NAME] = pvcName metricTags[Constants::INSIGHTSMETRICS_TAGS_PVC_NAMESPACE] = pvcNamespace + metricTags[Constants::INSIGHTSMETRICS_TAGS_VOLUME_NAME] = volume["name"] metricTags[Constants::INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES] = volume["capacityBytes"] metricItem["Tags"] = metricTags diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index a64a4c97c..73e3af471 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -19,6 +19,7 @@ class Constants INSIGHTSMETRICS_TAGS_PVC_NAMESPACE = "pvcNamespace" INSIGHTSMETRICS_TAGS_POD_NAME = "podName" INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES = "pvCapacityBytes" + INSIGHTSMETRICS_TAGS_VOLUME_NAME = "volumeName" INSIGHTSMETRICS_FLUENT_TAG = "oms.api.InsightsMetrics" REASON_OOM_KILLED = "oomkilled" #Kubestate (common) From fe9f14df60f8d9a0cc52d33ad13c8c05b0c76cbb Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Tue, 29 Sep 2020 17:34:30 -0700 Subject: [PATCH 028/194] Changes for september agent release (#449) Moving from v1beta1 to v1 for health CRD Adding timer for zero filling Adding zero filling for PV metrics --- .../templates/omsagent-crd.yaml | 24 ++++++ kubernetes/omsagent.yaml | 14 +++- source/plugins/ruby/MdmMetricsGenerator.rb | 77 ++++++++++++------- source/plugins/ruby/constants.rb | 63 +++++++-------- 4 files changed, 116 insertions(+), 62 deletions(-) diff --git a/charts/azuremonitor-containers/templates/omsagent-crd.yaml b/charts/azuremonitor-containers/templates/omsagent-crd.yaml index f4a028bd3..bbaf89a52 100644 --- a/charts/azuremonitor-containers/templates/omsagent-crd.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-crd.yaml @@ -1,3 +1,4 @@ +{{- if semverCompare "<1.19-0" .Capabilities.KubeVersion.GitVersion }} apiVersion: apiextensions.k8s.io/v1beta1 kind: CustomResourceDefinition metadata: @@ -10,3 +11,26 @@ spec: names: plural: healthstates kind: HealthState +{{- else }} +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: healthstates.azmon.container.insights + namespace: kube-system +spec: + group: azmon.container.insights + versions: + - name: v1 + served: true + storage: true + schema: + openAPIV3Schema: + type: object + properties: + state: + type: string + scope: Namespaced + names: + plural: healthstates + kind: HealthState +{{- end }} \ No newline at end of file diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 09e50b5a4..e8352e020 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -746,14 +746,24 @@ spec: port: 25227 targetPort: in-rs-tcp --- -apiVersion: apiextensions.k8s.io/v1beta1 +# this is for versions >=1.19, for versions <1.19 we continue to use v1beta1 +apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: name: healthstates.azmon.container.insights namespace: kube-system spec: group: azmon.container.insights - version: v1 + 
versions: + - name: v1 + served: true + storage: true + schema: + openAPIV3Schema: + type: object + properties: + state: + type: string scope: Namespaced names: plural: healthstates diff --git a/source/plugins/ruby/MdmMetricsGenerator.rb b/source/plugins/ruby/MdmMetricsGenerator.rb index 1e7db37cc..b8104212d 100644 --- a/source/plugins/ruby/MdmMetricsGenerator.rb +++ b/source/plugins/ruby/MdmMetricsGenerator.rb @@ -8,9 +8,11 @@ class MdmMetricsGenerator require_relative "MdmAlertTemplates" require_relative "ApplicationInsightsUtility" require_relative "constants" + require_relative "oms_common" @log_path = "/var/opt/microsoft/docker-cimprov/log/mdm_metrics_generator.log" @log = Logger.new(@log_path, 1, 5000000) + @@hostName = (OMS::Common.get_hostname) @oom_killed_container_count_hash = {} @container_restart_count_hash = {} @@ -38,11 +40,12 @@ class MdmMetricsGenerator } @@pod_metric_name_metric_percentage_name_hash = { - Constants::PV_USED_BYTES => Constants::MDM_PV_UTILIZATION_METRIC + Constants::PV_USED_BYTES => Constants::MDM_PV_UTILIZATION_METRIC, } # Setting this to true since we need to send zero filled metrics at startup. If metrics are absent alert creation fails @sendZeroFilledMetrics = true + @zeroFilledMetricsTimeTracker = DateTime.now.to_time.to_i def initialize end @@ -179,6 +182,19 @@ def zeroFillMetricRecords(records, batch_time) if !containerMemoryWorkingSetRecord.nil? && !containerMemoryWorkingSetRecord.empty? && !containerMemoryWorkingSetRecord[0].nil? && !containerMemoryWorkingSetRecord[0].empty? records.push(containerMemoryWorkingSetRecord[0]) end + + pvZeroFillDims = {} + pvZeroFillDims[Constants::INSIGHTSMETRICS_TAGS_PVC_NAMESPACE] = Constants::KUBESYSTEM_NAMESPACE_ZERO_FILL + pvZeroFillDims[Constants::INSIGHTSMETRICS_TAGS_POD_NAME] = Constants::OMSAGENT_ZERO_FILL + pvResourceUtilMetricRecord = getPVResourceUtilMetricRecords(batch_time, + Constants::PV_USED_BYTES, + @@hostName, + 0, + pvZeroFillDims, + metric_threshold_hash[Constants::PV_USED_BYTES]) + if !pvResourceUtilMetricRecord.nil? && !pvResourceUtilMetricRecord.empty? && !pvResourceUtilMetricRecord[0].nil? && !pvResourceUtilMetricRecord[0].empty? + records.push(pvResourceUtilMetricRecord[0]) + end rescue => errorStr @log.info "Error in zeroFillMetricRecords: #{errorStr}" ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) @@ -189,10 +205,13 @@ def zeroFillMetricRecords(records, batch_time) def appendAllPodMetrics(records, batch_time) begin @log.info "in appendAllPodMetrics..." - if @sendZeroFilledMetrics == true + timeDifference = (DateTime.now.to_time.to_i - @zeroFilledMetricsTimeTracker).abs + timeDifferenceInMinutes = timeDifference / 60 + if @sendZeroFilledMetrics == true || (timeDifferenceInMinutes >= Constants::ZERO_FILL_METRICS_INTERVAL_IN_MINUTES) records = zeroFillMetricRecords(records, batch_time) # Setting it to false after startup @sendZeroFilledMetrics = false + @zeroFilledMetricsTimeTracker = DateTime.now.to_time.to_i end records = appendPodMetrics(records, Constants::MDM_OOM_KILLED_CONTAINER_COUNT, @@ -325,22 +344,22 @@ def getMetricRecords(record) begin dimNames = String.new "" #mutable string dimValues = String.new "" - noDimVal ="-" + noDimVal = "-" metricValue = 0 if !record["tags"].nil? - dimCount = 0 - record["tags"].each { |k, v| - dimCount = dimCount+1 - if (dimCount <= 10) #MDM = 10 dims - dimNames.concat("\"#{k}\"") - dimNames.concat(",") - if !v.nil? 
&& v.length >0 - dimValues.concat("\"#{v}\"") - else - dimValues.concat("\"#{noDimVal}\"") - end - dimValues.concat(",") + dimCount = 0 + record["tags"].each { |k, v| + dimCount = dimCount + 1 + if (dimCount <= 10) #MDM = 10 dims + dimNames.concat("\"#{k}\"") + dimNames.concat(",") + if !v.nil? && v.length > 0 + dimValues.concat("\"#{v}\"") + else + dimValues.concat("\"#{noDimVal}\"") end + dimValues.concat(",") + end } if (dimNames.end_with?(",")) dimNames.chomp!(",") @@ -353,19 +372,19 @@ def getMetricRecords(record) convertedTimestamp = Time.at(timestamp.to_i).utc.iso8601 if !record["fields"].nil? record["fields"].each { |k, v| - if is_numeric(v) - metricRecord = MdmAlertTemplates::Generic_metric_template % { - timestamp: convertedTimestamp, - metricName: k, - namespaceSuffix: record["name"], - dimNames: dimNames, - dimValues: dimValues, - metricValue: v, - } - records.push(Yajl::Parser.parse(StringIO.new(metricRecord))) - #@log.info "pushed mdmgenericmetric: #{k},#{v}" - end - } + if is_numeric(v) + metricRecord = MdmAlertTemplates::Generic_metric_template % { + timestamp: convertedTimestamp, + metricName: k, + namespaceSuffix: record["name"], + dimNames: dimNames, + dimValues: dimValues, + metricValue: v, + } + records.push(Yajl::Parser.parse(StringIO.new(metricRecord))) + #@log.info "pushed mdmgenericmetric: #{k},#{v}" + end + } end rescue => errorStr @log.info "getMetricRecords:Error: #{errorStr} for record #{record}" @@ -375,7 +394,7 @@ def getMetricRecords(record) end def is_numeric(o) - true if Float(o) rescue false + true if Float(o) rescue false end def getContainerResourceUtilizationThresholds diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index 73e3af471..be1a9de64 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -57,36 +57,37 @@ class Constants MDM_NODE_MEMORY_RSS_PERCENTAGE = "memoryRssPercentage" MDM_NODE_MEMORY_WORKING_SET_PERCENTAGE = "memoryWorkingSetPercentage" - CONTAINER_TERMINATED_RECENTLY_IN_MINUTES = 5 - OBJECT_NAME_K8S_CONTAINER = "K8SContainer" - OBJECT_NAME_K8S_NODE = "K8SNode" - CPU_USAGE_NANO_CORES = "cpuUsageNanoCores" - CPU_USAGE_MILLI_CORES = "cpuUsageMillicores" - MEMORY_WORKING_SET_BYTES= "memoryWorkingSetBytes" - MEMORY_RSS_BYTES = "memoryRssBytes" - PV_USED_BYTES = "pvUsedBytes" - DEFAULT_MDM_CPU_UTILIZATION_THRESHOLD = 95.0 - DEFAULT_MDM_MEMORY_RSS_THRESHOLD = 95.0 - DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD = 95.0 - DEFAULT_MDM_PV_UTILIZATION_THRESHOLD = 60.0 - CONTROLLER_KIND_JOB = "job" - CONTAINER_TERMINATION_REASON_COMPLETED = "completed" - CONTAINER_STATE_TERMINATED = "terminated" - STALE_JOB_TIME_IN_MINUTES = 360 - TELEGRAF_DISK_METRICS = "container.azm.ms/disk" - OMSAGENT_ZERO_FILL = "omsagent" - KUBESYSTEM_NAMESPACE_ZERO_FILL = "kube-system" + CONTAINER_TERMINATED_RECENTLY_IN_MINUTES = 5 + OBJECT_NAME_K8S_CONTAINER = "K8SContainer" + OBJECT_NAME_K8S_NODE = "K8SNode" + CPU_USAGE_NANO_CORES = "cpuUsageNanoCores" + CPU_USAGE_MILLI_CORES = "cpuUsageMillicores" + MEMORY_WORKING_SET_BYTES = "memoryWorkingSetBytes" + MEMORY_RSS_BYTES = "memoryRssBytes" + PV_USED_BYTES = "pvUsedBytes" + DEFAULT_MDM_CPU_UTILIZATION_THRESHOLD = 95.0 + DEFAULT_MDM_MEMORY_RSS_THRESHOLD = 95.0 + DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD = 95.0 + DEFAULT_MDM_PV_UTILIZATION_THRESHOLD = 60.0 + CONTROLLER_KIND_JOB = "job" + CONTAINER_TERMINATION_REASON_COMPLETED = "completed" + CONTAINER_STATE_TERMINATED = "terminated" + STALE_JOB_TIME_IN_MINUTES = 360 + TELEGRAF_DISK_METRICS = "container.azm.ms/disk" + 
OMSAGENT_ZERO_FILL = "omsagent" + KUBESYSTEM_NAMESPACE_ZERO_FILL = "kube-system" - #Telemetry constants - CONTAINER_METRICS_HEART_BEAT_EVENT = "ContainerMetricsMdmHeartBeatEvent" - POD_READY_PERCENTAGE_HEART_BEAT_EVENT = "PodReadyPercentageMdmHeartBeatEvent" - CONTAINER_RESOURCE_UTIL_HEART_BEAT_EVENT = "ContainerResourceUtilMdmHeartBeatEvent" - PV_USAGE_HEART_BEAT_EVENT = "PVUsageMdmHeartBeatEvent" - PV_KUBE_SYSTEM_METRICS_ENABLED_EVENT = "CollectPVKubeSystemMetricsEnabled" - TELEMETRY_FLUSH_INTERVAL_IN_MINUTES = 10 - KUBE_STATE_TELEMETRY_FLUSH_INTERVAL_IN_MINUTES = 15 - MDM_TIME_SERIES_FLUSHED_IN_LAST_HOUR = "MdmTimeSeriesFlushedInLastHour" + #Telemetry constants + CONTAINER_METRICS_HEART_BEAT_EVENT = "ContainerMetricsMdmHeartBeatEvent" + POD_READY_PERCENTAGE_HEART_BEAT_EVENT = "PodReadyPercentageMdmHeartBeatEvent" + CONTAINER_RESOURCE_UTIL_HEART_BEAT_EVENT = "ContainerResourceUtilMdmHeartBeatEvent" + PV_USAGE_HEART_BEAT_EVENT = "PVUsageMdmHeartBeatEvent" + PV_KUBE_SYSTEM_METRICS_ENABLED_EVENT = "CollectPVKubeSystemMetricsEnabled" + TELEMETRY_FLUSH_INTERVAL_IN_MINUTES = 10 + KUBE_STATE_TELEMETRY_FLUSH_INTERVAL_IN_MINUTES = 15 + ZERO_FILL_METRICS_INTERVAL_IN_MINUTES = 30 + MDM_TIME_SERIES_FLUSHED_IN_LAST_HOUR = "MdmTimeSeriesFlushedInLastHour" - #Pod Statuses - POD_STATUS_TERMINATING = "Terminating" -end \ No newline at end of file + #Pod Statuses + POD_STATUS_TERMINATING = "Terminating" +end From f1657c65f2408bfd66a45cfa54c2d8a27770ac6a Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Wed, 30 Sep 2020 18:13:25 -0700 Subject: [PATCH 029/194] Gangams/arc k8s related scripts, charts and doc updates (#450) * checksum annotations * script update for chart from mcr * chart updates * update chart version to match with chart release * script updates * latest chart updates * version updates for chart release * script updates * script updates * doc updates * doc updates * update comments * fix bug in ps script * fix bug in ps script * minor update * release process updates * use consistent name across scripts * use consistent names --- ....sh => push-helm-chart-to-canary-repos.sh} | 24 +- .pipelines/push-helm-chart-to-prod-repos.sh | 53 ++ ReleaseProcess.md | 5 +- charts/azuremonitor-containers/Chart.yaml | 2 +- .../templates/omsagent-daemonset-windows.yaml | 2 + .../templates/omsagent-daemonset.yaml | 3 + .../templates/omsagent-deployment.yaml | 3 + charts/azuremonitor-containers/values.yaml | 6 +- .../onboarding/managed/disable-monitoring.ps1 | 12 +- .../onboarding/managed/disable-monitoring.sh | 10 +- .../onboarding/managed/enable-monitoring.ps1 | 118 ++-- .../onboarding/managed/enable-monitoring.sh | 552 +++++++++--------- .../onboarding/managed/upgrade-monitoring.sh | 314 ++++++++++ 13 files changed, 733 insertions(+), 371 deletions(-) rename .pipelines/{push-helm-chart-as-oci-artifact.sh => push-helm-chart-to-canary-repos.sh} (54%) create mode 100644 .pipelines/push-helm-chart-to-prod-repos.sh create mode 100644 scripts/onboarding/managed/upgrade-monitoring.sh diff --git a/.pipelines/push-helm-chart-as-oci-artifact.sh b/.pipelines/push-helm-chart-to-canary-repos.sh similarity index 54% rename from .pipelines/push-helm-chart-as-oci-artifact.sh rename to .pipelines/push-helm-chart-to-canary-repos.sh index 50e16e3d0..db8bff56e 100644 --- a/.pipelines/push-helm-chart-as-oci-artifact.sh +++ b/.pipelines/push-helm-chart-to-canary-repos.sh @@ -1,8 +1,9 @@ #!/bin/bash -# push the helm chart as an OCI artifact to specified ACR # working directory of this script should be 
charts/azuremonitor-containers
-export REPO_PATH="batch1/test/azure-monitor-containers"
+# note: this repo is registered in the Arc K8s extension for the canary region
+export REPO_PATH="public/azuremonitor/containerinsights/canary/preview/azuremonitor-containers"
+
 export HELM_EXPERIMENTAL_OCI=1

 for ARGUMENT in "$@"
@@ -11,13 +12,13 @@ do
   VALUE=$(echo $ARGUMENT | cut -f2 -d=)

   case "$KEY" in
-    CIARCACR) CIARCACR=$VALUE ;;
+    CIACR) CIACR=$VALUE ;;
     CICHARTVERSION) CHARTVERSION=$VALUE ;;
     *)
   esac
done

-echo "CI ARC K8S ACR: ${CIARCACR}"
+echo "CI ARC K8S ACR: ${CIACR}"
echo "CI HELM CHART VERSION: ${CHARTVERSION}"

echo "start: read appid and appsecret"
@@ -25,18 +26,19 @@ ACR_APP_ID=$(cat ~/acrappid)
ACR_APP_SECRET=$(cat ~/acrappsecret)
echo "end: read appid and appsecret"

-ACR=${CIARCACR}
+ACR=${CIACR}
+
+echo "login to acr:${ACR} using helm"
+helm registry login $ACR --username $ACR_APP_ID --password $ACR_APP_SECRET

-echo "login to acr:${ACR} using oras"
-oras login $ACR --username $ACR_APP_ID --password $ACR_APP_SECRET
echo "login to acr:${ACR} completed: ${ACR}"

echo "start: push the chart version: ${CHARTVERSION} to acr repo: ${ACR}"

-echo "generate helm package"
-helm package .
+echo "save the chart locally with acr full path"
+helm chart save . ${ACR}/${REPO_PATH}:${CHARTVERSION}

-echo "pushing the helm chart as an OCI artifact"
-oras push ${ACR}/${REPO_PATH}:${CHARTVERSION} --manifest-config /dev/null:application/vnd.unknown.config.v1+json ./azuremonitor-containers-${CHARTVERSION}.tgz:application/tar+gzip
+echo "pushing the helm chart to ACR: ${ACR}"
+helm chart push ${ACR}/${REPO_PATH}:${CHARTVERSION}

echo "end: push the chart version: ${CHARTVERSION} to acr repo: ${ACR}"
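The rewrite above swaps the raw oras push for Helm's native OCI support: the client logs into the registry, saves the packaged chart into the local cache under its fully qualified registry path, then pushes that reference. A minimal sketch of the same pattern, assuming Helm 3.x with the experimental OCI feature enabled; the registry name, repo path and credential variables below are placeholders, not the pipeline's real values:

#!/bin/bash
export HELM_EXPERIMENTAL_OCI=1                # helm 3 OCI support is still experimental
ACR="myregistry.azurecr.io"                   # placeholder registry
REPO_PATH="charts/azuremonitor-containers"    # placeholder repo path
CHARTVERSION="2.7.6"
# authenticate the helm client against the registry (placeholder credential vars)
helm registry login $ACR --username $APP_ID --password $APP_SECRET
# 'chart save' records the chart in the working directory into the local cache under the target ref
helm chart save . ${ACR}/${REPO_PATH}:${CHARTVERSION}
# 'chart push' uploads that cached ref to the registry as an OCI artifact
helm chart push ${ACR}/${REPO_PATH}:${CHARTVERSION}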
diff --git a/.pipelines/push-helm-chart-to-prod-repos.sh b/.pipelines/push-helm-chart-to-prod-repos.sh
new file mode 100644
index 000000000..71aa989de
--- /dev/null
+++ b/.pipelines/push-helm-chart-to-prod-repos.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+# working directory of this script should be charts/azuremonitor-containers
+
+# this repo is used for the public preview release without the Arc K8s extension
+export PROD_REPO_PATH="public/azuremonitor/containerinsights/preview/azuremonitor-containers"
+
+# note: this repo is registered in the Arc K8s extension for the prod group1 regions.
+export EXTENSION_PROD_REPO_PATH="public/azuremonitor/containerinsights/prod1/preview/azuremonitor-containers"
+
+export HELM_EXPERIMENTAL_OCI=1
+
+for ARGUMENT in "$@"
+do
+    KEY=$(echo $ARGUMENT | cut -f1 -d=)
+    VALUE=$(echo $ARGUMENT | cut -f2 -d=)
+
+    case "$KEY" in
+    CIACR) CIACR=$VALUE ;;
+    CICHARTVERSION) CHARTVERSION=$VALUE ;;
+    *)
+    esac
+done
+
+echo "CI ARC K8S ACR: ${CIACR}"
+echo "CI HELM CHART VERSION: ${CHARTVERSION}"
+
+echo "start: read appid and appsecret"
+ACR_APP_ID=$(cat ~/acrappid)
+ACR_APP_SECRET=$(cat ~/acrappsecret)
+echo "end: read appid and appsecret"
+
+ACR=${CIACR}
+
+echo "login to acr:${ACR} using helm"
+helm registry login $ACR --username $ACR_APP_ID --password $ACR_APP_SECRET
+
+echo "login to acr:${ACR} completed: ${ACR}"
+
+echo "start: push the chart version: ${CHARTVERSION} to acr repo: ${ACR}"
+
+echo "save the chart locally with acr full path: ${ACR}/${EXTENSION_PROD_REPO_PATH}:${CHARTVERSION}"
+helm chart save . ${ACR}/${EXTENSION_PROD_REPO_PATH}:${CHARTVERSION}
+
+echo "save the chart locally with acr full path: ${ACR}/${PROD_REPO_PATH}:${CHARTVERSION}"
+helm chart save . ${ACR}/${PROD_REPO_PATH}:${CHARTVERSION}
+
+echo "pushing the helm chart to ACR: ${ACR}/${EXTENSION_PROD_REPO_PATH}:${CHARTVERSION}"
+helm chart push ${ACR}/${EXTENSION_PROD_REPO_PATH}:${CHARTVERSION}
+
+echo "pushing the helm chart to ACR: ${ACR}/${PROD_REPO_PATH}:${CHARTVERSION}"
+helm chart push ${ACR}/${PROD_REPO_PATH}:${CHARTVERSION}
+
+echo "end: push the chart version: ${CHARTVERSION} to acr repo: ${ACR}"
diff --git a/ReleaseProcess.md b/ReleaseProcess.md
index 19802e22c..2a3e6001a 100644
--- a/ReleaseProcess.md
+++ b/ReleaseProcess.md
@@ -45,7 +45,10 @@ Make PR against [AKS-Engine](https://github.com/Azure/aks-engine). Refer PR http

 ## ARO v4, On-prem K8s, Azure Arc K8s and OpenShift v4 clusters

-Make PR against [HELM-charts](https://github.com/helm/charts) with Azure Monitor for containers chart update.
+Make sure the azuremonitor-containers chart yamls are updated with all the changes going into the release, and also make sure to bump the chart version, image tag and docker provider version. As with the agent container image, the build pipeline automatically pushes the chart to the Container Insights prod ACR, to the canary and prod repos accordingly.
+Both the agent and the helm chart will be replicated to `mcr.microsoft.com`.
+
+Customers onboard monitoring to these clusters using the onboarding scripts under the `onboarding\managed` directory, so please bump the chart version for the prod release. Once we move to the Arc K8s Monitoring extension Public preview, these steps will be taken care of, so no manual changes like this will be required.

 # 4. Monitor agent roll-out status
diff --git a/charts/azuremonitor-containers/Chart.yaml b/charts/azuremonitor-containers/Chart.yaml
index 8976b5561..1d3fed86f 100644
--- a/charts/azuremonitor-containers/Chart.yaml
+++ b/charts/azuremonitor-containers/Chart.yaml
@@ -2,7 +2,7 @@ apiVersion: v1
 appVersion: 7.0.0-1
 description: Helm chart for deploying Azure Monitor container monitoring agent in Kubernetes
 name: azuremonitor-containers
-version: 2.7.4
+version: 2.7.6
 kubeVersion: "^1.10.0-0"
 keywords:
   - monitoring
diff --git a/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml b/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml
index 72b09f6c1..e65f9a98d 100644
--- a/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml
+++ b/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml
@@ -24,6 +24,8 @@ spec:
         agentVersion: {{ .Values.omsagent.image.tagWindows }}
         dockerProviderVersion: {{ .Values.omsagent.image.dockerProviderVersion }}
         schema-versions: "v1"
+        checksum/secret: {{ include (print $.Template.BasePath "/omsagent-secret.yaml") . | sha256sum }}
+        checksum/config: {{ toYaml .Values.omsagent.resources | sha256sum }}
     spec:
 {{- if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion }}
       nodeSelector:
diff --git a/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml b/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml
index 7514247a0..438294ce5 100644
--- a/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml
+++ b/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml
@@ -24,6 +24,9 @@ spec:
         agentVersion: {{ .Values.omsagent.image.tag }}
         dockerProviderVersion: {{ .Values.omsagent.image.dockerProviderVersion }}
         schema-versions: "v1"
+        checksum/secret: {{ include (print $.Template.BasePath "/omsagent-secret.yaml") . 
| sha256sum }} + checksum/config: {{ toYaml .Values.omsagent.resources | sha256sum }} + checksum/logsettings: {{ toYaml .Values.omsagent.logsettings | sha256sum }} spec: {{- if .Values.omsagent.rbac }} serviceAccountName: omsagent diff --git a/charts/azuremonitor-containers/templates/omsagent-deployment.yaml b/charts/azuremonitor-containers/templates/omsagent-deployment.yaml index 7d7ac7040..8609d25c9 100644 --- a/charts/azuremonitor-containers/templates/omsagent-deployment.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-deployment.yaml @@ -25,6 +25,9 @@ spec: agentVersion: {{ .Values.omsagent.image.tag }} dockerProviderVersion: {{ .Values.omsagent.image.dockerProviderVersion }} schema-versions: "v1" + checksum/secret: {{ include (print $.Template.BasePath "/omsagent-secret.yaml") . | sha256sum }} + checksum/config: {{ toYaml .Values.omsagent.resources | sha256sum }} + checksum/logsettings: {{ toYaml .Values.omsagent.logsettings | sha256sum }} spec: {{- if .Values.omsagent.rbac }} serviceAccountName: omsagent diff --git a/charts/azuremonitor-containers/values.yaml b/charts/azuremonitor-containers/values.yaml index 1804d1197..2711cb372 100644 --- a/charts/azuremonitor-containers/values.yaml +++ b/charts/azuremonitor-containers/values.yaml @@ -12,10 +12,10 @@ Azure: omsagent: image: repo: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod" - tag: "ciprod09162020" - tagWindows: "win-ciprod09162020" + tag: "ciprod09252020" + tagWindows: "win-ciprod09252020" pullPolicy: IfNotPresent - dockerProviderVersion: "10.0.0-5" + dockerProviderVersion: "10.0.0-6" agentVersion: "1.10.0.1" ## To get your workspace id and key do the following ## You can create a Azure Loganalytics workspace from portal.azure.com and get its ID & PRIMARY KEY from 'Advanced Settings' tab in the Ux. diff --git a/scripts/onboarding/managed/disable-monitoring.ps1 b/scripts/onboarding/managed/disable-monitoring.ps1 index 8945f90b6..1c011bfff 100644 --- a/scripts/onboarding/managed/disable-monitoring.ps1 +++ b/scripts/onboarding/managed/disable-monitoring.ps1 @@ -1,12 +1,12 @@ <# .DESCRIPTION - Disables Azure Monitor for containers to monitoring enabled Azure Managed K8s cluster such as Azure Arc K8s, ARO v4 and AKS etc. + Disables Azure Monitor for containers to monitoring enabled Azure Managed K8s cluster such as Azure Arc enabled Kubernetes, ARO v4 and AKS etc. 1. Deletes the existing Azure Monitor for containers helm release 2. Deletes logAnalyticsWorkspaceResourceId tag on the provided Managed cluster .PARAMETER clusterResourceId - Id of the Azure Managed Cluster such as Azure ARC K8s, ARO v4 etc. + Id of the Azure Managed Cluster such as Azure Arc enabled Kubernetes, ARO v4 etc. .PARAMETER servicePrincipalClientId client Id of the service principal which will be used for the azure login .PARAMETER servicePrincipalClientSecret @@ -18,7 +18,7 @@ Pre-requisites: - Azure Managed cluster Resource Id - - Contributor role permission on the Subscription of the Azure Arc Cluster + - Contributor role permission on the Subscription of the Azure Arc enabled Kubernetes Cluster - Helm v3.0.0 or higher https://github.com/helm/helm/releases - kube-context of the K8s cluster Note: 1. Please make sure you have all the pre-requisistes before running this script. 
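A note for readers following these onboarding and offboarding diffs: both the PowerShell and shell scripts refuse to proceed unless the Arc cluster's managed identity is system-assigned. A standalone sketch of that guard, assuming the Azure CLI is installed and logged in; the resource group and cluster name below are placeholders:

#!/bin/bash
clusterResourceGroup="my-rg"        # placeholder
clusterName="my-arc-cluster"        # placeholder
resourceProvider="Microsoft.Kubernetes/connectedClusters"
# query the identity type of the connected cluster resource
identitytype=$(az resource show -g ${clusterResourceGroup} -n ${clusterName} --resource-type ${resourceProvider} --query identity.type)
# normalize: lowercase and strip surrounding quotes before comparing
identitytype=$(echo $identitytype | tr "[:upper:]" "[:lower:]" | tr -d '"')
if [[ "$identitytype" != "systemassigned" ]]; then
  echo "only supported cluster identity is systemassigned for Azure Arc enabled Kubernetes cluster type"
  exit 1
fi

This mirrors the check in disable-monitoring.sh shown above; the PowerShell scripts perform the same validation via $clusterResource.identity.type.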
@@ -298,7 +298,7 @@ if ($isArcK8sCluster -eq $true) {
     # validate identity
     $clusterIdentity = $clusterResource.identity.type.ToString().ToLower()
     if ($clusterIdentity.Contains("systemassigned") -eq $false) {
-        Write-Host("Identity of Azure Arc K8s cluster should be systemassigned but it has identity: $clusterIdentity") -ForegroundColor Red
+        Write-Host("Identity of Azure Arc enabled Kubernetes cluster should be systemassigned but it has identity: $clusterIdentity") -ForegroundColor Red
         exit
     }
 }
@@ -354,7 +354,3 @@ catch {
 }
 Write-Host("Successfully disabled Azure Monitor for containers for cluster: $clusteResourceId") -ForegroundColor Green
-
-
-
-
diff --git a/scripts/onboarding/managed/disable-monitoring.sh b/scripts/onboarding/managed/disable-monitoring.sh
index f20bd7d33..c11426f30 100644
--- a/scripts/onboarding/managed/disable-monitoring.sh
+++ b/scripts/onboarding/managed/disable-monitoring.sh
@@ -26,10 +26,10 @@ set -o pipefail
 # default release name used during onboarding
 releaseName="azmon-containers-release-1"

-# resource type for azure arc clusters
+# resource type for Azure Arc enabled Kubernetes clusters
 resourceProvider="Microsoft.Kubernetes/connectedClusters"

-# resource provider for azure arc connected cluster
+# resource provider for Azure Arc enabled Kubernetes cluster
 arcK8sResourceProvider="Microsoft.Kubernetes/connectedClusters"
 # resource provider for azure redhat openshift v4 cluster
 aroV4ResourceProvider="Microsoft.RedHatOpenShift/OpenShiftClusters"
@@ -125,13 +125,13 @@ remove_monitoring_tags()
   echo "set the cluster subscription id: ${clusterSubscriptionId}"
   az account set -s ${clusterSubscriptionId}

-  # validate cluster identity for ARC k8s cluster
+  # validate cluster identity for Azure Arc enabled Kubernetes cluster
   if [ "$isArcK8sCluster" = true ] ; then
      identitytype=$(az resource show -g ${clusterResourceGroup} -n ${clusterName} --resource-type $resourceProvider --query identity.type)
      identitytype=$(echo $identitytype | tr "[:upper:]" "[:lower:]" | tr -d '"')
      echo "cluster identity type:" $identitytype
      if [[ "$identitytype" != "systemassigned" ]]; then
-        echo "-e only supported cluster identity is systemassigned for Azure ARC K8s cluster type"
+        echo "-e only supported cluster identity is systemassigned for Azure Arc enabled Kubernetes cluster type"
         exit 1
      fi
   fi
@@ -257,7 +257,7 @@ done

 # detect the resource provider from the provider name in the cluster resource id
 if [ $providerName = "microsoft.kubernetes/connectedclusters" ]; then
-   echo "provider cluster resource is of Azure ARC K8s cluster type"
+   echo "provider cluster resource is of Azure Arc enabled Kubernetes cluster type"
    isArcK8sCluster=true
    resourceProvider=$arcK8sResourceProvider
 elif [ $providerName = "microsoft.redhatopenshift/openshiftclusters" ]; then
diff --git a/scripts/onboarding/managed/enable-monitoring.ps1 b/scripts/onboarding/managed/enable-monitoring.ps1
index 338de6cbc..1e1669400 100644
--- a/scripts/onboarding/managed/enable-monitoring.ps1
+++ b/scripts/onboarding/managed/enable-monitoring.ps1
@@ -1,14 +1,14 @@
 <#
     .DESCRIPTION
-      Onboards Azure Monitor for containers to Azure Managed Kuberenetes such as Azure Arc K8s, ARO v4 and AKS etc.
+      Onboards Azure Monitor for containers to Azure Managed Kubernetes such as Azure Arc enabled Kubernetes, ARO v4 and AKS etc.
       1. Creates the Default Azure log analytics workspace if doesn't exist one in specified subscription
       2. Adds the ContainerInsights solution to the Azure log analytics workspace
       3. 
Adds the workspaceResourceId tag or enable addon (if the cluster is AKS) on the provided Managed cluster resource id 4. Installs Azure Monitor for containers HELM chart to the K8s cluster in provided via --kube-context .PARAMETER clusterResourceId - Id of the Azure Managed Cluster such as Azure ARC K8s, ARO v4 etc. + Id of the Azure Managed Cluster such as Azure Arc enabled Kubernetes, ARO v4 etc. .PARAMETER servicePrincipalClientId Client Id of the service principal which will be used for the azure login .PARAMETER servicePrincipalClientSecret @@ -22,10 +22,6 @@ .PARAMETER proxyEndpoint (optional) Provide Proxy endpoint if you have K8s cluster behind the proxy and would like to route Azure Monitor for containers outbound traffic via proxy. Format of the proxy endpoint should be http(s://:@: - .PARAMETER helmRepoName (optional) - helm repo name. should be used only for the private preview features - .PARAMETER helmRepoUrl (optional) - helm repo url. should be used only for the private preview features Pre-requisites: - Azure Managed cluster Resource Id @@ -50,30 +46,23 @@ param( [Parameter(mandatory = $false)] [string]$workspaceResourceId, [Parameter(mandatory = $false)] - [string]$proxyEndpoint, - [Parameter(mandatory = $false)] - [string]$helmRepoName, - [Parameter(mandatory = $false)] - [string]$helmRepoUrl + [string]$proxyEndpoint ) -$solutionTemplateUri= "https://raw.githubusercontent.com/microsoft/Docker-Provider/ci_dev/scripts/onboarding/templates/azuremonitor-containerSolution.json" +$solutionTemplateUri = "https://raw.githubusercontent.com/microsoft/Docker-Provider/ci_dev/scripts/onboarding/templates/azuremonitor-containerSolution.json" $helmChartReleaseName = "azmon-containers-release-1" $helmChartName = "azuremonitor-containers" -$helmChartRepoName = "incubator" -$helmChartRepoUrl = "https://kubernetes-charts-incubator.storage.googleapis.com/" + # flags to indicate the cluster types $isArcK8sCluster = $false -$isAksCluster = $false +$isAksCluster = $false $isUsingServicePrincipal = $false -if([string]::IsNullOrEmpty($helmRepoName) -eq $false){ - $helmChartRepoName = $helmRepoName -} - -if([string]::IsNullOrEmpty($helmRepoUrl) -eq $false){ - $helmChartRepoUrl = $helmRepoUrl -} +# released chart version in mcr +$mcr = "mcr.microsoft.com" +$mcrChartVersion = "2.7.6" +$mcrChartRepoPath = "azuremonitor/containerinsights/preview/azuremonitor-containers" +$helmLocalRepoName = "." 
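The nine-segment check above works because a fully qualified ARM resource id always has the shape /subscriptions/{sub}/resourceGroups/{rg}/providers/{namespace}/{type}/{name}: splitting on "/" yields nine fields, the first of which is the empty string before the leading slash. A shell sketch of the same decomposition these scripts rely on; the resource id below is a made-up example:

#!/bin/bash
clusterResourceId="/subscriptions/00000000-0000-0000-0000-000000000000/resourceGroups/my-rg/providers/Microsoft.Kubernetes/connectedClusters/my-cluster"
# field 1 is empty, so the useful parts start at field 3
subscriptionId="$(echo ${clusterResourceId} | cut -d'/' -f3)"
resourceGroup="$(echo ${clusterResourceId} | cut -d'/' -f5)"
providerName="$(echo ${clusterResourceId} | cut -d'/' -f7-8)"   # e.g. Microsoft.Kubernetes/connectedClusters
clusterName="$(echo ${clusterResourceId} | cut -d'/' -f9)"
echo "$subscriptionId $resourceGroup $providerName $clusterName"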
# checks the required Powershell modules exist and if not exists, request the user permission to install $azAccountModule = Get-Module -ListAvailable -Name Az.Accounts @@ -200,7 +189,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule) -or ($null - } if ([string]::IsNullOrEmpty($clusterResourceId)) { - Write-Host("Specified Azure Arc ClusterResourceId should not be NULL or empty") -ForegroundColor Red + Write-Host("Specified Azure Arc enabled Kubernetes ClusterResourceId should not be NULL or empty") -ForegroundColor Red exit } @@ -220,30 +209,31 @@ if ($clusterResourceId.StartsWith("/") -eq $false) { $clusterResourceId = "/" + $clusterResourceId } -if ($clusterResourceId.Split("/").Length -ne 9){ - Write-Host("Provided Cluster Resource Id is not in expected format") -ForegroundColor Red +if ($clusterResourceId.Split("/").Length -ne 9) { + Write-Host("Provided Cluster Resource Id is not in expected format") -ForegroundColor Red exit } if (($clusterResourceId.ToLower().Contains("microsoft.kubernetes/connectedclusters") -ne $true) -and ($clusterResourceId.ToLower().Contains("microsoft.redhatopenshift/openshiftclusters") -ne $true) -and ($clusterResourceId.ToLower().Contains("microsoft.containerservice/managedclusters") -ne $true) - ) { +) { Write-Host("Provided cluster ResourceId is not supported cluster type: $clusterResourceId") -ForegroundColor Red exit } -if(([string]::IsNullOrEmpty($servicePrincipalClientId) -eq $false) -and - ([string]::IsNullOrEmpty($servicePrincipalClientSecret) -eq $false) -and - ([string]::IsNullOrEmpty($tenantId) -eq $false)) { - Write-Host("Using service principal creds for the azure login since these provided.") - $isUsingServicePrincipal = $true +if (([string]::IsNullOrEmpty($servicePrincipalClientId) -eq $false) -and + ([string]::IsNullOrEmpty($servicePrincipalClientSecret) -eq $false) -and + ([string]::IsNullOrEmpty($tenantId) -eq $false)) { + Write-Host("Using service principal creds for the azure login since these provided.") + $isUsingServicePrincipal = $true } if ($clusterResourceId.ToLower().Contains("microsoft.kubernetes/connectedclusters") -eq $true) { - $isArcK8sCluster = $true -} elseif ($clusterResourceId.ToLower().Contains("microsoft.containerservice/managedclusters") -eq $true) { - $isAksCluster = $true + $isArcK8sCluster = $true +} +elseif ($clusterResourceId.ToLower().Contains("microsoft.containerservice/managedclusters") -eq $true) { + $isAksCluster = $true } $resourceParts = $clusterResourceId.Split("/") @@ -253,7 +243,7 @@ Write-Host("Cluster SubscriptionId : '" + $clusterSubscriptionId + "' ") -Foregr if ($isUsingServicePrincipal) { $spSecret = ConvertTo-SecureString -String $servicePrincipalClientSecret -AsPlainText -Force - $spCreds = New-Object -TypeName "System.Management.Automation.PSCredential" -ArgumentList $servicePrincipalClientId,$spSecret + $spCreds = New-Object -TypeName "System.Management.Automation.PSCredential" -ArgumentList $servicePrincipalClientId, $spSecret Connect-AzAccount -ServicePrincipal -Credential $spCreds -Tenant $tenantId -Subscription $clusterSubscriptionId } @@ -275,12 +265,13 @@ if ($null -eq $account.Account) { try { if ($isUsingServicePrincipal) { $spSecret = ConvertTo-SecureString -String $servicePrincipalClientSecret -AsPlainText -Force - $spCreds = New-Object -TypeName "System.Management.Automation.PSCredential" -ArgumentList $servicePrincipalClientId,$spSecret + $spCreds = New-Object -TypeName "System.Management.Automation.PSCredential" -ArgumentList $servicePrincipalClientId, 
$spSecret Connect-AzAccount -ServicePrincipal -Credential $spCreds -Tenant $tenantId -Subscription $clusterSubscriptionId - } else { - Write-Host("Please login...") - Connect-AzAccount -subscriptionid $clusterSubscriptionId - } + } + else { + Write-Host("Please login...") + Connect-AzAccount -subscriptionid $clusterSubscriptionId + } } catch { Write-Host("") @@ -322,12 +313,12 @@ if ($null -eq $clusterResource) { $clusterRegion = $clusterResource.Location.ToLower() if ($isArcK8sCluster -eq $true) { - # validate identity - $clusterIdentity = $clusterResource.identity.type.ToString().ToLower() - if ($clusterIdentity.contains("systemassigned") -eq $false) { - Write-Host("Identity of Azure Arc K8s cluster should be systemassigned but it has identity: $clusterIdentity") -ForegroundColor Red - exit - } + # validate identity + $clusterIdentity = $clusterResource.identity.type.ToString().ToLower() + if ($clusterIdentity.contains("systemassigned") -eq $false) { + Write-Host("Identity of Azure Arc enabled Kubernetes cluster should be systemassigned but it has identity: $clusterIdentity") -ForegroundColor Red + exit + } } if ([string]::IsNullOrEmpty($workspaceResourceId)) { @@ -514,7 +505,8 @@ if ($account.Subscription.Id -eq $clusterSubscriptionId) { if ($isAksCluster -eq $true) { Write-Host ("Enabling AKS Monitoring Addon ..") # TBD -} else { +} +else { Write-Host("Attaching workspaceResourceId tag on the cluster ResourceId") $clusterResource.Tags["logAnalyticsWorkspaceResourceId"] = $WorkspaceInformation.ResourceId Set-AzResource -Tag $clusterResource.Tags -ResourceId $clusterResource.ResourceId -Force @@ -526,20 +518,30 @@ Write-Host "Helm version" : $helmVersion Write-Host("Installing or upgrading if exists, Azure Monitor for containers HELM chart ...") try { - Write-Host("Adding $helmChartRepoName repo to helm: $helmChartRepoUrl") - helm repo add $helmChartRepoName $helmChartRepoUrl - Write-Host("updating helm repo to get latest version of charts") - helm repo update + Write-Host("pull the chart from mcr.microsoft.com") + [System.Environment]::SetEnvironmentVariable("HELM_EXPERIMENTAL_OCI", 1, "Process") + + Write-Host("pull the chart from mcr.microsoft.com") + helm chart pull ${mcr}/${mcrChartRepoPath}:${mcrChartVersion} + + Write-Host("export the chart from local cache to current directory") + helm chart export ${mcr}/${mcrChartRepoPath}:${mcrChartVersion} --destination . 
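On the consumption side shown here, the chart travels the opposite direction from the push pipeline: pull the OCI artifact from MCR into the local cache, export it to a plain chart directory, then install from that directory. A condensed shell sketch under the same assumption of Helm 3.x with the experimental OCI feature; the workspace id and key are placeholder environment variables:

#!/bin/bash
export HELM_EXPERIMENTAL_OCI=1
mcr="mcr.microsoft.com"
mcrChartRepoPath="azuremonitor/containerinsights/preview/azuremonitor-containers"
mcrChartVersion="2.7.6"
# pull the released chart from MCR into helm's local registry cache
helm chart pull ${mcr}/${mcrChartRepoPath}:${mcrChartVersion}
# export the cached chart into ./azuremonitor-containers as a regular chart folder
helm chart export ${mcr}/${mcrChartRepoPath}:${mcrChartVersion} --destination .
# install from the exported folder like any local chart (placeholder workspace values)
helm upgrade --install azmon-containers-release-1 ./azuremonitor-containers \
  --set omsagent.secret.wsid="$WORKSPACE_GUID",omsagent.secret.key="$WORKSPACE_KEY"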
+ + $helmChartRepoPath = "${helmLocalRepoName}" + "/" + "${helmChartName}" + + Write-Host("helmChartRepoPath is : ${helmChartRepoPath}") + $helmParameters = "omsagent.secret.wsid=$workspaceGUID,omsagent.secret.key=$workspacePrimarySharedKey,omsagent.env.clusterId=$clusterResourceId,omsagent.env.clusterRegion=$clusterRegion" - if([string]::IsNullOrEmpty($proxyEndpoint) -eq $false) { + if ([string]::IsNullOrEmpty($proxyEndpoint) -eq $false) { Write-Host("using proxy endpoint since its provided") $helmParameters = $helmParameters + ",omsagent.proxy=$proxyEndpoint" } if ([string]::IsNullOrEmpty($kubeContext)) { - helm upgrade --install $helmChartReleaseName --set $helmParameters $helmChartRepoName/$helmChartName - } else { - Write-Host("using provided kube-context: $kubeContext") - helm upgrade --install $helmChartReleaseName --set $helmParameters $helmChartRepoName/$helmChartName --kube-context $kubeContext + helm upgrade --install $helmChartReleaseName --set $helmParameters $helmChartRepoPath + } + else { + Write-Host("using provided kube-context: $kubeContext") + helm upgrade --install $helmChartReleaseName --set $helmParameters $helmChartRepoPath --kube-context $kubeContext } } catch { @@ -548,7 +550,3 @@ catch { Write-Host("Successfully enabled Azure Monitor for containers for cluster: $clusterResourceId") -ForegroundColor Green Write-Host("Proceed to https://aka.ms/azmon-containers to view your newly onboarded Azure Managed cluster") -ForegroundColor Green - - - - diff --git a/scripts/onboarding/managed/enable-monitoring.sh b/scripts/onboarding/managed/enable-monitoring.sh index 226fd978b..ce62a581a 100644 --- a/scripts/onboarding/managed/enable-monitoring.sh +++ b/scripts/onboarding/managed/enable-monitoring.sh @@ -41,9 +41,11 @@ set -o pipefail # default to public cloud since only supported cloud is azure public clod defaultAzureCloud="AzureCloud" -# helm repo details -helmRepoName="incubator" -helmRepoUrl="https://kubernetes-charts-incubator.storage.googleapis.com/" +# released chart version in mcr +mcrChartVersion="2.7.6" +mcr="mcr.microsoft.com" +mcrChartRepoPath="azuremonitor/containerinsights/preview/azuremonitor-containers" +helmLocalRepoName="." 
helmChartName="azuremonitor-containers" # default release name used during onboarding @@ -58,19 +60,18 @@ aroV4ResourceProvider="Microsoft.RedHatOpenShift/OpenShiftClusters" # resource provider for aks cluster aksResourceProvider="Microsoft.ContainerService/managedClusters" -# default of resourceProvider is arc k8s and this will get updated based on the provider cluster resource +# default of resourceProvider is Azure Arc enabled Kubernetes and this will get updated based on the provider cluster resource resourceProvider="Microsoft.Kubernetes/connectedClusters" - # resource type for azure log analytics workspace workspaceResourceProvider="Microsoft.OperationalInsights/workspaces" # openshift project name for aro v4 cluster openshiftProjectName="azure-monitor-for-containers" -# arc k8s cluster resource +# AROv4 cluster resource isAroV4Cluster=false -# arc k8s cluster resource +# Azure Arc enabled Kubernetes cluster resource isArcK8sCluster=false # aks cluster resource @@ -103,28 +104,25 @@ servicePrincipalClientSecret="" servicePrincipalTenantId="" isUsingServicePrincipal=false -usage() -{ - local basename=`basename $0` - echo - echo "Enable Azure Monitor for containers:" - echo "$basename --resource-id [--client-id ] [--client-secret ] [--tenant-id ] [--kube-context ] [--workspace-id ] [--proxy ]" +usage() { + local basename=$(basename $0) + echo + echo "Enable Azure Monitor for containers:" + echo "$basename --resource-id [--client-id ] [--client-secret ] [--tenant-id ] [--kube-context ] [--workspace-id ] [--proxy ]" } -parse_args() -{ +parse_args() { - if [ $# -le 1 ] - then + if [ $# -le 1 ]; then usage exit 1 - fi + fi -# Transform long options to short ones -for arg in "$@"; do - shift - case "$arg" in - "--resource-id") set -- "$@" "-r" ;; + # Transform long options to short ones + for arg in "$@"; do + shift + case "$arg" in + "--resource-id") set -- "$@" "-r" ;; "--kube-context") set -- "$@" "-k" ;; "--workspace-id") set -- "$@" "-w" ;; "--proxy") set -- "$@" "-p" ;; @@ -134,130 +132,128 @@ for arg in "$@"; do "--helm-repo-name") set -- "$@" "-n" ;; "--helm-repo-url") set -- "$@" "-u" ;; "--container-log-volume") set -- "$@" "-v" ;; - "--"*) usage ;; - *) set -- "$@" "$arg" - esac -done + "--"*) usage ;; + *) set -- "$@" "$arg" ;; + esac + done -local OPTIND opt + local OPTIND opt -while getopts 'hk:r:w:p:c:s:t:n:u:v:' opt; do + while getopts 'hk:r:w:p:c:s:t:n:u:v:' opt; do case "$opt" in - h) + h) + usage + ;; + + k) + kubeconfigContext="$OPTARG" + echo "name of kube-context is $OPTARG" + ;; + + r) + clusterResourceId="$OPTARG" + echo "clusterResourceId is $OPTARG" + ;; + + w) + workspaceResourceId="$OPTARG" + echo "workspaceResourceId is $OPTARG" + ;; + + p) + proxyEndpoint="$OPTARG" + echo "proxyEndpoint is $OPTARG" + ;; + + c) + servicePrincipalClientId="$OPTARG" + echo "servicePrincipalClientId is $OPTARG" + ;; + + s) + servicePrincipalClientSecret="$OPTARG" + echo "clientSecret is *****" + ;; + + t) + servicePrincipalTenantId="$OPTARG" + echo "service principal tenantId is $OPTARG" + ;; + + n) + helmRepoName="$OPTARG" + echo "helm repo name is $OPTARG" + ;; + + u) + helmRepoUrl="$OPTARG" + echo "helm repo url is $OPTARG" + ;; + + v) + containerLogVolume="$OPTARG" + echo "container log volume is $OPTARG" + ;; + + ?) 
usage - ;; - - k) - kubeconfigContext="$OPTARG" - echo "name of kube-context is $OPTARG" - ;; - - r) - clusterResourceId="$OPTARG" - echo "clusterResourceId is $OPTARG" - ;; - - w) - workspaceResourceId="$OPTARG" - echo "workspaceResourceId is $OPTARG" - ;; - - p) - proxyEndpoint="$OPTARG" - echo "proxyEndpoint is $OPTARG" - ;; - - c) - servicePrincipalClientId="$OPTARG" - echo "servicePrincipalClientId is $OPTARG" - ;; - - s) - servicePrincipalClientSecret="$OPTARG" - echo "clientSecret is *****" - ;; - - t) - servicePrincipalTenantId="$OPTARG" - echo "service principal tenantId is $OPTARG" - ;; - - n) - helmRepoName="$OPTARG" - echo "helm repo name is $OPTARG" - ;; - - u) - helmRepoUrl="$OPTARG" - echo "helm repo url is $OPTARG" - ;; - - v) - containerLogVolume="$OPTARG" - echo "container log volume is $OPTARG" - ;; - - ?) - usage - exit 1 - ;; + exit 1 + ;; esac done - shift "$(($OPTIND -1))" + shift "$(($OPTIND - 1))" + local subscriptionId="$(echo ${clusterResourceId} | cut -d'/' -f3)" + local resourceGroup="$(echo ${clusterResourceId} | cut -d'/' -f5)" - local subscriptionId="$(echo ${clusterResourceId} | cut -d'/' -f3)" - local resourceGroup="$(echo ${clusterResourceId} | cut -d'/' -f5)" + # get resource parts and join back to get the provider name + local providerNameResourcePart1="$(echo ${clusterResourceId} | cut -d'/' -f7)" + local providerNameResourcePart2="$(echo ${clusterResourceId} | cut -d'/' -f8)" + local providerName="$(echo ${providerNameResourcePart1}/${providerNameResourcePart2})" - # get resource parts and join back to get the provider name - local providerNameResourcePart1="$(echo ${clusterResourceId} | cut -d'/' -f7)" - local providerNameResourcePart2="$(echo ${clusterResourceId} | cut -d'/' -f8)" - local providerName="$(echo ${providerNameResourcePart1}/${providerNameResourcePart2} )" + local clusterName="$(echo ${clusterResourceId} | cut -d'/' -f9)" - local clusterName="$(echo ${clusterResourceId} | cut -d'/' -f9)" + # convert to lowercase for validation + providerName=$(echo $providerName | tr "[:upper:]" "[:lower:]") - # convert to lowercase for validation - providerName=$(echo $providerName | tr "[:upper:]" "[:lower:]") + echo "cluster SubscriptionId:" $subscriptionId + echo "cluster ResourceGroup:" $resourceGroup + echo "cluster ProviderName:" $providerName + echo "cluster Name:" $clusterName - echo "cluster SubscriptionId:" $subscriptionId - echo "cluster ResourceGroup:" $resourceGroup - echo "cluster ProviderName:" $providerName - echo "cluster Name:" $clusterName - - if [ -z "$subscriptionId" -o -z "$resourceGroup" -o -z "$providerName" -o -z "$clusterName" ]; then + if [ -z "$subscriptionId" -o -z "$resourceGroup" -o -z "$providerName" -o -z "$clusterName" ]; then echo "-e invalid cluster resource id. Please try with valid fully qualified resource id of the cluster" exit 1 - fi + fi - if [[ $providerName != microsoft.* ]]; then - echo "-e invalid azure cluster resource id format." - exit 1 - fi + if [[ $providerName != microsoft.* ]]; then + echo "-e invalid azure cluster resource id format." 
+ exit 1 + fi - # detect the resource provider from the provider name in the cluster resource id - # detect the resource provider from the provider name in the cluster resource id - if [ $providerName = "microsoft.kubernetes/connectedclusters" ]; then - echo "provider cluster resource is of Azure ARC K8s cluster type" + # detect the resource provider from the provider name in the cluster resource id + if [ $providerName = "microsoft.kubernetes/connectedclusters" ]; then + echo "provider cluster resource is of Azure Arc enabled Kubernetes cluster type" isArcK8sCluster=true resourceProvider=$arcK8sResourceProvider - elif [ $providerName = "microsoft.redhatopenshift/openshiftclusters" ]; then + elif [ $providerName = "microsoft.redhatopenshift/openshiftclusters" ]; then echo "provider cluster resource is of AROv4 cluster type" resourceProvider=$aroV4ResourceProvider isAroV4Cluster=true - elif [ $providerName = "microsoft.containerservice/managedclusters" ]; then + elif [ $providerName = "microsoft.containerservice/managedclusters" ]; then echo "provider cluster resource is of AKS cluster type" isAksCluster=true resourceProvider=$aksResourceProvider - else - echo "-e unsupported azure managed cluster type" - exit 1 - fi + else + echo "-e unsupported azure managed cluster type" + exit 1 + fi - if [ -z "$kubeconfigContext" ]; then + if [ -z "$kubeconfigContext" ]; then echo "using or getting current kube config context since --kube-context parameter not set " - fi + fi -if [ ! -z "$workspaceResourceId" ]; then + if [ ! -z "$workspaceResourceId" ]; then local workspaceSubscriptionId="$(echo $workspaceResourceId | cut -d'/' -f3)" local workspaceResourceGroup="$(echo $workspaceResourceId | cut -d'/' -f5)" local workspaceProviderName="$(echo $workspaceResourceId | cut -d'/' -f7)" @@ -269,13 +265,13 @@ if [ ! -z "$workspaceResourceId" ]; then echo "workspace ProviderName:" $workspaceName echo "workspace Name:" $workspaceName - if [[ $workspaceProviderName != microsoft.operationalinsights* ]]; then - echo "-e invalid azure log analytics resource id format." - exit 1 - fi -fi + if [[ $workspaceProviderName != microsoft.operationalinsights* ]]; then + echo "-e invalid azure log analytics resource id format." + exit 1 + fi + fi -if [ ! -z "$proxyEndpoint" ]; then + if [ ! -z "$proxyEndpoint" ]; then # Validate Proxy Endpoint URL # extract the protocol:// proto="$(echo $proxyEndpoint | grep :// | sed -e's,^\(.*://\).*,\1,g')" @@ -302,23 +298,21 @@ if [ ! -z "$proxyEndpoint" ]; then else echo "successfully validated provided proxy endpoint is valid and in expected format" fi -fi + fi -if [ ! -z "$servicePrincipalClientId" -a ! -z "$servicePrincipalClientSecret" -a ! -z "$servicePrincipalTenantId" ]; then - echo "using service principal creds (clientId, secret and tenantId) for azure login since provided" - isUsingServicePrincipal=true -fi + if [ ! -z "$servicePrincipalClientId" -a ! -z "$servicePrincipalClientSecret" -a ! 
-z "$servicePrincipalTenantId" ]; then + echo "using service principal creds (clientId, secret and tenantId) for azure login since provided" + isUsingServicePrincipal=true + fi } -configure_to_public_cloud() -{ +configure_to_public_cloud() { echo "Set AzureCloud as active cloud for az cli" az cloud set -n $defaultAzureCloud } -validate_cluster_identity() -{ +validate_cluster_identity() { echo "validating cluster identity" local rgName="$(echo ${1})" @@ -329,15 +323,14 @@ validate_cluster_identity() echo "cluster identity type:" $identitytype if [[ "$identitytype" != "systemassigned" ]]; then - echo "-e only supported cluster identity is systemassigned for Azure ARC K8s cluster type" - exit 1 + echo "-e only supported cluster identity is systemassigned for Azure Arc enabled Kubernetes cluster type" + exit 1 fi echo "successfully validated the identity of the cluster" } -create_default_log_analytics_workspace() -{ +create_default_log_analytics_workspace() { # extract subscription from cluster resource id local subscriptionId="$(echo $clusterResourceId | cut -d'/' -f3)" @@ -348,73 +341,71 @@ create_default_log_analytics_workspace() # mapping fors for default Azure Log Analytics workspace declare -A AzureCloudLocationToOmsRegionCodeMap=( - [australiasoutheast]=ASE - [australiaeast]=EAU - [australiacentral]=CAU - [canadacentral]=CCA - [centralindia]=CIN - [centralus]=CUS - [eastasia]=EA - [eastus]=EUS - [eastus2]=EUS2 - [eastus2euap]=EAP - [francecentral]=PAR - [japaneast]=EJP - [koreacentral]=SE - [northeurope]=NEU - [southcentralus]=SCUS - [southeastasia]=SEA - [uksouth]=SUK - [usgovvirginia]=USGV - [westcentralus]=EUS - [westeurope]=WEU - [westus]=WUS - [westus2]=WUS2 + [australiasoutheast]=ASE + [australiaeast]=EAU + [australiacentral]=CAU + [canadacentral]=CCA + [centralindia]=CIN + [centralus]=CUS + [eastasia]=EA + [eastus]=EUS + [eastus2]=EUS2 + [eastus2euap]=EAP + [francecentral]=PAR + [japaneast]=EJP + [koreacentral]=SE + [northeurope]=NEU + [southcentralus]=SCUS + [southeastasia]=SEA + [uksouth]=SUK + [usgovvirginia]=USGV + [westcentralus]=EUS + [westeurope]=WEU + [westus]=WUS + [westus2]=WUS2 ) declare -A AzureCloudRegionToOmsRegionMap=( - [australiacentral]=australiacentral - [australiacentral2]=australiacentral - [australiaeast]=australiaeast - [australiasoutheast]=australiasoutheast - [brazilsouth]=southcentralus - [canadacentral]=canadacentral - [canadaeast]=canadacentral - [centralus]=centralus - [centralindia]=centralindia - [eastasia]=eastasia - [eastus]=eastus - [eastus2]=eastus2 - [francecentral]=francecentral - [francesouth]=francecentral - [japaneast]=japaneast - [japanwest]=japaneast - [koreacentral]=koreacentral - [koreasouth]=koreacentral - [northcentralus]=eastus - [northeurope]=northeurope - [southafricanorth]=westeurope - [southafricawest]=westeurope - [southcentralus]=southcentralus - [southeastasia]=southeastasia - [southindia]=centralindia - [uksouth]=uksouth - [ukwest]=uksouth - [westcentralus]=eastus - [westeurope]=westeurope - [westindia]=centralindia - [westus]=westus - [westus2]=westus2 + [australiacentral]=australiacentral + [australiacentral2]=australiacentral + [australiaeast]=australiaeast + [australiasoutheast]=australiasoutheast + [brazilsouth]=southcentralus + [canadacentral]=canadacentral + [canadaeast]=canadacentral + [centralus]=centralus + [centralindia]=centralindia + [eastasia]=eastasia + [eastus]=eastus + [eastus2]=eastus2 + [francecentral]=francecentral + [francesouth]=francecentral + [japaneast]=japaneast + [japanwest]=japaneast + 
[koreacentral]=koreacentral + [koreasouth]=koreacentral + [northcentralus]=eastus + [northeurope]=northeurope + [southafricanorth]=westeurope + [southafricawest]=westeurope + [southcentralus]=southcentralus + [southeastasia]=southeastasia + [southindia]=centralindia + [uksouth]=uksouth + [ukwest]=uksouth + [westcentralus]=eastus + [westeurope]=westeurope + [westindia]=centralindia + [westus]=westus + [westus2]=westus2 ) - if [ -n "${AzureCloudRegionToOmsRegionMap[$clusterRegion]}" ]; - then + if [ -n "${AzureCloudRegionToOmsRegionMap[$clusterRegion]}" ]; then workspaceRegion=${AzureCloudRegionToOmsRegionMap[$clusterRegion]} fi echo "Workspace Region:"$workspaceRegion - if [ -n "${AzureCloudLocationToOmsRegionCodeMap[$workspaceRegion]}" ]; - then + if [ -n "${AzureCloudLocationToOmsRegionCodeMap[$workspaceRegion]}" ]; then workspaceRegionCode=${AzureCloudLocationToOmsRegionCodeMap[$workspaceRegion]} fi echo "Workspace Region Code:"$workspaceRegionCode @@ -423,30 +414,28 @@ create_default_log_analytics_workspace() isRGExists=$(az group exists -g $workspaceResourceGroup) workspaceName="DefaultWorkspace-"$subscriptionId"-"$workspaceRegionCode - if $isRGExists - then echo "using existing default resource group:"$workspaceResourceGroup + if $isRGExists; then + echo "using existing default resource group:"$workspaceResourceGroup else echo "creating resource group: $workspaceResourceGroup in region: $workspaceRegion" az group create -g $workspaceResourceGroup -l $workspaceRegion fi - workspaceList=$(az resource list -g $workspaceResourceGroup -n $workspaceName --resource-type $workspaceResourceProvider) - if [ "$workspaceList" = "[]" ]; - then - # create new default workspace since no mapped existing default workspace - echo '{"location":"'"$workspaceRegion"'", "properties":{"sku":{"name": "standalone"}}}' > WorkspaceProps.json - cat WorkspaceProps.json - workspace=$(az resource create -g $workspaceResourceGroup -n $workspaceName --resource-type $workspaceResourceProvider --is-full-object -p @WorkspaceProps.json) + workspaceList=$(az resource list -g $workspaceResourceGroup -n $workspaceName --resource-type $workspaceResourceProvider) + if [ "$workspaceList" = "[]" ]; then + # create new default workspace since no mapped existing default workspace + echo '{"location":"'"$workspaceRegion"'", "properties":{"sku":{"name": "standalone"}}}' >WorkspaceProps.json + cat WorkspaceProps.json + workspace=$(az resource create -g $workspaceResourceGroup -n $workspaceName --resource-type $workspaceResourceProvider --is-full-object -p @WorkspaceProps.json) else echo "using existing default workspace:"$workspaceName fi - workspaceResourceId=$(az resource show -g $workspaceResourceGroup -n $workspaceName --resource-type $workspaceResourceProvider --query id) + workspaceResourceId=$(az resource show -g $workspaceResourceGroup -n $workspaceName --resource-type $workspaceResourceProvider --query id) workspaceResourceId=$(echo $workspaceResourceId | tr -d '"') } -add_container_insights_solution() -{ +add_container_insights_solution() { local resourceId="$(echo ${1})" # extract resource group from workspace resource id @@ -456,10 +445,9 @@ add_container_insights_solution() solution=$(az deployment group create -g $resourceGroup --template-uri $solutionTemplateUri --parameters workspaceResourceId=$resourceId --parameters workspaceRegion=$workspaceRegion) } -get_workspace_guid_and_key() -{ +get_workspace_guid_and_key() { # extract resource parts from workspace resource id - local resourceId="$(echo ${1} | tr -d '"' )" + 
local resourceId="$(echo ${1} | tr -d '"')" local subId="$(echo ${resourceId} | cut -d'/' -f3)" local rgName="$(echo ${resourceId} | cut -d'/' -f5)" local wsName="$(echo ${resourceId} | cut -d'/' -f9)" @@ -474,11 +462,10 @@ get_workspace_guid_and_key() workspaceKey=$(echo $workspaceKey | tr -d '"') } -install_helm_chart() -{ +install_helm_chart() { - # get the config-context for ARO v4 cluster - if [ "$isAroV4Cluster" = true ] ; then + # get the config-context for ARO v4 cluster + if [ "$isAroV4Cluster" = true ]; then echo "getting config-context of ARO v4 cluster " echo "getting admin user creds for aro v4 cluster" adminUserName=$(az aro list-credentials -g $clusterResourceGroup -n $clusterName --query 'kubeadminUsername' -o tsv) @@ -490,83 +477,84 @@ install_helm_chart() oc new-project $openshiftProjectName echo "getting config-context of aro v4 cluster" kubeconfigContext=$(oc config current-context) - fi - - if [ -z "$kubeconfigContext" ]; then - echo "installing Azure Monitor for containers HELM chart on to the cluster and using current kube context ..." - else - echo "installing Azure Monitor for containers HELM chart on to the cluster with kubecontext:${kubeconfigContext} ..." - fi - - echo "getting the region of the cluster" - clusterRegion=$(az resource show --ids ${clusterResourceId} --query location -o tsv) - echo "cluster region is : ${clusterRegion}" - - echo "adding helm repo:" $helmRepoName - helm repo add $helmRepoName $helmRepoUrl - - echo "updating helm repo to get latest charts" - helm repo update - - if [ ! -z "$proxyEndpoint" ]; then - echo "using proxy endpoint since proxy configuration passed in" - if [ -z "$kubeconfigContext" ]; then - echo "using current kube-context since --kube-context/-k parameter not passed in" - helm upgrade --install azmon-containers-release-1 --set omsagent.proxy=$proxyEndpoint,omsagent.secret.wsid=$workspaceGuid,omsagent.secret.key=$workspaceKey,omsagent.env.clusterId=$clusterResourceId,omsagent.env.clusterRegion=$clusterRegion $helmRepoName/$helmChartName - else - echo "using --kube-context:${kubeconfigContext} since passed in" - helm upgrade --install azmon-containers-release-1 --set omsagent.proxy=$proxyEndpoint,omsagent.secret.wsid=$workspaceGuid,omsagent.secret.key=$workspaceKey,omsagent.env.clusterId=$clusterResourceId,omsagent.env.clusterRegion=$clusterRegion $helmRepoName/$helmChartName --kube-context ${kubeconfigContext} - fi - else - if [ -z "$kubeconfigContext" ]; then - echo "using current kube-context since --kube-context/-k parameter not passed in" - helm upgrade --install azmon-containers-release-1 --set omsagent.secret.wsid=$workspaceGuid,omsagent.secret.key=$workspaceKey,omsagent.env.clusterId=$clusterResourceId,omsagent.env.clusterRegion=$clusterRegion $helmRepoName/$helmChartName - else - echo "using --kube-context:${kubeconfigContext} since passed in" - helm upgrade --install azmon-containers-release-1 --set omsagent.secret.wsid=$workspaceGuid,omsagent.secret.key=$workspaceKey,omsagent.env.clusterId=$clusterResourceId,omsagent.env.clusterRegion=$clusterRegion $helmRepoName/$helmChartName --kube-context ${kubeconfigContext} - fi - fi - - echo "chart installation completed." + fi + + if [ -z "$kubeconfigContext" ]; then + echo "installing Azure Monitor for containers HELM chart on to the cluster and using current kube context ..." + else + echo "installing Azure Monitor for containers HELM chart on to the cluster with kubecontext:${kubeconfigContext} ..." 
+ fi + + echo "getting the region of the cluster" + clusterRegion=$(az resource show --ids ${clusterResourceId} --query location -o tsv) + echo "cluster region is : ${clusterRegion}" + + echo "pull the chart version ${mcrChartVersion} from ${mcr}/${mcrChartRepoPath}" + export HELM_EXPERIMENTAL_OCI=1 + helm chart pull $mcr/$mcrChartRepoPath:$mcrChartVersion + + echo "export the chart from local cache to current directory" + helm chart export $mcr/$mcrChartRepoPath:$mcrChartVersion --destination . + + helmChartRepoPath=$helmLocalRepoName/$helmChartName + + echo "helm chart repo path: ${helmChartRepoPath}" + + if [ ! -z "$proxyEndpoint" ]; then + echo "using proxy endpoint since proxy configuration passed in" + if [ -z "$kubeconfigContext" ]; then + echo "using current kube-context since --kube-context/-k parameter not passed in" + helm upgrade --install $releaseName --set omsagent.proxy=$proxyEndpoint,omsagent.secret.wsid=$workspaceGuid,omsagent.secret.key=$workspaceKey,omsagent.env.clusterId=$clusterResourceId,omsagent.env.clusterRegion=$clusterRegion $helmChartRepoPath + else + echo "using --kube-context:${kubeconfigContext} since passed in" + helm upgrade --install $releaseName --set omsagent.proxy=$proxyEndpoint,omsagent.secret.wsid=$workspaceGuid,omsagent.secret.key=$workspaceKey,omsagent.env.clusterId=$clusterResourceId,omsagent.env.clusterRegion=$clusterRegion $helmChartRepoPath --kube-context ${kubeconfigContext} + fi + else + if [ -z "$kubeconfigContext" ]; then + echo "using current kube-context since --kube-context/-k parameter not passed in" + helm upgrade --install $releaseName --set omsagent.secret.wsid=$workspaceGuid,omsagent.secret.key=$workspaceKey,omsagent.env.clusterId=$clusterResourceId,omsagent.env.clusterRegion=$clusterRegion $helmChartRepoPath + else + echo "using --kube-context:${kubeconfigContext} since passed in" + helm upgrade --install $releaseName --set omsagent.secret.wsid=$workspaceGuid,omsagent.secret.key=$workspaceKey,omsagent.env.clusterId=$clusterResourceId,omsagent.env.clusterRegion=$clusterRegion $helmChartRepoPath --kube-context ${kubeconfigContext} + fi + fi + + echo "chart installation completed." 
}

-login_to_azure()
-{
-    if [ "$isUsingServicePrincipal" = true ] ; then
-        echo "login to the azure using provided service principal creds"
-        az login --service-principal --username $servicePrincipalClientId --password $servicePrincipalClientSecret --tenant $servicePrincipalTenantId
+login_to_azure() {
+    if [ "$isUsingServicePrincipal" = true ]; then
+        echo "login to the azure using provided service principal creds"
+        az login --service-principal --username $servicePrincipalClientId --password $servicePrincipalClientSecret --tenant $servicePrincipalTenantId
     else
         echo "login to the azure interactively"
         az login --use-device-code
     fi
}

-set_azure_subscription()
-{
-    local subscriptionId="$(echo ${1})"
-    echo "setting the subscription id: ${subscriptionId} as current subscription for the azure cli"
-    az account set -s ${subscriptionId}
-    echo "successfully configured subscription id: ${subscriptionId} as current subscription for the azure cli"
+set_azure_subscription() {
+    local subscriptionId="$(echo ${1})"
+    echo "setting the subscription id: ${subscriptionId} as current subscription for the azure cli"
+    az account set -s ${subscriptionId}
+    echo "successfully configured subscription id: ${subscriptionId} as current subscription for the azure cli"
}

-attach_monitoring_tags()
-{
+attach_monitoring_tags() {
     echo "attach loganalyticsworkspaceResourceId tag on to cluster resource"
-    status=$(az resource update --set tags.logAnalyticsWorkspaceResourceId=$workspaceResourceId -g $clusterResourceGroup -n $clusterName --resource-type $resourceProvider)
+    status=$(az resource update --set tags.logAnalyticsWorkspaceResourceId=$workspaceResourceId -g $clusterResourceGroup -n $clusterName --resource-type $resourceProvider)
     echo "$status"
     echo "successfully attached logAnalyticsWorkspaceResourceId tag on the cluster resource"
}

# enables aks monitoring addon for private preview; don't use this for aks prod
-enable_aks_monitoring_addon()
-{
-    echo "getting cluster object"
-    clusterGetResponse=$(az rest --method get --uri $clusterResourceId?api-version=2020-03-01)
-    export jqquery=".properties.addonProfiles.omsagent.config.logAnalyticsWorkspaceResourceID=\"$workspaceResourceId\""
-    echo $clusterGetResponse | jq $jqquery > putrequestbody.json
-    status=$(az rest --method put --uri $clusterResourceId?api-version=2020-03-01 --body @putrequestbody.json --headers Content-Type=application/json)
-    echo "status after enabling of aks monitoringa addon:$status"
+enable_aks_monitoring_addon() {
+    echo "getting cluster object"
+    clusterGetResponse=$(az rest --method get --uri $clusterResourceId?api-version=2020-03-01)
+    export jqquery=".properties.addonProfiles.omsagent.config.logAnalyticsWorkspaceResourceID=\"$workspaceResourceId\""
+    echo $clusterGetResponse | jq $jqquery >putrequestbody.json
+    status=$(az rest --method put --uri $clusterResourceId?api-version=2020-03-01 --body @putrequestbody.json --headers Content-Type=application/json)
+    echo "status after enabling of the aks monitoring addon:$status"
}

# parse and validate args
@@ -587,9 +575,9 @@ login_to_azure

# set the cluster subscription id as active sub for azure cli
set_azure_subscription $clusterSubscriptionId

-# validate cluster identity if its ARC k8s cluster
-if [ "$isArcK8sCluster" = true ] ; then
-    validate_cluster_identity $clusterResourceGroup $clusterName
+# validate cluster identity if it's an Azure Arc enabled Kubernetes cluster
+if [ "$isArcK8sCluster" = true ]; then
+    validate_cluster_identity $clusterResourceGroup $clusterName
fi

if [ -z
$workspaceResourceId ]; then
@@ -598,7 +586,7 @@ if [ -z $workspaceResourceId ]; then
else
    echo "using provided azure log analytics workspace:${workspaceResourceId}"
    workspaceResourceId=$(echo $workspaceResourceId | tr -d '"')
-   workspaceSubscriptionId="$(echo ${workspaceResourceId} | cut -d'/' -f3 | tr "[:upper:]" "[:lower:]" )"
+   workspaceSubscriptionId="$(echo ${workspaceResourceId} | cut -d'/' -f3 | tr "[:upper:]" "[:lower:]")"
    workspaceResourceGroup="$(echo ${workspaceResourceId} | cut -d'/' -f5)"
    workspaceName="$(echo ${workspaceResourceId} | cut -d'/' -f9)"
@@ -620,13 +608,13 @@ add_container_insights_solution $workspaceResourceId
# get workspace guid and key
get_workspace_guid_and_key $workspaceResourceId

-if [ "$isClusterAndWorkspaceInSameSubscription" = false ] ; then
+if [ "$isClusterAndWorkspaceInSameSubscription" = false ]; then
    echo "switch to cluster subscription id as active subscription for cli: ${clusterSubscriptionId}"
    set_azure_subscription $clusterSubscriptionId
fi

# attach monitoring tags on to cluster resource
-if [ "$isAksCluster" = true ] ; then
+if [ "$isAksCluster" = true ]; then
    enable_aks_monitoring_addon
else
    attach_monitoring_tags
diff --git a/scripts/onboarding/managed/upgrade-monitoring.sh b/scripts/onboarding/managed/upgrade-monitoring.sh
new file mode 100644
index 000000000..8a12b2f02
--- /dev/null
+++ b/scripts/onboarding/managed/upgrade-monitoring.sh
@@ -0,0 +1,314 @@
+#!/bin/bash
+#
+# Execute this directly in Azure Cloud Shell (https://shell.azure.com) by pasting (SHIFT+INS on Windows, CTRL+V on Mac or Linux)
+# the following line (beginning with curl...) at the command prompt and then replacing the args:
+# This script upgrades the existing Azure Monitor for containers release on an Azure Arc enabled Kubernetes cluster
+#
+# 1. Upgrades the existing Azure Monitor for containers release on the K8s cluster provided via --kube-context
+# Prerequisites :
+#     Azure CLI: https://docs.microsoft.com/en-us/cli/azure/install-azure-cli?view=azure-cli-latest
+#     Helm3 : https://helm.sh/docs/intro/install/

+# download script
+# curl -o upgrade-monitoring.sh -L https://aka.ms/upgrade-monitoring-bash-script
+# 1. Using Service Principal for Azure Login
+## bash upgrade-monitoring.sh --resource-id <clusterResourceId> --client-id <clientId> --client-secret <clientSecret> --tenant-id <tenantId>
+# 2. Using Interactive device login
+# bash upgrade-monitoring.sh --resource-id <clusterResourceId>
+
+set -e
+set -o pipefail
+
+# released chart version for Azure Arc enabled Kubernetes public preview
+mcrChartVersion="2.7.6"
+mcr="mcr.microsoft.com"
+mcrChartRepoPath="azuremonitor/containerinsights/preview/azuremonitor-containers"
+
+# default to public cloud since only supported cloud is azure public cloud
+defaultAzureCloud="AzureCloud"
+helmLocalRepoName="."
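+
+# For reference, the defaults above feed the OCI-based chart fetch used later in
+# upgrade_helm_chart_release (a minimal sketch, assuming helm 3 with the
+# experimental OCI support enabled):
+#   export HELM_EXPERIMENTAL_OCI=1
+#   helm chart pull ${mcr}/${mcrChartRepoPath}:${mcrChartVersion}
+#   helm chart export ${mcr}/${mcrChartRepoPath}:${mcrChartVersion} --destination .
+# which exports ./azuremonitor-containers into the working directory for helm upgrade.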
+helmChartName="azuremonitor-containers"
+
+# default release name used during onboarding
+releaseName="azmon-containers-release-1"
+
+# resource provider for azure arc connected cluster
+arcK8sResourceProvider="Microsoft.Kubernetes/connectedClusters"
+
+# default of resourceProvider is Azure Arc enabled Kubernetes and this will get updated based on the provider cluster resource
+resourceProvider="Microsoft.Kubernetes/connectedClusters"
+
+# Azure Arc enabled Kubernetes cluster resource
+isArcK8sCluster=false
+
+# openshift project name for aro v4 cluster
+openshiftProjectName="azure-monitor-for-containers"
+
+# ARO v4 cluster resource
+isAroV4Cluster=false
+
+# default global params
+clusterResourceId=""
+kubeconfigContext=""
+
+# default workspace region and code
+workspaceRegion="eastus"
+workspaceRegionCode="EUS"
+workspaceResourceGroup="DefaultResourceGroup-"$workspaceRegionCode
+
+# default workspace guid and key
+workspaceGuid=""
+workspaceKey=""
+
+# sp details for the login if provided
+servicePrincipalClientId=""
+servicePrincipalClientSecret=""
+servicePrincipalTenantId=""
+isUsingServicePrincipal=false
+
+usage() {
+    local basename=$(basename $0)
+    echo
+    echo "Upgrade Azure Monitor for containers:"
+    echo "$basename --resource-id <clusterResourceId> [--client-id <clientId>] [--client-secret <clientSecret>] [--tenant-id <tenantId>] [--kube-context <kube-context>]"
+}
+
+parse_args() {
+
+    if [ $# -le 1 ]; then
+        usage
+        exit 1
+    fi
+
+    # Transform long options to short ones
+    for arg in "$@"; do
+        shift
+        case "$arg" in
+        "--resource-id") set -- "$@" "-r" ;;
+        "--kube-context") set -- "$@" "-k" ;;
+        "--client-id") set -- "$@" "-c" ;;
+        "--client-secret") set -- "$@" "-s" ;;
+        "--tenant-id") set -- "$@" "-t" ;;
+        "--"*) usage ;;
+        *) set -- "$@" "$arg" ;;
+        esac
+    done
+
+    local OPTIND opt
+
+    while getopts 'hk:r:c:s:t:' opt; do
+        case "$opt" in
+        h)
+            usage
+            ;;
+
+        k)
+            kubeconfigContext="$OPTARG"
+            echo "name of kube-context is $OPTARG"
+            ;;
+
+        r)
+            clusterResourceId="$OPTARG"
+            echo "clusterResourceId is $OPTARG"
+            ;;
+
+        c)
+            servicePrincipalClientId="$OPTARG"
+            echo "servicePrincipalClientId is $OPTARG"
+            ;;
+
+        s)
+            servicePrincipalClientSecret="$OPTARG"
+            echo "clientSecret is *****"
+            ;;
+
+        t)
+            servicePrincipalTenantId="$OPTARG"
+            echo "service principal tenantId is $OPTARG"
+            ;;
+
+        ?)
+            usage
+            exit 1
+            ;;
+        esac
+    done
+    shift "$(($OPTIND - 1))"
+
+    local subscriptionId="$(echo ${clusterResourceId} | cut -d'/' -f3)"
+    local resourceGroup="$(echo ${clusterResourceId} | cut -d'/' -f5)"
+
+    # get resource parts and join back to get the provider name
+    local providerNameResourcePart1="$(echo ${clusterResourceId} | cut -d'/' -f7)"
+    local providerNameResourcePart2="$(echo ${clusterResourceId} | cut -d'/' -f8)"
+    local providerName="$(echo ${providerNameResourcePart1}/${providerNameResourcePart2})"
+
+    local clusterName="$(echo ${clusterResourceId} | cut -d'/' -f9)"
+
+    # convert to lowercase for validation
+    providerName=$(echo $providerName | tr "[:upper:]" "[:lower:]")
+
+    echo "cluster SubscriptionId:" $subscriptionId
+    echo "cluster ResourceGroup:" $resourceGroup
+    echo "cluster ProviderName:" $providerName
+    echo "cluster Name:" $clusterName
+
+    if [ -z "$subscriptionId" -o -z "$resourceGroup" -o -z "$providerName" -o -z "$clusterName" ]; then
+        echo "-e invalid cluster resource id. Please try with valid fully qualified resource id of the cluster"
+        exit 1
+    fi
+
+    if [[ $providerName != microsoft.* ]]; then
+        echo "-e invalid azure cluster resource id format."
+        exit 1
+    fi
+
+    # detect the resource provider from the provider name in the cluster resource id
+    if [ $providerName = "microsoft.kubernetes/connectedclusters" ]; then
+        echo "provider cluster resource is of Azure Arc enabled Kubernetes cluster type"
+        isArcK8sCluster=true
+        resourceProvider=$arcK8sResourceProvider
+    elif [ $providerName = "microsoft.redhatopenshift/openshiftclusters" ]; then
+        echo "provider cluster resource is of AROv4 cluster type"
+        resourceProvider=$aroV4ResourceProvider
+        isAroV4Cluster=true
+    elif [ $providerName = "microsoft.containerservice/managedclusters" ]; then
+        echo "provider cluster resource is of AKS cluster type"
+        isAksCluster=true
+        resourceProvider=$aksResourceProvider
+    else
+        echo "-e unsupported azure managed cluster type"
+        exit 1
+    fi
+
+    if [ -z "$kubeconfigContext" ]; then
+        echo "using or getting current kube config context since --kube-context parameter not set"
+    fi
+
+    if [ ! -z "$servicePrincipalClientId" -a ! -z "$servicePrincipalClientSecret" -a ! -z "$servicePrincipalTenantId" ]; then
+        echo "using service principal creds (clientId, secret and tenantId) for azure login since provided"
+        isUsingServicePrincipal=true
+    fi
+}
+
+configure_to_public_cloud() {
+    echo "Set AzureCloud as active cloud for az cli"
+    az cloud set -n $defaultAzureCloud
+}
+
+validate_cluster_identity() {
+    echo "validating cluster identity"
+
+    local rgName="$(echo ${1})"
+    local clusterName="$(echo ${2})"
+
+    local identitytype=$(az resource show -g ${rgName} -n ${clusterName} --resource-type $resourceProvider --query identity.type)
+    identitytype=$(echo $identitytype | tr "[:upper:]" "[:lower:]" | tr -d '"')
+    echo "cluster identity type:" $identitytype
+
+    if [[ "$identitytype" != "systemassigned" ]]; then
+        echo "-e only supported cluster identity is systemassigned for Azure Arc enabled Kubernetes cluster type"
+        exit 1
+    fi
+
+    echo "successfully validated the identity of the cluster"
+}
+
+validate_monitoring_tags() {
+    echo "get loganalyticsworkspaceResourceId tag on the cluster resource"
+    logAnalyticsWorkspaceResourceIdTag=$(az resource show --query tags.logAnalyticsWorkspaceResourceId -g $clusterResourceGroup -n $clusterName --resource-type $resourceProvider)
+    echo "configured log analytics workspace: ${logAnalyticsWorkspaceResourceIdTag}"
+    echo "successfully got logAnalyticsWorkspaceResourceId tag on the cluster resource"
+    if [ -z "$logAnalyticsWorkspaceResourceIdTag" ]; then
+        echo "-e logAnalyticsWorkspaceResourceId doesn't exist on this cluster, which indicates the cluster is not enabled for monitoring"
+        exit 1
+    fi
+}
+
+
+upgrade_helm_chart_release() {
+
+    # get the config-context for ARO v4 cluster
+    if [ "$isAroV4Cluster" = true ]; then
+        echo "getting config-context of ARO v4 cluster "
+        echo "getting admin user creds for aro v4 cluster"
+        adminUserName=$(az aro list-credentials -g $clusterResourceGroup -n $clusterName --query 'kubeadminUsername' -o tsv)
+        adminPassword=$(az aro list-credentials -g $clusterResourceGroup -n $clusterName --query 'kubeadminPassword' -o tsv)
+        apiServer=$(az aro show -g $clusterResourceGroup -n $clusterName --query apiserverProfile.url -o tsv)
+        echo "login to the cluster via oc login"
+        oc login $apiServer -u $adminUserName -p $adminPassword
+        echo "creating project azure-monitor-for-containers"
+        oc new-project $openshiftProjectName
+        echo "getting config-context of aro v4 cluster"
+        kubeconfigContext=$(oc config current-context)
+    fi
+
+    if [ -z "$kubeconfigContext" ]; then
+        echo "installing Azure Monitor for
containers HELM chart on to the cluster and using current kube context ..." + else + echo "installing Azure Monitor for containers HELM chart on to the cluster with kubecontext:${kubeconfigContext} ..." + fi + + export HELM_EXPERIMENTAL_OCI=1 + + echo "pull the chart from ${mcr}/${mcrChartRepoPath}:${mcrChartVersion}" + helm chart pull ${mcr}/${mcrChartRepoPath}:${mcrChartVersion} + + echo "export the chart from local cache to current directory" + helm chart export ${mcr}/${mcrChartRepoPath}:${mcrChartVersion} --destination . + + helmChartRepoPath=$helmLocalRepoName/$helmChartName + + echo "upgrading the release: $releaseName to chart version : ${mcrChartVersion}" + helm get values $releaseName -o yaml | helm upgrade --install $releaseName $helmChartRepoPath -f - + echo "$releaseName got upgraded successfully." +} + +login_to_azure() { + if [ "$isUsingServicePrincipal" = true ]; then + echo "login to the azure using provided service principal creds" + az login --service-principal --username $servicePrincipalClientId --password $servicePrincipalClientSecret --tenant $servicePrincipalTenantId + else + echo "login to the azure interactively" + az login --use-device-code + fi +} + +set_azure_subscription() { + local subscriptionId="$(echo ${1})" + echo "setting the subscription id: ${subscriptionId} as current subscription for the azure cli" + az account set -s ${subscriptionId} + echo "successfully configured subscription id: ${subscriptionId} as current subscription for the azure cli" +} + +# parse and validate args +parse_args $@ + +# configure azure cli for public cloud +configure_to_public_cloud + +# parse cluster resource id +clusterSubscriptionId="$(echo $clusterResourceId | cut -d'/' -f3 | tr "[:upper:]" "[:lower:]")" +clusterResourceGroup="$(echo $clusterResourceId | cut -d'/' -f5)" +providerName="$(echo $clusterResourceId | cut -d'/' -f7)" +clusterName="$(echo $clusterResourceId | cut -d'/' -f9)" + +# login to azure +login_to_azure + +# set the cluster subscription id as active sub for azure cli +set_azure_subscription $clusterSubscriptionId + +# validate cluster identity if its Azure Arc enabled Kubernetes cluster +if [ "$isArcK8sCluster" = true ]; then + validate_cluster_identity $clusterResourceGroup $clusterName +fi + +# validate the cluster has monitoring tags +validate_monitoring_tags + +# upgrade helm chart release +upgrade_helm_chart_release + +# portal link +echo "Proceed to https://aka.ms/azmon-containers to view health of your newly onboarded cluster" From e6dad8354e38efc1fdd9eafbb269aa9d9e26fefd Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Thu, 1 Oct 2020 14:08:31 -0700 Subject: [PATCH 030/194] Install CA certs from wireserver (#451) --- kubernetes/windows/main.ps1 | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/kubernetes/windows/main.ps1 b/kubernetes/windows/main.ps1 index de82722ad..2e8659601 100644 --- a/kubernetes/windows/main.ps1 +++ b/kubernetes/windows/main.ps1 @@ -263,6 +263,27 @@ function Generate-Certificates { C:\\opt\\omsagentwindows\\certgenerator\\certificategenerator.exe } +function Bootstrap-CACertificates { + try { + # This is required when the root CA certs are different for some clouds. 
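+        # 168.63.129.16 is the well-known Azure wireserver/host endpoint, reachable
+        # only from inside an Azure VM; the acmspackage/cacertificates query returns
+        # JSON whose Certificates array carries Name/CertBody pairs, parsed below.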
+        $caCerts=Invoke-WebRequest 'http://168.63.129.16/machine?comp=acmspackage&type=cacertificates&ext=json' -UseBasicParsing | ConvertFrom-Json
+        if (![string]::IsNullOrEmpty($caCerts)) {
+            $certificates = $caCerts.Certificates
+            for ($index = 0; $index -lt $certificates.Length ; $index++) {
+                $name=$certificates[$index].Name
+                $certificates[$index].CertBody > $name
+                Write-Host "name: $($name)"
+                Import-Certificate -FilePath .\$name -CertStoreLocation 'Cert:\LocalMachine\Root' -Verbose
+            }
+        }
+    }
+    catch {
+        $e = $_.Exception
+        Write-Host $e
+        Write-Host "exception occurred in Bootstrap-CACertificates..."
+    }
+}
+
 function Test-CertificatePath {
     $certLocation = $env:CI_CERT_LOCATION
     $keyLocation = $env:CI_KEY_LOCATION
@@ -288,6 +309,14 @@ Start-Transcript -Path main.txt
 Remove-WindowsServiceIfItExists "fluentdwinaks"
 Set-EnvironmentVariables
 Start-FileSystemWatcher
+
+#Bootstrapping CA certs for non-public clouds and AKS clusters
+$aksResourceId = [System.Environment]::GetEnvironmentVariable("AKS_RESOURCE_ID")
+if (![string]::IsNullOrEmpty($aksResourceId) -and $aksResourceId.ToLower().Contains("/microsoft.containerservice/managedclusters/"))
+{
+    Bootstrap-CACertificates
+}
+
 Generate-Certificates
 Test-CertificatePath
 Start-Fluent

From 23397edf3764870dde9d7f4eef10f0842ae5adc6 Mon Sep 17 00:00:00 2001
From: Grace Wehner
Date: Thu, 1 Oct 2020 16:14:49 -0700
Subject: [PATCH 031/194] grwehner/pv-volume-name-in-mdm (#452)

Add volume name for PV to mdm dimensions and zero fill it
---
 source/plugins/ruby/MdmAlertTemplates.rb   | 2 ++
 source/plugins/ruby/MdmMetricsGenerator.rb | 3 +++
 source/plugins/ruby/constants.rb           | 1 +
 3 files changed, 6 insertions(+)

diff --git a/source/plugins/ruby/MdmAlertTemplates.rb b/source/plugins/ruby/MdmAlertTemplates.rb
index d5107fea1..ef63cf219 100644
--- a/source/plugins/ruby/MdmAlertTemplates.rb
+++ b/source/plugins/ruby/MdmAlertTemplates.rb
@@ -101,6 +101,7 @@ class MdmAlertTemplates
         "podName",
         "node",
         "kubernetesNamespace",
+        "volumeName",
         "thresholdPercentage"
       ],
       "series": [
@@ -109,6 +110,7 @@ class MdmAlertTemplates
         "%{podNameDimValue}",
         "%{computerNameDimValue}",
         "%{namespaceDimValue}",
+        "%{volumeNameDimValue}",
         "%{thresholdPercentageDimValue}"
       ],
       "min": %{pvResourceUtilizationPercentage},
diff --git a/source/plugins/ruby/MdmMetricsGenerator.rb b/source/plugins/ruby/MdmMetricsGenerator.rb
index b8104212d..12d462e44 100644
--- a/source/plugins/ruby/MdmMetricsGenerator.rb
+++ b/source/plugins/ruby/MdmMetricsGenerator.rb
@@ -186,6 +186,7 @@ def zeroFillMetricRecords(records, batch_time)
       pvZeroFillDims = {}
       pvZeroFillDims[Constants::INSIGHTSMETRICS_TAGS_PVC_NAMESPACE] = Constants::KUBESYSTEM_NAMESPACE_ZERO_FILL
       pvZeroFillDims[Constants::INSIGHTSMETRICS_TAGS_POD_NAME] = Constants::OMSAGENT_ZERO_FILL
+      pvZeroFillDims[Constants::INSIGHTSMETRICS_TAGS_VOLUME_NAME] = Constants::VOLUME_NAME_ZERO_FILL
       pvResourceUtilMetricRecord = getPVResourceUtilMetricRecords(batch_time,
                                                                   Constants::PV_USED_BYTES,
                                                                   @@hostName,
@@ -289,6 +290,7 @@ def getPVResourceUtilMetricRecords(recordTimeStamp, metricName, computer, percen
       pvcNamespace = dims[Constants::INSIGHTSMETRICS_TAGS_PVC_NAMESPACE]
       podName = dims[Constants::INSIGHTSMETRICS_TAGS_POD_NAME]
       podUid = dims[Constants::INSIGHTSMETRICS_TAGS_POD_UID]
+      volumeName = dims[Constants::INSIGHTSMETRICS_TAGS_VOLUME_NAME]
       resourceUtilRecord = MdmAlertTemplates::PV_resource_utilization_template % {
         timestamp: recordTimeStamp,
@@ -296,6 +298,7 @@ def getPVResourceUtilMetricRecords(recordTimeStamp, metricName, computer, percen
         podNameDimValue:
podName, computerNameDimValue: computer, namespaceDimValue: pvcNamespace, + volumeNameDimValue: volumeName, pvResourceUtilizationPercentage: percentageMetricValue, thresholdPercentageDimValue: thresholdPercentage, } diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index be1a9de64..35e5f9334 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -76,6 +76,7 @@ class Constants TELEGRAF_DISK_METRICS = "container.azm.ms/disk" OMSAGENT_ZERO_FILL = "omsagent" KUBESYSTEM_NAMESPACE_ZERO_FILL = "kube-system" + VOLUME_NAME_ZERO_FILL = "-" #Telemetry constants CONTAINER_METRICS_HEART_BEAT_EVENT = "ContainerMetricsMdmHeartBeatEvent" From 7562a96696cb4882f8387ba405b8a0f0145b00ad Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Mon, 5 Oct 2020 13:57:01 -0700 Subject: [PATCH 032/194] Release changes for 10052020 release (#453) * Release changes for 10052020 release * remove redundant kubelet metrics as part of PR feedback --- ReleaseNotes.md | 18 ++++++++++++++++++ build/version | 6 +++--- charts/azuremonitor-containers/Chart.yaml | 2 +- charts/azuremonitor-containers/values.yaml | 6 +++--- kubernetes/linux/Dockerfile | 2 +- kubernetes/omsagent.yaml | 12 ++++++------ kubernetes/windows/Dockerfile | 2 +- 7 files changed, 33 insertions(+), 15 deletions(-) diff --git a/ReleaseNotes.md b/ReleaseNotes.md index 499c99f02..e1892d083 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -11,6 +11,24 @@ additional questions or comments. Note : The agent version(s) below has dates (ciprod), which indicate the agent build dates (not release dates) +### 10/05/2020 - +##### Version microsoft/oms:ciprod10052020 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10052020 (linux) +##### Version microsoft/oms:win-ciprod10052020 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod10052020 (windows) +##### Code change log +- Health CRD to version v1 (from v1beta1) for k8s versions >= 1.19.0 +- Collection of PV usage metrics for PVs mounted by pods (kube-system pods excluded by default)(doc-link-needed) +- Zero fill few custom metrics under a timer, also add zero filling for new PV usage metrics +- Collection of additional Kubelet metrics ('kubelet_running_pod_count','volume_manager_total_volumes','kubelet_node_config_error','process_resident_memory_bytes','process_cpu_seconds_total','kubelet_runtime_operations_total','kubelet_runtime_operations_errors_total'). This also includes updates to 'kubelet' workbook to include these new metrics +- Collection of Azure NPM (Network Policy Manager) metrics (basic & advanced. By default, NPM metrics collection is turned OFF)(doc-link-needed) +- Support log collection when docker root is changed with knode. 
Tracked by [this](https://github.com/Azure/AKS/issues/1373) issue
+- Support for Pods in 'Terminating' state for nodelost scenarios
+- Fix for reduction in telemetry for custom metrics ingestion failures
+- Fix CPU capacity/limits metrics being 0 for Virtual nodes (VK)
+- Add new custom metric regions (eastus2,westus,australiasoutheast,brazilsouth,germanywestcentral,northcentralus,switzerlandnorth)
+- Enable strict SSL validation for AppInsights Ruby SDK
+- Turn off custom metrics upload for unsupported cluster types
+- Install CA certs from wire server for windows (in certain clouds)
+
 ### 09/16/2020 -
 > Note: This agent release is targeted ONLY at non-AKS clusters via Azure Monitor for containers HELM chart update
 ##### Version microsoft/oms:ciprod09162020 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod09162020 (linux)
diff --git a/build/version b/build/version
index b53b0dcfb..9587328de 100644
--- a/build/version
+++ b/build/version
@@ -3,10 +3,10 @@
 # Build Version Information

 CONTAINER_BUILDVERSION_MAJOR=10
-CONTAINER_BUILDVERSION_MINOR=0
+CONTAINER_BUILDVERSION_MINOR=1
 CONTAINER_BUILDVERSION_PATCH=0
-CONTAINER_BUILDVERSION_BUILDNR=5
-CONTAINER_BUILDVERSION_DATE=20200916
+CONTAINER_BUILDVERSION_BUILDNR=0
+CONTAINER_BUILDVERSION_DATE=20201005
 CONTAINER_BUILDVERSION_STATUS=Developer_Build

 #-------------------------------- End of File -----------------------------------
diff --git a/charts/azuremonitor-containers/Chart.yaml b/charts/azuremonitor-containers/Chart.yaml
index 1d3fed86f..6d45b05d8 100644
--- a/charts/azuremonitor-containers/Chart.yaml
+++ b/charts/azuremonitor-containers/Chart.yaml
@@ -2,7 +2,7 @@ apiVersion: v1
 appVersion: 7.0.0-1
 description: Helm chart for deploying Azure Monitor container monitoring agent in Kubernetes
 name: azuremonitor-containers
-version: 2.7.6
+version: 2.7.7
 kubeVersion: "^1.10.0-0"
 keywords:
   - monitoring
diff --git a/charts/azuremonitor-containers/values.yaml b/charts/azuremonitor-containers/values.yaml
index 2711cb372..f841dc5d7 100644
--- a/charts/azuremonitor-containers/values.yaml
+++ b/charts/azuremonitor-containers/values.yaml
@@ -12,10 +12,10 @@ Azure:
 omsagent:
   image:
     repo: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod"
-    tag: "ciprod09252020"
-    tagWindows: "win-ciprod09252020"
+    tag: "ciprod10052020"
+    tagWindows: "win-ciprod10052020"
     pullPolicy: IfNotPresent
-    dockerProviderVersion: "10.0.0-6"
+    dockerProviderVersion: "10.1.0-0"
     agentVersion: "1.10.0.1"
   ## To get your workspace id and key do the following
   ## You can create an Azure Log Analytics workspace from portal.azure.com and get its ID & PRIMARY KEY from 'Advanced Settings' tab in the Ux.
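For reference, a minimal sketch of pinning the bumped image tags at upgrade time; the release name, chart path, and `--set` keys are taken from the onboarding scripts and values.yaml above, and it assumes the chart has been exported locally (as those scripts do) and that the workspace and cluster variables are already populated:

```bash
# Hedged sketch, not part of the patch: override the image tags from values.yaml.
helm upgrade --install azmon-containers-release-1 \
  --set omsagent.image.tag=ciprod10052020,omsagent.image.tagWindows=win-ciprod10052020 \
  --set omsagent.secret.wsid=$workspaceGuid,omsagent.secret.key=$workspaceKey \
  --set omsagent.env.clusterId=$clusterResourceId,omsagent.env.clusterRegion=$clusterRegion \
  ./azuremonitor-containers
```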
diff --git a/kubernetes/linux/Dockerfile b/kubernetes/linux/Dockerfile index ee35cd556..f4324a18a 100644 --- a/kubernetes/linux/Dockerfile +++ b/kubernetes/linux/Dockerfile @@ -2,7 +2,7 @@ FROM ubuntu:18.04 MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="Azure Monitor for containers" -ARG IMAGE_TAG=ciprod09162020 +ARG IMAGE_TAG=ciprod10052020 ENV AGENT_VERSION ${IMAGE_TAG} ENV tmpdir /opt ENV APPLICATIONINSIGHTS_AUTH NzAwZGM5OGYtYTdhZC00NThkLWI5NWMtMjA3ZjM3NmM3YmRi diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index e8352e020..18bc203d4 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -337,13 +337,13 @@ spec: tier: node annotations: agentVersion: "1.10.0.1" - dockerProviderVersion: "10.0.0-5" + dockerProviderVersion: "10.1.0-0" schema-versions: "v1" spec: serviceAccountName: omsagent containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod09162020" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10052020" imagePullPolicy: IfNotPresent resources: limits: @@ -494,13 +494,13 @@ spec: rsName: "omsagent-rs" annotations: agentVersion: "1.10.0.1" - dockerProviderVersion: "10.0.0-5" + dockerProviderVersion: "10.1.0-0" schema-versions: "v1" spec: serviceAccountName: omsagent containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod09162020" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10052020" imagePullPolicy: IfNotPresent resources: limits: @@ -640,13 +640,13 @@ spec: tier: node-win annotations: agentVersion: "1.10.0.1" - dockerProviderVersion: "10.0.0-5" + dockerProviderVersion: "10.1.0-0" schema-versions: "v1" spec: serviceAccountName: omsagent containers: - name: omsagent-win - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod09162020" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod10052020" imagePullPolicy: IfNotPresent resources: limits: diff --git a/kubernetes/windows/Dockerfile b/kubernetes/windows/Dockerfile index ca89d1c80..c7dee60af 100644 --- a/kubernetes/windows/Dockerfile +++ b/kubernetes/windows/Dockerfile @@ -3,7 +3,7 @@ MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="Azure Monitor for containers" -ARG IMAGE_TAG=win-ciprod09162020 +ARG IMAGE_TAG=win-ciprod10052020 # Do not split this into multiple RUN! # Docker creates a layer for every RUN-Statement From 4b47f44491a77d7321cbbba6e5d2941326b06159 Mon Sep 17 00:00:00 2001 From: saaror <31900410+saaror@users.noreply.github.com> Date: Mon, 12 Oct 2020 16:49:16 -0700 Subject: [PATCH 033/194] Update onboarding_instructions.md (#456) * Update onboarding_instructions.md Updated the documentation to reflect where to update the config map. * Update onboarding_instructions.md * Update onboarding_instructions.md * Update onboarding_instructions.md Updated the link --- Health/onboarding_instructions.md | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/Health/onboarding_instructions.md b/Health/onboarding_instructions.md index 9c07b2167..4c83577b5 100644 --- a/Health/onboarding_instructions.md +++ b/Health/onboarding_instructions.md @@ -6,12 +6,28 @@ For on-boarding to Health(Tab), you would need to complete two steps ## Configure agent through ConfigMap -1. Include the following section in ConfigMap yaml file -```cmd:agent-settings: |- - [agent_settings.health_model] +1. 
If you are configuring your existing ConfigMap, append the following section to your existing ConfigMap yaml file:
+```
+#Append this section in your existing configmap
+agent-settings: |-
+    # agent health model feature settings
+    [agent_settings.health_model]
+    # In the absence of this configmap, default value for enabled is false
+    enabled = true
+```
+2. Otherwise, if you don't have a ConfigMap, download the new ConfigMap from [here](https://github.com/microsoft/Docker-Provider/blob/ci_prod/kubernetes/container-azm-ms-agentconfig.yaml) and then set `enabled = true`
+
+```
+# For the newly downloaded configmap, set this default setting to true
+agent-settings: |-
+    # agent health model feature settings
+    [agent_settings.health_model]
+    # In the absence of this configmap, default value for enabled is false
+    enabled = true
+```
+
+
+3. Run the following kubectl command: `kubectl apply -f <configmap_yaml_file>` Example: `kubectl apply -f container-azm-ms-agentconfig.yaml`.

From 3f86b23523da9082e1a36faec00af992994622cb Mon Sep 17 00:00:00 2001
From: Ganga Mahesh Siddem
Date: Mon, 19 Oct 2020 12:42:22 -0700
Subject: [PATCH 034/194] chart update for sept2020 release (#457)

---
 scripts/onboarding/managed/enable-monitoring.ps1 | 2 +-
 scripts/onboarding/managed/enable-monitoring.sh  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/onboarding/managed/enable-monitoring.ps1 b/scripts/onboarding/managed/enable-monitoring.ps1
index 1e1669400..4815dc958 100644
--- a/scripts/onboarding/managed/enable-monitoring.ps1
+++ b/scripts/onboarding/managed/enable-monitoring.ps1
@@ -60,7 +60,7 @@ $isUsingServicePrincipal = $false

 # released chart version in mcr
 $mcr = "mcr.microsoft.com"
-$mcrChartVersion = "2.7.6"
+$mcrChartVersion = "2.7.7"
 $mcrChartRepoPath = "azuremonitor/containerinsights/preview/azuremonitor-containers"
 $helmLocalRepoName = "."

diff --git a/scripts/onboarding/managed/enable-monitoring.sh b/scripts/onboarding/managed/enable-monitoring.sh
index ce62a581a..d7edf49dc 100644
--- a/scripts/onboarding/managed/enable-monitoring.sh
+++ b/scripts/onboarding/managed/enable-monitoring.sh
@@ -42,7 +42,7 @@ set -o pipefail
 defaultAzureCloud="AzureCloud"

 # released chart version in mcr
-mcrChartVersion="2.7.6"
+mcrChartVersion="2.7.7"
 mcr="mcr.microsoft.com"
 mcrChartRepoPath="azuremonitor/containerinsights/preview/azuremonitor-containers"
 helmLocalRepoName="."
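Since this patch only bumps the released chart version to 2.7.7, a quick sanity check that the new version is pullable from MCR looks like the following (a sketch mirroring the scripts' own OCI flow; assumes helm 3 with experimental OCI support):

```bash
export HELM_EXPERIMENTAL_OCI=1
helm chart pull mcr.microsoft.com/azuremonitor/containerinsights/preview/azuremonitor-containers:2.7.7
helm chart export mcr.microsoft.com/azuremonitor/containerinsights/preview/azuremonitor-containers:2.7.7 --destination .
```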
From 6203c3a0dd3a1deafd39aaa18e08968f01f45ab8 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 19 Oct 2020 16:58:12 -0700 Subject: [PATCH 035/194] add missing version update in the script (#458) --- scripts/onboarding/managed/upgrade-monitoring.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/onboarding/managed/upgrade-monitoring.sh b/scripts/onboarding/managed/upgrade-monitoring.sh index 8a12b2f02..23594c7bc 100644 --- a/scripts/onboarding/managed/upgrade-monitoring.sh +++ b/scripts/onboarding/managed/upgrade-monitoring.sh @@ -20,7 +20,7 @@ set -e set -o pipefail # released chart version for Azure Arc enabled Kubernetes public preview -mcrChartVersion="2.7.6" +mcrChartVersion="2.7.7" mcr="mcr.microsoft.com" mcrChartRepoPath="azuremonitor/containerinsights/preview/azuremonitor-containers" From 5b154691ba558c1257e15879b3a6f34655a3fc45 Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Tue, 27 Oct 2020 12:54:03 -0700 Subject: [PATCH 036/194] November release fixes - activate one agent, adx schema v2, win perf issue, syslog deactivation (#459) * activate one agent, adx schema v2, win perf issue, syslog deactivation * update chart --- .../linux/installer/scripts/livenessprobe.sh | 14 +- .../templates/omsagent-daemonset-windows.yaml | 3 +- .../templates/omsagent-daemonset.yaml | 2 +- charts/azuremonitor-containers/values.yaml | 6 +- kubernetes/linux/main.sh | 123 ++++++++++++++---- kubernetes/linux/setup.sh | 4 +- kubernetes/omsagent.yaml | 10 +- source/plugins/go/src/oms.go | 62 +++++---- source/plugins/go/src/utils.go | 2 +- .../plugins/ruby/CAdvisorMetricsAPIClient.rb | 4 +- source/plugins/ruby/constants.rb | 2 + 11 files changed, 165 insertions(+), 67 deletions(-) diff --git a/build/linux/installer/scripts/livenessprobe.sh b/build/linux/installer/scripts/livenessprobe.sh index 87f68a560..e3f9fb475 100644 --- a/build/linux/installer/scripts/livenessprobe.sh +++ b/build/linux/installer/scripts/livenessprobe.sh @@ -4,15 +4,25 @@ (ps -ef | grep omsagent- | grep -v "grep") if [ $? -ne 0 ] then - echo "Agent is NOT running" > /dev/termination-log + echo " omsagent is not running" > /dev/termination-log exit 1 fi +#optionally test to exit non zero value if oneagent is not running +if [ -e "/opt/AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE_V2" ]; then + (ps -ef | grep "mdsd -l" | grep -v "grep") + if [ $? -ne 0 ] + then + echo "oneagent is not running" > /dev/termination-log + exit 1 + fi +fi + #test to exit non zero value if fluentbit is not running (ps -ef | grep td-agent-bit | grep -v "grep") if [ $? -ne 0 ] then - echo "Fluentbit is NOT running" > /dev/termination-log + echo "Fluentbit is not running" > /dev/termination-log exit 1 fi diff --git a/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml b/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml index e65f9a98d..c916fadf6 100644 --- a/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml @@ -46,7 +46,7 @@ spec: {{- end }} imagePullPolicy: IfNotPresent resources: -{{ toYaml .Values.omsagent.resources.daemonset | indent 9 }} +{{ toYaml .Values.omsagent.resources.daemonset-windows | indent 9 }} env: {{- if ne .Values.omsagent.env.clusterId "" }} - name: AKS_RESOURCE_ID @@ -96,6 +96,7 @@ spec: - C:\opt\omsagentwindows\scripts\cmd\livenessProbe.cmd periodSeconds: 60 initialDelaySeconds: 180 + timeoutSeconds: 15 {{- with .Values.omsagent.tolerations }} tolerations: {{- toYaml . 
| nindent 8 }}
 {{- end }}
diff --git a/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml b/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml
index 438294ce5..8af13b6ee 100644
--- a/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml
+++ b/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml
@@ -40,7 +40,7 @@ spec:
 {{- end }}
       imagePullPolicy: IfNotPresent
       resources:
-{{ toYaml .Values.omsagent.resources.daemonset | indent 9 }}
+{{ toYaml .Values.omsagent.resources.daemonset-linux | indent 9 }}
       env:
 {{- if ne .Values.omsagent.env.clusterId "" }}
       - name: AKS_RESOURCE_ID
diff --git a/charts/azuremonitor-containers/values.yaml b/charts/azuremonitor-containers/values.yaml
index f841dc5d7..fa01c05bd 100644
--- a/charts/azuremonitor-containers/values.yaml
+++ b/charts/azuremonitor-containers/values.yaml
@@ -116,13 +116,17 @@ omsagent:
   ## ref: http://kubernetes.io/docs/user-guide/compute-resources/
   ##
   resources:
-    daemonset:
+    daemonset-linux:
       requests:
         cpu: 75m
         memory: 225Mi
       limits:
         cpu: 150m
         memory: 600Mi
+    daemonset-windows:
+      limits:
+        cpu: 200m
+        memory: 600Mi
     deployment:
       requests:
         cpu: 150m
diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh
index 11972f0f4..b093eb74b 100644
--- a/kubernetes/linux/main.sh
+++ b/kubernetes/linux/main.sh
@@ -416,6 +416,97 @@ echo "DOCKER_CIMPROV_VERSION=$DOCKER_CIMPROV_VERSION"
 export DOCKER_CIMPROV_VERSION=$DOCKER_CIMPROV_VERSION
 echo "export DOCKER_CIMPROV_VERSION=$DOCKER_CIMPROV_VERSION" >> ~/.bashrc

+#region check to auto-activate oneagent, to route container logs,
+#Intent is to activate one agent routing for all managed clusters with region in the region list, unless overridden by configmap
+# AZMON_CONTAINER_LOGS_ROUTE will have route (if any) specified in the config map
+# AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE will have the final route that we compute & set, based on our region list logic
+echo "************start oneagent log routing checks************"
+# by default, use configmap route for safer side
+AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE=$AZMON_CONTAINER_LOGS_ROUTE
+
+#trim region list
+oneagentregions="$(echo $AZMON_CONTAINERLOGS_ONEAGENT_REGIONS | xargs)"
+#lowercase region list
+typeset -l oneagentregions=$oneagentregions
+echo "oneagent regions: $oneagentregions"
+#trim current region
+currentregion="$(echo $AKS_REGION | xargs)"
+#lowercase current region
+typeset -l currentregion=$currentregion
+echo "current region: $currentregion"
+
+#initialize isoneagentregion as false
+isoneagentregion=false
+
+#set isoneagentregion as true if matching region is found
+if [ ! -z $oneagentregions ] && [ ! -z $currentregion ]; then
+    for rgn in $(echo $oneagentregions | sed "s/,/ /g"); do
+        if [ "$rgn" == "$currentregion" ]; then
+            isoneagentregion=true
+            echo "current region is in oneagent regions..."
+            break
+        fi
+    done
+else
+    echo "current region is not in oneagent regions..."
+fi
+
+if [ "$isoneagentregion" = true ]; then
+    #even when the current region is in the oneagent region list, a logs route set in the configmap wins
+    if [ ! -z $AZMON_CONTAINER_LOGS_ROUTE ]; then
+        AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE=$AZMON_CONTAINER_LOGS_ROUTE
+        echo "oneagent region is true for current region:$currentregion and config map logs route is not empty.
so using config map logs route as effective route:$AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE" + else #there is no configmap route, so route thru oneagent + AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE="v2" + echo "oneagent region is true for current region:$currentregion and config map logs route is empty. so using oneagent as effective route:$AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE" + fi +else + echo "oneagent region is false for current region:$currentregion" +fi + + +#start oneagent +if [ ! -e "/etc/config/kube.conf" ]; then + if [ ! -z $AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE ]; then + echo "container logs configmap route is $AZMON_CONTAINER_LOGS_ROUTE" + echo "container logs effective route is $AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE" + #trim + containerlogsroute="$(echo $AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE | xargs)" + # convert to lowercase + typeset -l containerlogsroute=$containerlogsroute + + echo "setting AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE as :$containerlogsroute" + export AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE=$containerlogsroute + echo "export AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE=$containerlogsroute" >> ~/.bashrc + source ~/.bashrc + + if [ "$containerlogsroute" == "v2" ]; then + echo "activating oneagent..." + echo "configuring mdsd..." + cat /etc/mdsd.d/envmdsd | while read line; do + echo $line >> ~/.bashrc + done + source /etc/mdsd.d/envmdsd + + echo "setting mdsd workspaceid & key for workspace:$CIWORKSPACE_id" + export CIWORKSPACE_id=$CIWORKSPACE_id + echo "export CIWORKSPACE_id=$CIWORKSPACE_id" >> ~/.bashrc + export CIWORKSPACE_key=$CIWORKSPACE_key + echo "export CIWORKSPACE_key=$CIWORKSPACE_key" >> ~/.bashrc + + source ~/.bashrc + + dpkg -l | grep mdsd | awk '{print $2 " " $3}' + + echo "starting mdsd ..." + mdsd -l -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos & + + touch /opt/AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE_V2 + fi + fi +fi +echo "************end oneagent log routing checks************" + #telegraf & fluentbit requirements if [ ! -e "/etc/config/kube.conf" ]; then if [ "$CONTAINER_RUNTIME" == "docker" ]; then @@ -491,37 +582,13 @@ dpkg -l | grep td-agent-bit | awk '{print $2 " " $3}' #dpkg -l | grep telegraf | awk '{print $2 " " $3}' -#start oneagent -if [ ! -e "/etc/config/kube.conf" ]; then - if [ ! -z $AZMON_CONTAINER_LOGS_ROUTE ]; then - echo "container logs route is defined as $AZMON_CONTAINER_LOGS_ROUTE" - #trim - containerlogsroute="$(echo $AZMON_CONTAINER_LOGS_ROUTE | xargs)" - # convert to lowercase - typeset -l containerlogsroute=$containerlogsroute - if [ "$containerlogsroute" == "v2" ]; then - echo "containerlogsroute $containerlogsroute" - echo "configuring mdsd..." - cat /etc/mdsd.d/envmdsd | while read line; do - echo $line >> ~/.bashrc - done - source /etc/mdsd.d/envmdsd - echo "setting mdsd workspaceid & key for workspace:$CIWORKSPACE_id" - export CIWORKSPACE_id=$CIWORKSPACE_id - echo "export CIWORKSPACE_id=$CIWORKSPACE_id" >> ~/.bashrc - export CIWORKSPACE_key=$CIWORKSPACE_key - echo "export CIWORKSPACE_key=$CIWORKSPACE_key" >> ~/.bashrc - source ~/.bashrc +echo "stopping rsyslog..." +service rsyslog stop - dpkg -l | grep mdsd | awk '{print $2 " " $3}' - - echo "starting mdsd ..." - mdsd -l -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos & - fi - fi -fi +echo "getting rsyslog status..." 
+service rsyslog status shutdown() { /opt/microsoft/omsagent/bin/service_control stop diff --git a/kubernetes/linux/setup.sh b/kubernetes/linux/setup.sh index 67a981dfa..fb41d4782 100644 --- a/kubernetes/linux/setup.sh +++ b/kubernetes/linux/setup.sh @@ -31,8 +31,8 @@ mv $TMPDIR/omsbundle* $TMPDIR/omsbundle /usr/bin/dpkg -i $TMPDIR/omsbundle/110/omsagent*.deb #/usr/bin/dpkg -i $TMPDIR/omsbundle/100/omsconfig*.deb -#install oneagent - Latest dev bits (7/17) -wget https://github.com/microsoft/Docker-Provider/releases/download/7172020-oneagent/azure-mdsd_1.5.124-build.develop.1294_x86_64.deb +#install oneagent - Official bits (10/18) +wget https://github.com/microsoft/Docker-Provider/releases/download/10182020-oneagent/azure-mdsd_1.5.126-build.master.99_x86_64.deb /usr/bin/dpkg -i $TMPDIR/azure-mdsd*.deb cp -f $TMPDIR/mdsd.xml /etc/mdsd.d cp -f $TMPDIR/envmdsd /etc/mdsd.d diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 18bc203d4..61f89b808 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -347,7 +347,7 @@ spec: imagePullPolicy: IfNotPresent resources: limits: - cpu: 150m + cpu: 250m memory: 600Mi requests: cpu: 75m @@ -370,6 +370,8 @@ spec: # Update this with the user assigned msi client id for omsagent - name: USER_ASSIGNED_IDENTITY_CLIENT_ID value: "" + - name: AZMON_CONTAINERLOGS_ONEAGENT_REGIONS + value: "koreacentral,norwayeast" securityContext: privileged: true ports: @@ -650,11 +652,8 @@ spec: imagePullPolicy: IfNotPresent resources: limits: - cpu: 150m + cpu: 200m memory: 600Mi - requests: - cpu: 75m - memory: 225Mi env: # azure devops pipeline uses AKS_RESOURCE_ID and AKS_REGION hence ensure to uncomment these - name: AKS_RESOURCE_ID @@ -696,6 +695,7 @@ spec: - C:\opt\omsagentwindows\scripts\cmd\livenessProbe.cmd periodSeconds: 60 initialDelaySeconds: 180 + timeoutSeconds: 15 affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: diff --git a/source/plugins/go/src/oms.go b/source/plugins/go/src/oms.go index 63ca6de10..5a678781c 100644 --- a/source/plugins/go/src/oms.go +++ b/source/plugins/go/src/oms.go @@ -194,15 +194,15 @@ type DataItem struct { } type DataItemADX struct { - LogEntry string `json:"LogEntry"` - LogEntrySource string `json:"LogEntrySource"` - LogEntryTimeStamp string `json:"LogEntryTimeStamp"` - LogEntryTimeOfCommand string `json:"TimeOfCommand"` - ID string `json:"Id"` - Image string `json:"Image"` - Name string `json:"Name"` - SourceSystem string `json:"SourceSystem"` + TimeGenerated string `json:"TimeGenerated"` Computer string `json:"Computer"` + ContainerID string `json:"ContainerID"` + ContainerName string `json:"ContainerName"` + PodName string `json:"PodName"` + PodNamespace string `json:"PodNamespace"` + LogMessage string `json:"LogMessage"` + LogSource string `json:"LogSource"` + //PodLabels string `json:"PodLabels"` AzureResourceId string `json:"AzureResourceId"` } @@ -422,7 +422,7 @@ func convert(in interface{}) (float64, bool) { func populateKubeMonAgentEventHash(record map[interface{}]interface{}, errType KubeMonAgentEventType) { var logRecordString = ToString(record["log"]) var eventTimeStamp = ToString(record["time"]) - containerID, _, podName := GetContainerIDK8sNamespacePodNameFromFileName(ToString(record["filepath"])) + containerID, _, podName, _ := GetContainerIDK8sNamespacePodNameFromFileName(ToString(record["filepath"])) Log("Locked EventHashUpdateMutex for updating hash \n ") EventHashUpdateMutex.Lock() @@ -816,7 +816,7 @@ func PostDataHelper(tailPluginRecords 
[]map[interface{}]interface{}) int { DataUpdateMutex.Unlock() for _, record := range tailPluginRecords { - containerID, k8sNamespace, _ := GetContainerIDK8sNamespacePodNameFromFileName(ToString(record["filepath"])) + containerID, k8sNamespace, k8sPodName, containerName := GetContainerIDK8sNamespacePodNameFromFileName(ToString(record["filepath"])) logEntrySource := ToString(record["stream"]) if strings.EqualFold(logEntrySource, "stdout") { @@ -867,16 +867,18 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { if ResourceCentric == true { stringMap["AzureResourceId"] = ResourceID } + stringMap["PodName"] = k8sPodName + stringMap["PodNamespace"] = k8sNamespace + stringMap["ContainerName"] = containerName dataItemADX = DataItemADX{ - ID: stringMap["Id"], - LogEntry: stringMap["LogEntry"], - LogEntrySource: stringMap["LogEntrySource"], - LogEntryTimeStamp: stringMap["LogEntryTimeStamp"], - LogEntryTimeOfCommand: stringMap["TimeOfCommand"], - SourceSystem: stringMap["SourceSystem"], + TimeGenerated: stringMap["LogEntryTimeStamp"], Computer: stringMap["Computer"], - Image: stringMap["Image"], - Name: stringMap["Name"], + ContainerID: stringMap["Id"], + ContainerName: stringMap["ContainerName"], + PodName: stringMap["PodName"], + PodNamespace: stringMap["PodNamespace"], + LogMessage: stringMap["LogEntry"], + LogSource: stringMap["LogEntrySource"], AzureResourceId: stringMap["AzureResourceId"], } //ADX @@ -1018,7 +1020,7 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { //ADXFlushMutex.Lock() //defer ADXFlushMutex.Unlock() //MultiJSON support is not there yet - if ingestionErr := ADXIngestor.FromReader(ctx, r, ingest.IngestionMappingRef("ContainerLogMapping", ingest.JSON), ingest.FileFormat(ingest.JSON), ingest.FlushImmediately()); ingestionErr != nil { + if ingestionErr := ADXIngestor.FromReader(ctx, r, ingest.IngestionMappingRef("ContainerLogv2Mapping", ingest.JSON), ingest.FileFormat(ingest.JSON)); ingestionErr != nil { Log("Error when streaming to ADX Ingestion: %s", ingestionErr.Error()) //ADXIngestor = nil //not required as per ADX team. 
Will keep it to indicate that we tried this approach @@ -1107,12 +1109,13 @@ func containsKey(currentMap map[string]bool, key string) bool { return c } -// GetContainerIDK8sNamespacePodNameFromFileName Gets the container ID, k8s namespace and pod name From the file Name +// GetContainerIDK8sNamespacePodNameFromFileName Gets the container ID, k8s namespace, pod name and containername From the file Name // sample filename kube-proxy-dgcx7_kube-system_kube-proxy-8df7e49e9028b60b5b0d0547f409c455a9567946cf763267b7e6fa053ab8c182.log -func GetContainerIDK8sNamespacePodNameFromFileName(filename string) (string, string, string) { +func GetContainerIDK8sNamespacePodNameFromFileName(filename string) (string, string, string, string) { id := "" ns := "" podName := "" + containerName := "" start := strings.LastIndex(filename, "-") end := strings.LastIndex(filename, ".") @@ -1132,6 +1135,15 @@ func GetContainerIDK8sNamespacePodNameFromFileName(filename string) (string, str ns = filename[start+1 : end] } + start = strings.LastIndex(filename, "_") + end = strings.LastIndex(filename, "-") + + if start >= end || start == -1 || end == -1 { + containerName = "" + } else { + containerName = filename[start+1 : end] + } + start = strings.Index(filename, "/containers/") end = strings.Index(filename, "_") @@ -1141,7 +1153,7 @@ func GetContainerIDK8sNamespacePodNameFromFileName(filename string) (string, str podName = filename[(start + len("/containers/")):end] } - return id, ns, podName + return id, ns, podName, containerName } // InitializePlugin reads and populates plugin configuration @@ -1313,8 +1325,8 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { CreateHTTPClient() - ContainerLogsRoute := strings.TrimSpace(strings.ToLower(os.Getenv("AZMON_CONTAINER_LOGS_ROUTE"))) - Log("AZMON_CONTAINER_LOGS_ROUTE:%s", ContainerLogsRoute) + ContainerLogsRoute := strings.TrimSpace(strings.ToLower(os.Getenv("AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE"))) + Log("AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE:%s", ContainerLogsRoute) ContainerLogsRouteV2 = false //default is ODS ContainerLogsRouteADX = false //default is LA @@ -1365,7 +1377,7 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { if strings.Compare(strings.ToLower(os.Getenv("CONTROLLER_TYPE")), "daemonset") == 0 { populateExcludedStdoutNamespaces() populateExcludedStderrNamespaces() - if enrichContainerLogs == true { + if enrichContainerLogs == true && ContainerLogsRouteADX != true { Log("ContainerLogEnrichment=true; starting goroutine to update containerimagenamemaps \n") go updateContainerImageNameMaps() } else { diff --git a/source/plugins/go/src/utils.go b/source/plugins/go/src/utils.go index 8b1a3df65..91791ae1a 100644 --- a/source/plugins/go/src/utils.go +++ b/source/plugins/go/src/utils.go @@ -145,7 +145,7 @@ func CreateADXClient() { //log.Fatalf("Unable to create ADX connection %s", err.Error()) } else { Log("Successfully created ADX Client. 
Creating Ingestor...") - ingestor, ingestorErr := ingest.New(client, "containerinsights", "ContainerLog") + ingestor, ingestorErr := ingest.New(client, "containerinsights", "ContainerLogv2") if ingestorErr != nil { Log("Error::mdsd::Unable to create ADX ingestor %s", ingestorErr.Error()) } else { diff --git a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb index 9e0935480..67bd61667 100644 --- a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb +++ b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb @@ -248,7 +248,9 @@ def getContainerCpuMetricItems(metricJSON, hostName, cpuMetricNameToCollect, met telemetryProps["dsPromUrl"] = @dsPromUrlCount end #telemetry about containerlogs Routing for daemonset - if (!@containerLogsRoute.nil? && !@containerLogsRoute.empty?) + if File.exist?(Constants::AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE_V2_FILENAME) + telemetryProps["containerLogsRoute"] = "v2" + elsif (!@containerLogsRoute.nil? && !@containerLogsRoute.empty?) telemetryProps["containerLogsRoute"] = @containerLogsRoute end #telemetry about health model diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index 35e5f9334..0e5099c5e 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -91,4 +91,6 @@ class Constants #Pod Statuses POD_STATUS_TERMINATING = "Terminating" + + AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE_V2_FILENAME = "/opt/AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE_V2" end From 157ba201f426a0f53193a9eb26a6ad650edc9442 Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Tue, 27 Oct 2020 20:17:03 -0700 Subject: [PATCH 037/194] remove hyphen for params in chart (#462) Merging as it's a simple fix (remove hyphen) --- .../templates/omsagent-daemonset-windows.yaml | 2 +- .../azuremonitor-containers/templates/omsagent-daemonset.yaml | 2 +- charts/azuremonitor-containers/values.yaml | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml b/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml index c916fadf6..6a309c121 100644 --- a/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml @@ -46,7 +46,7 @@ spec: {{- end }} imagePullPolicy: IfNotPresent resources: -{{ toYaml .Values.omsagent.resources.daemonset-windows | indent 9 }} +{{ toYaml .Values.omsagent.resources.daemonsetwindows | indent 9 }} env: {{- if ne .Values.omsagent.env.clusterId "" }} - name: AKS_RESOURCE_ID diff --git a/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml b/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml index 8af13b6ee..d57c4d82b 100644 --- a/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml @@ -40,7 +40,7 @@ spec: {{- end }} imagePullPolicy: IfNotPresent resources: -{{ toYaml .Values.omsagent.resources.daemonset-linux | indent 9 }} +{{ toYaml .Values.omsagent.resources.daemonsetlinux | indent 9 }} env: {{- if ne .Values.omsagent.env.clusterId "" }} - name: AKS_RESOURCE_ID diff --git a/charts/azuremonitor-containers/values.yaml b/charts/azuremonitor-containers/values.yaml index fa01c05bd..774e6203f 100644 --- a/charts/azuremonitor-containers/values.yaml +++ b/charts/azuremonitor-containers/values.yaml @@ -116,14 +116,14 @@ omsagent: ## ref: http://kubernetes.io/docs/user-guide/compute-resources/ ## resources: -
daemonset-linux: + daemonsetlinux: requests: cpu: 75m memory: 225Mi limits: cpu: 150m memory: 600Mi - daemonset-windows: + daemonsetwindows: limits: cpu: 200m memory: 600Mi From 7c448bc5f561b2a72c33c689dda0db893bd41038 Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Tue, 27 Oct 2020 21:22:34 -0700 Subject: [PATCH 038/194] Changes for cutting a new build for ciprod10272020 release (#460) --- ReleaseNotes.md | 10 ++++++++++ build/version | 6 +++--- charts/azuremonitor-containers/Chart.yaml | 2 +- charts/azuremonitor-containers/values.yaml | 6 +++--- kubernetes/linux/Dockerfile | 2 +- kubernetes/omsagent.yaml | 12 ++++++------ kubernetes/windows/Dockerfile | 2 +- scripts/onboarding/managed/enable-monitoring.ps1 | 2 +- scripts/onboarding/managed/enable-monitoring.sh | 2 +- scripts/onboarding/managed/upgrade-monitoring.sh | 2 +- 10 files changed, 28 insertions(+), 18 deletions(-) diff --git a/ReleaseNotes.md b/ReleaseNotes.md index e1892d083..eb8e282b9 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -11,6 +11,16 @@ additional questions or comments. Note : The agent version(s) below has dates (ciprod), which indicate the agent build dates (not release dates) +### 10/27/2020 - +##### Version microsoft/oms:ciprod10272020 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10272020 (linux) +##### Version microsoft/oms:win-ciprod10272020 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod10272020 (windows) +##### Code change log +- Activate oneagent in a few AKS regions (koreacentral,norwayeast) +- Disable syslog +- Fix timeout for Windows daemonset liveness probe +- Make request == limit for Windows daemonset resources (cpu & memory) +- Schema v2 for container log (ADX only - applicable only for select customers for piloting) + ### 10/05/2020 - ##### Version microsoft/oms:ciprod10052020 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10052020 (linux) ##### Version microsoft/oms:win-ciprod10052020 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod10052020 (windows) diff --git a/build/version b/build/version index 9587328de..71c70020e 100644 --- a/build/version +++ b/build/version @@ -2,11 +2,11 @@ # Build Version Information -CONTAINER_BUILDVERSION_MAJOR=10 -CONTAINER_BUILDVERSION_MINOR=1 +CONTAINER_BUILDVERSION_MAJOR=11 +CONTAINER_BUILDVERSION_MINOR=0 CONTAINER_BUILDVERSION_PATCH=0 CONTAINER_BUILDVERSION_BUILDNR=0 -CONTAINER_BUILDVERSION_DATE=20201005 +CONTAINER_BUILDVERSION_DATE=20201027 CONTAINER_BUILDVERSION_STATUS=Developer_Build #-------------------------------- End of File ----------------------------------- diff --git a/charts/azuremonitor-containers/Chart.yaml b/charts/azuremonitor-containers/Chart.yaml index 6d45b05d8..bc35690e4 100644 --- a/charts/azuremonitor-containers/Chart.yaml +++ b/charts/azuremonitor-containers/Chart.yaml @@ -2,7 +2,7 @@ apiVersion: v1 appVersion: 7.0.0-1 description: Helm chart for deploying Azure Monitor container monitoring agent in Kubernetes name: azuremonitor-containers -version: 2.7.7 +version: 2.7.8 kubeVersion: "^1.10.0-0" keywords: - monitoring diff --git a/charts/azuremonitor-containers/values.yaml b/charts/azuremonitor-containers/values.yaml index 774e6203f..0f07a98c1 100644 --- a/charts/azuremonitor-containers/values.yaml +++ b/charts/azuremonitor-containers/values.yaml @@ -12,10 +12,10 @@ Azure: omsagent: image: repo: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod" - tag: "ciprod10052020" - tagWindows: "win-ciprod10052020" + tag: "ciprod10272020" +
tagWindows: "win-ciprod10272020" pullPolicy: IfNotPresent - dockerProviderVersion: "10.1.0-0" + dockerProviderVersion: "11.0.0-0" agentVersion: "1.10.0.1" ## To get your workspace id and key do the following ## You can create a Azure Loganalytics workspace from portal.azure.com and get its ID & PRIMARY KEY from 'Advanced Settings' tab in the Ux. diff --git a/kubernetes/linux/Dockerfile b/kubernetes/linux/Dockerfile index f4324a18a..c3428a44a 100644 --- a/kubernetes/linux/Dockerfile +++ b/kubernetes/linux/Dockerfile @@ -2,7 +2,7 @@ FROM ubuntu:18.04 MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="Azure Monitor for containers" -ARG IMAGE_TAG=ciprod10052020 +ARG IMAGE_TAG=ciprod10272020 ENV AGENT_VERSION ${IMAGE_TAG} ENV tmpdir /opt ENV APPLICATIONINSIGHTS_AUTH NzAwZGM5OGYtYTdhZC00NThkLWI5NWMtMjA3ZjM3NmM3YmRi diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 61f89b808..ca47d898d 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -337,13 +337,13 @@ spec: tier: node annotations: agentVersion: "1.10.0.1" - dockerProviderVersion: "10.1.0-0" + dockerProviderVersion: "11.0.0-0" schema-versions: "v1" spec: serviceAccountName: omsagent containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10052020" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10272020" imagePullPolicy: IfNotPresent resources: limits: @@ -496,13 +496,13 @@ spec: rsName: "omsagent-rs" annotations: agentVersion: "1.10.0.1" - dockerProviderVersion: "10.1.0-0" + dockerProviderVersion: "11.0.0-0" schema-versions: "v1" spec: serviceAccountName: omsagent containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10052020" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10272020" imagePullPolicy: IfNotPresent resources: limits: @@ -642,13 +642,13 @@ spec: tier: node-win annotations: agentVersion: "1.10.0.1" - dockerProviderVersion: "10.1.0-0" + dockerProviderVersion: "11.0.0-0" schema-versions: "v1" spec: serviceAccountName: omsagent containers: - name: omsagent-win - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod10052020" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod10272020" imagePullPolicy: IfNotPresent resources: limits: diff --git a/kubernetes/windows/Dockerfile b/kubernetes/windows/Dockerfile index c7dee60af..414817559 100644 --- a/kubernetes/windows/Dockerfile +++ b/kubernetes/windows/Dockerfile @@ -3,7 +3,7 @@ MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="Azure Monitor for containers" -ARG IMAGE_TAG=win-ciprod10052020 +ARG IMAGE_TAG=win-ciprod10272020 # Do not split this into multiple RUN! # Docker creates a layer for every RUN-Statement diff --git a/scripts/onboarding/managed/enable-monitoring.ps1 b/scripts/onboarding/managed/enable-monitoring.ps1 index 4815dc958..22d34894f 100644 --- a/scripts/onboarding/managed/enable-monitoring.ps1 +++ b/scripts/onboarding/managed/enable-monitoring.ps1 @@ -60,7 +60,7 @@ $isUsingServicePrincipal = $false # released chart version in mcr $mcr = "mcr.microsoft.com" -$mcrChartVersion = "2.7.7" +$mcrChartVersion = "2.7.8" $mcrChartRepoPath = "azuremonitor/containerinsights/preview/azuremonitor-containers" $helmLocalRepoName = "." 
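The dockerProviderVersion strings stamped into values.yaml and omsagent.yaml above track the CONTAINER_BUILDVERSION_* fields in build/version, following a MAJOR.MINOR.PATCH-BUILDNR pattern ("11.0.0-0" for this release). A minimal Ruby sketch of that mapping; the helper below is hypothetical and not part of this patch:

# Hypothetical helper: derive the dockerProviderVersion string from build/version.
def docker_provider_version(version_file = "build/version")
  fields = {}
  File.foreach(version_file) do |line|
    # Fields look like: CONTAINER_BUILDVERSION_MAJOR=11
    fields[$1] = $2 if line =~ /^CONTAINER_BUILDVERSION_(\w+)=(\S+)/
  end
  "#{fields["MAJOR"]}.#{fields["MINOR"]}.#{fields["PATCH"]}-#{fields["BUILDNR"]}"
end

puts docker_provider_version # => "11.0.0-0" with the values in this commit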
diff --git a/scripts/onboarding/managed/enable-monitoring.sh b/scripts/onboarding/managed/enable-monitoring.sh index d7edf49dc..e0d26c370 100644 --- a/scripts/onboarding/managed/enable-monitoring.sh +++ b/scripts/onboarding/managed/enable-monitoring.sh @@ -42,7 +42,7 @@ set -o pipefail defaultAzureCloud="AzureCloud" # released chart version in mcr -mcrChartVersion="2.7.7" +mcrChartVersion="2.7.8" mcr="mcr.microsoft.com" mcrChartRepoPath="azuremonitor/containerinsights/preview/azuremonitor-containers" helmLocalRepoName="." diff --git a/scripts/onboarding/managed/upgrade-monitoring.sh b/scripts/onboarding/managed/upgrade-monitoring.sh index 23594c7bc..4134d710f 100644 --- a/scripts/onboarding/managed/upgrade-monitoring.sh +++ b/scripts/onboarding/managed/upgrade-monitoring.sh @@ -20,7 +20,7 @@ set -e set -o pipefail # released chart version for Azure Arc enabled Kubernetes public preview -mcrChartVersion="2.7.7" +mcrChartVersion="2.7.8" mcr="mcr.microsoft.com" mcrChartRepoPath="azuremonitor/containerinsights/preview/azuremonitor-containers" From 62b27d79ba9622a939b6d20e33292725bb2e9bef Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Thu, 29 Oct 2020 08:18:07 -0700 Subject: [PATCH 039/194] using latest stable version of msys2 (#465) --- kubernetes/windows/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kubernetes/windows/Dockerfile b/kubernetes/windows/Dockerfile index 414817559..c4545d705 100644 --- a/kubernetes/windows/Dockerfile +++ b/kubernetes/windows/Dockerfile @@ -10,7 +10,7 @@ ARG IMAGE_TAG=win-ciprod10272020 RUN powershell -Command "Set-ExecutionPolicy Bypass -Scope Process -Force; iex ((New-Object System.Net.WebClient).DownloadString('https://chocolatey.org/install.ps1'))" # Fluentd depends on cool.io whose fat gem is only available for Ruby < 2.5, so need to specify --platform ruby when install Ruby > 2.5 and install msys2 to get dev tools RUN choco install -y ruby --version 2.6.5.1 --params "'/InstallDir:C:\ruby26'" \ -&& choco install -y msys2 --version 20190524.0.0.20191030 --params "'/NoPath /NoUpdate /InstallDir:C:\ruby26\msys64'" \ +&& choco install -y msys2 --version 20200903.0.0 --params "'/NoPath /NoUpdate /InstallDir:C:\ruby26\msys64'" \ && choco install -y vim # gangams - optional MSYS2 update via ridk failing in merged docker file so skipping that since we dont need optional update From 909cc16348135c31f8d82af130a75f8bc54f7b6f Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Thu, 29 Oct 2020 14:48:00 -0700 Subject: [PATCH 040/194] fixing the windows-perf-dups (#466) --- source/plugins/ruby/in_win_cadvisor_perf.rb | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/source/plugins/ruby/in_win_cadvisor_perf.rb b/source/plugins/ruby/in_win_cadvisor_perf.rb index 4e90195e5..9c267cf4f 100644 --- a/source/plugins/ruby/in_win_cadvisor_perf.rb +++ b/source/plugins/ruby/in_win_cadvisor_perf.rb @@ -10,7 +10,7 @@ class Win_CAdvisor_Perf_Input < Input def initialize super require "yaml" - require 'yajl/json_gem' + require "yajl/json_gem" require "time" require_relative "CAdvisorMetricsAPIClient" @@ -52,8 +52,6 @@ def shutdown def enumerate() time = Time.now.to_f begin - eventStream = MultiEventStream.new - insightsMetricsEventStream = MultiEventStream.new timeDifference = (DateTime.now.to_time.to_i - @@winNodeQueryTimeTracker).abs timeDifferenceInMinutes = timeDifference / 60 @@istestvar = ENV["ISTEST"] @@ -70,6 +68,7 @@ def enumerate() @@winNodeQueryTimeTracker = DateTime.now.to_time.to_i end @@winNodes.each do 
|winNode| + eventStream = MultiEventStream.new metricData = CAdvisorMetricsAPIClient.getMetrics(winNode: winNode, metricTime: Time.now.utc.iso8601) metricData.each do |record| if !record.empty? @@ -81,7 +80,6 @@ def enumerate() router.emit_stream(@tag, eventStream) if eventStream router.emit_stream(@mdmtag, eventStream) if eventStream - if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && eventStream.count > 0) $log.info("winCAdvisorPerfEmitStreamSuccess @ #{Time.now.utc.iso8601}") end @@ -90,6 +88,7 @@ def enumerate() begin containerGPUusageInsightsMetricsDataItems = [] containerGPUusageInsightsMetricsDataItems.concat(CAdvisorMetricsAPIClient.getInsightsMetrics(winNode: winNode, metricTime: Time.now.utc.iso8601)) + insightsMetricsEventStream = MultiEventStream.new containerGPUusageInsightsMetricsDataItems.each do |insightsMetricsRecord| wrapper = { @@ -104,12 +103,12 @@ def enumerate() router.emit_stream(@mdmtag, insightsMetricsEventStream) if insightsMetricsEventStream if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0) $log.info("winCAdvisorInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") - end + end rescue => errorStr $log.warn "Failed when processing GPU Usage metrics in_win_cadvisor_perf : #{errorStr}" $log.debug_backtrace(errorStr.backtrace) ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) - end + end #end GPU InsightsMetrics items end From d481c066df67ce9cf76d163c0776502f3989aea1 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Fri, 6 Nov 2020 00:02:52 -0800 Subject: [PATCH 041/194] chart updates related to new microsoft/charts repo (#467) --- charts/azuremonitor-containers/README.md | 18 ++++++++++-------- .../templates/NOTES.txt | 2 +- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/charts/azuremonitor-containers/README.md b/charts/azuremonitor-containers/README.md index 3b357ffd5..469fac94a 100644 --- a/charts/azuremonitor-containers/README.md +++ b/charts/azuremonitor-containers/README.md @@ -29,6 +29,8 @@ Monitoring your Kubernetes cluster and containers is critical, especially when r ## Installing the Chart +> Note: If you want to customize the chart, fork the chart code in https://github.com/microsoft/Docker-Provider/tree/ci_prod/charts/azuremonitor-containers + > Note: `--name` flag not required in Helm3 since this flag is deprecated > Note: use `omsagent.proxy` parameter to set the proxy endpoint if your K8s cluster configured behind the proxy. Refer to [configure proxy](#Configuring-Proxy-Endpoint) for more details about proxy. 
@@ -36,25 +38,25 @@ Monitoring your Kubernetes cluster and containers is critical, especially when r ### To Use Azure Log Analytics Workspace in Public Cloud ```bash -$ helm repo add incubator https://kubernetes-charts-incubator.storage.googleapis.com/ +$ helm repo add microsoft https://microsoft.github.io/charts/repo $ helm install --name azmon-containers-release-1 \ ---set omsagent.secret.wsid=,omsagent.secret.key=,omsagent.env.clusterName= incubator/azuremonitor-containers +--set omsagent.secret.wsid=,omsagent.secret.key=,omsagent.env.clusterName= microsoft/azuremonitor-containers ``` ### To Use Azure Log Analytics Workspace in Azure China Cloud ```bash -$ helm repo add incubator https://kubernetes-charts-incubator.storage.googleapis.com/ +$ helm repo add microsoft https://microsoft.github.io/charts/repo $ helm install --name azmon-containers-release-1 \ ---set omsagent.domain=opinsights.azure.cn,omsagent.secret.wsid=,omsagent.secret.key=,omsagent.env.clusterName= incubator/azuremonitor-containers +--set omsagent.domain=opinsights.azure.cn,omsagent.secret.wsid=,omsagent.secret.key=,omsagent.env.clusterName= microsoft/azuremonitor-containers ``` ### To Use Azure Log Analytics Workspace in Azure US Government Cloud ```bash -$ helm repo add incubator https://kubernetes-charts-incubator.storage.googleapis.com/ +$ helm repo add microsoft https://microsoft.github.io/charts/repo $ helm install --name azmon-containers-release-1 \ ---set omsagent.domain=opinsights.azure.us,omsagent.secret.wsid=,omsagent.secret.key=,omsagent.env.clusterName= incubator/azuremonitor-containers +--set omsagent.domain=opinsights.azure.us,omsagent.secret.wsid=,omsagent.secret.key=,omsagent.env.clusterName= microsoft/azuremonitor-containers ``` ## Upgrading an existing Release to a new version @@ -112,13 +114,13 @@ Specify each parameter using the `--set key=value[,key=value]` argument to `helm $ helm install --name myrelease-1 \ --set omsagent.secret.wsid=,omsagent.secret.key=,omsagent.env.clusterName= - incubator/azuremonitor-containers + microsoft/azuremonitor-containers ``` Alternatively, a YAML file that specifies the values for the parameters can be provided while installing the chart. For example, ```bash -$ helm install --name myrelease-1 -f values.yaml incubator/azuremonitor-containers +$ helm install --name myrelease-1 -f values.yaml microsoft/azuremonitor-containers ``` diff --git a/charts/azuremonitor-containers/templates/NOTES.txt b/charts/azuremonitor-containers/templates/NOTES.txt index 372cecb95..48ebf33fc 100644 --- a/charts/azuremonitor-containers/templates/NOTES.txt +++ b/charts/azuremonitor-containers/templates/NOTES.txt @@ -29,7 +29,7 @@ This deployment will not complete. 
To proceed, run --set omsagent.secret.wsid= \ --set omsagent.secret.key= \ --set omsagent.env.clusterName= \ - incubator/azuremonitor-containers + microsoft/azuremonitor-containers {{- else -}} From aff1e13c240836cea73f3913f098b2737f186b89 Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Mon, 9 Nov 2020 13:18:02 -0800 Subject: [PATCH 042/194] Changes for creating 11092020 release (#468) --- ReleaseNotes.md | 6 ++++++ build/version | 4 ++-- charts/azuremonitor-containers/Chart.yaml | 2 +- charts/azuremonitor-containers/values.yaml | 6 +++--- kubernetes/linux/Dockerfile | 2 +- kubernetes/omsagent.yaml | 12 ++++++------ kubernetes/windows/Dockerfile | 2 +- scripts/onboarding/managed/enable-monitoring.ps1 | 2 +- scripts/onboarding/managed/enable-monitoring.sh | 2 +- scripts/onboarding/managed/upgrade-monitoring.sh | 2 +- 10 files changed, 23 insertions(+), 17 deletions(-) diff --git a/ReleaseNotes.md b/ReleaseNotes.md index eb8e282b9..ddfd01314 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -11,6 +11,12 @@ additional questions or comments. Note : The agent version(s) below has dates (ciprod), which indicate the agent build dates (not release dates) +### 11/09/2020 - +##### Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod11092020 (linux) +##### Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod11092020 (windows) +##### Code change log +- Fix for duplicate windows metrics + ### 10/27/2020 - ##### Version microsoft/oms:ciprod10272020 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10272020 (linux) ##### Version microsoft/oms:win-ciprod10272020 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod10052020 (windows) diff --git a/build/version b/build/version index 71c70020e..a8b78ecac 100644 --- a/build/version +++ b/build/version @@ -5,8 +5,8 @@ CONTAINER_BUILDVERSION_MAJOR=11 CONTAINER_BUILDVERSION_MINOR=0 CONTAINER_BUILDVERSION_PATCH=0 -CONTAINER_BUILDVERSION_BUILDNR=0 -CONTAINER_BUILDVERSION_DATE=20201027 +CONTAINER_BUILDVERSION_BUILDNR=1 +CONTAINER_BUILDVERSION_DATE=20201109 CONTAINER_BUILDVERSION_STATUS=Developer_Build #-------------------------------- End of File ----------------------------------- diff --git a/charts/azuremonitor-containers/Chart.yaml b/charts/azuremonitor-containers/Chart.yaml index bc35690e4..987841f77 100644 --- a/charts/azuremonitor-containers/Chart.yaml +++ b/charts/azuremonitor-containers/Chart.yaml @@ -2,7 +2,7 @@ apiVersion: v1 appVersion: 7.0.0-1 description: Helm chart for deploying Azure Monitor container monitoring agent in Kubernetes name: azuremonitor-containers -version: 2.7.8 +version: 2.7.9 kubeVersion: "^1.10.0-0" keywords: - monitoring diff --git a/charts/azuremonitor-containers/values.yaml b/charts/azuremonitor-containers/values.yaml index 0f07a98c1..76ea0a26d 100644 --- a/charts/azuremonitor-containers/values.yaml +++ b/charts/azuremonitor-containers/values.yaml @@ -12,10 +12,10 @@ Azure: omsagent: image: repo: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod" - tag: "ciprod10272020" - tagWindows: "win-ciprod10272020" + tag: "ciprod11092020" + tagWindows: "win-ciprod11092020" pullPolicy: IfNotPresent - dockerProviderVersion: "11.0.0-0" + dockerProviderVersion: "11.0.0-1" agentVersion: "1.10.0.1" ## To get your workspace id and key do the following ## You can create a Azure Loganalytics workspace from portal.azure.com and get its ID & PRIMARY KEY from 'Advanced Settings' tab in the Ux. 
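A release cut like this one bumps the same tag strings by hand across the chart, the Kubernetes manifests, and both Dockerfiles, which is easy to get wrong. A hypothetical Ruby pre-release check; the paths and tokens below are taken from the diffs in this commit, and the script itself is not in the repo:

# Hypothetical consistency check: every file this release touches must reference the tags being cut.
EXPECTED = {
  "kubernetes/linux/Dockerfile" => ["ciprod11092020"],
  "kubernetes/windows/Dockerfile" => ["win-ciprod11092020"],
  "kubernetes/omsagent.yaml" => ["ciprod11092020", "win-ciprod11092020", "11.0.0-1"],
  "charts/azuremonitor-containers/values.yaml" => ["ciprod11092020", "win-ciprod11092020", "11.0.0-1"],
}
failures = []
EXPECTED.each do |path, tokens|
  content = File.read(path)
  tokens.each do |token|
    failures << "#{path}: missing #{token}" unless content.include?(token)
  end
end
abort(failures.join("\n")) unless failures.empty?
puts "release tags are consistent"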
diff --git a/kubernetes/linux/Dockerfile b/kubernetes/linux/Dockerfile index c3428a44a..d04e86128 100644 --- a/kubernetes/linux/Dockerfile +++ b/kubernetes/linux/Dockerfile @@ -2,7 +2,7 @@ FROM ubuntu:18.04 MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="Azure Monitor for containers" -ARG IMAGE_TAG=ciprod10272020 +ARG IMAGE_TAG=ciprod11092020 ENV AGENT_VERSION ${IMAGE_TAG} ENV tmpdir /opt ENV APPLICATIONINSIGHTS_AUTH NzAwZGM5OGYtYTdhZC00NThkLWI5NWMtMjA3ZjM3NmM3YmRi diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index ca47d898d..7d07eafcd 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -337,13 +337,13 @@ spec: tier: node annotations: agentVersion: "1.10.0.1" - dockerProviderVersion: "11.0.0-0" + dockerProviderVersion: "11.0.0-1" schema-versions: "v1" spec: serviceAccountName: omsagent containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10272020" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod11092020" imagePullPolicy: IfNotPresent resources: limits: @@ -496,13 +496,13 @@ spec: rsName: "omsagent-rs" annotations: agentVersion: "1.10.0.1" - dockerProviderVersion: "11.0.0-0" + dockerProviderVersion: "11.0.0-1" schema-versions: "v1" spec: serviceAccountName: omsagent containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10272020" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod11092020" imagePullPolicy: IfNotPresent resources: limits: @@ -642,13 +642,13 @@ spec: tier: node-win annotations: agentVersion: "1.10.0.1" - dockerProviderVersion: "11.0.0-0" + dockerProviderVersion: "11.0.0-1" schema-versions: "v1" spec: serviceAccountName: omsagent containers: - name: omsagent-win - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod10272020" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod11092020" imagePullPolicy: IfNotPresent resources: limits: diff --git a/kubernetes/windows/Dockerfile b/kubernetes/windows/Dockerfile index c4545d705..10ea235b2 100644 --- a/kubernetes/windows/Dockerfile +++ b/kubernetes/windows/Dockerfile @@ -3,7 +3,7 @@ MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="Azure Monitor for containers" -ARG IMAGE_TAG=win-ciprod10272020 +ARG IMAGE_TAG=win-ciprod11092020 # Do not split this into multiple RUN! # Docker creates a layer for every RUN-Statement diff --git a/scripts/onboarding/managed/enable-monitoring.ps1 b/scripts/onboarding/managed/enable-monitoring.ps1 index 22d34894f..b052f22c5 100644 --- a/scripts/onboarding/managed/enable-monitoring.ps1 +++ b/scripts/onboarding/managed/enable-monitoring.ps1 @@ -60,7 +60,7 @@ $isUsingServicePrincipal = $false # released chart version in mcr $mcr = "mcr.microsoft.com" -$mcrChartVersion = "2.7.8" +$mcrChartVersion = "2.7.9" $mcrChartRepoPath = "azuremonitor/containerinsights/preview/azuremonitor-containers" $helmLocalRepoName = "." 
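The mcrChartVersion pinned in these onboarding scripts moves in lockstep with the chart version in Chart.yaml (2.7.7 to 2.7.8 in PATCH 038, 2.7.8 to 2.7.9 here), and each bump so far is a patch-level increment. A tiny hypothetical Ruby helper illustrating the bump these commits perform by hand:

# Hypothetical helper: bump the patch component of a chart version, e.g. "2.7.8" -> "2.7.9".
def bump_chart_patch(version)
  major, minor, patch = version.split(".").map(&:to_i)
  "#{major}.#{minor}.#{patch + 1}"
end

puts bump_chart_patch("2.7.8") # => "2.7.9"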
diff --git a/scripts/onboarding/managed/enable-monitoring.sh b/scripts/onboarding/managed/enable-monitoring.sh index e0d26c370..bb6974258 100644 --- a/scripts/onboarding/managed/enable-monitoring.sh +++ b/scripts/onboarding/managed/enable-monitoring.sh @@ -42,7 +42,7 @@ set -o pipefail defaultAzureCloud="AzureCloud" # released chart version in mcr -mcrChartVersion="2.7.8" +mcrChartVersion="2.7.9" mcr="mcr.microsoft.com" mcrChartRepoPath="azuremonitor/containerinsights/preview/azuremonitor-containers" helmLocalRepoName="." diff --git a/scripts/onboarding/managed/upgrade-monitoring.sh b/scripts/onboarding/managed/upgrade-monitoring.sh index 4134d710f..11ecf6819 100644 --- a/scripts/onboarding/managed/upgrade-monitoring.sh +++ b/scripts/onboarding/managed/upgrade-monitoring.sh @@ -20,7 +20,7 @@ set -e set -o pipefail # released chart version for Azure Arc enabled Kubernetes public preview -mcrChartVersion="2.7.8" +mcrChartVersion="2.7.9" mcr="mcr.microsoft.com" mcrChartRepoPath="azuremonitor/containerinsights/preview/azuremonitor-containers" From ca18850046fd54f7830bbe2addb51039928c3514 Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Mon, 9 Nov 2020 18:47:36 -0800 Subject: [PATCH 043/194] MDM exception aggregation (#470) --- source/plugins/ruby/constants.rb | 112 ++++++++++++++++--------------- source/plugins/ruby/out_mdm.rb | 51 ++++++++++++-- 2 files changed, 104 insertions(+), 59 deletions(-) diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index 0e5099c5e..079584c7b 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -1,61 +1,61 @@ # frozen_string_literal: true class Constants - INSIGHTSMETRICS_TAGS_ORIGIN = "container.azm.ms" - INSIGHTSMETRICS_TAGS_CLUSTERID = "container.azm.ms/clusterId" - INSIGHTSMETRICS_TAGS_CLUSTERNAME = "container.azm.ms/clusterName" - INSIGHTSMETRICS_TAGS_GPU_VENDOR = "gpuVendor" - INSIGHTSMETRICS_TAGS_GPU_NAMESPACE = "container.azm.ms/gpu" - INSIGHTSMETRICS_TAGS_GPU_MODEL = "gpuModel" - INSIGHTSMETRICS_TAGS_GPU_ID = "gpuId" - INSIGHTSMETRICS_TAGS_CONTAINER_NAME = "containerName" - INSIGHTSMETRICS_TAGS_CONTAINER_ID = "containerName" - INSIGHTSMETRICS_TAGS_K8SNAMESPACE = "k8sNamespace" - INSIGHTSMETRICS_TAGS_CONTROLLER_NAME = "controllerName" - INSIGHTSMETRICS_TAGS_CONTROLLER_KIND = "controllerKind" - INSIGHTSMETRICS_TAGS_POD_UID = "podUid" - INSIGTHTSMETRICS_TAGS_PV_NAMESPACE = "container.azm.ms/pv" - INSIGHTSMETRICS_TAGS_PVC_NAME = "pvcName" - INSIGHTSMETRICS_TAGS_PVC_NAMESPACE = "pvcNamespace" - INSIGHTSMETRICS_TAGS_POD_NAME = "podName" - INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES = "pvCapacityBytes" - INSIGHTSMETRICS_TAGS_VOLUME_NAME = "volumeName" - INSIGHTSMETRICS_FLUENT_TAG = "oms.api.InsightsMetrics" - REASON_OOM_KILLED = "oomkilled" - #Kubestate (common) - INSIGHTSMETRICS_TAGS_KUBESTATE_NAMESPACE = "container.azm.ms/kubestate" - INSIGHTSMETRICS_TAGS_KUBE_STATE_CREATIONTIME = "creationTime" - #Kubestate (deployments) - INSIGHTSMETRICS_METRIC_NAME_KUBE_STATE_DEPLOYMENT_STATE = "kube_deployment_status_replicas_ready" - INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_NAME = "deployment" - INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_CREATIONTIME = "creationTime" - INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_STRATEGY = "deploymentStrategy" - INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_SPEC_REPLICAS = "spec_replicas" - INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_STATUS_REPLICAS_UPDATED = "status_replicas_updated" - INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_STATUS_REPLICAS_AVAILABLE = 
"status_replicas_available" - #Kubestate (HPA) - INSIGHTSMETRICS_METRIC_NAME_KUBE_STATE_HPA_STATE = "kube_hpa_status_current_replicas" - INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_NAME = "hpa" - INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_MAX_REPLICAS = "spec_max_replicas" - INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_MIN_REPLICAS = "spec_min_replicas" - INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_SCALE_TARGET_KIND = "targetKind" - INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_SCALE_TARGET_NAME = "targetName" - INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_STATUS_DESIRED_REPLICAS = "status_desired_replicas" - - INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_STATUS_LAST_SCALE_TIME = "lastScaleTime" - # MDM Metric names - MDM_OOM_KILLED_CONTAINER_COUNT = "oomKilledContainerCount" - MDM_CONTAINER_RESTART_COUNT = "restartingContainerCount" - MDM_POD_READY_PERCENTAGE = "podReadyPercentage" - MDM_STALE_COMPLETED_JOB_COUNT = "completedJobsCount" - MDM_DISK_USED_PERCENTAGE = "diskUsedPercentage" - MDM_CONTAINER_CPU_UTILIZATION_METRIC = "cpuExceededPercentage" - MDM_CONTAINER_MEMORY_RSS_UTILIZATION_METRIC = "memoryRssExceededPercentage" - MDM_CONTAINER_MEMORY_WORKING_SET_UTILIZATION_METRIC = "memoryWorkingSetExceededPercentage" - MDM_PV_UTILIZATION_METRIC = "pvUsageExceededPercentage" - MDM_NODE_CPU_USAGE_PERCENTAGE = "cpuUsagePercentage" - MDM_NODE_MEMORY_RSS_PERCENTAGE = "memoryRssPercentage" - MDM_NODE_MEMORY_WORKING_SET_PERCENTAGE = "memoryWorkingSetPercentage" + INSIGHTSMETRICS_TAGS_ORIGIN = "container.azm.ms" + INSIGHTSMETRICS_TAGS_CLUSTERID = "container.azm.ms/clusterId" + INSIGHTSMETRICS_TAGS_CLUSTERNAME = "container.azm.ms/clusterName" + INSIGHTSMETRICS_TAGS_GPU_VENDOR = "gpuVendor" + INSIGHTSMETRICS_TAGS_GPU_NAMESPACE = "container.azm.ms/gpu" + INSIGHTSMETRICS_TAGS_GPU_MODEL = "gpuModel" + INSIGHTSMETRICS_TAGS_GPU_ID = "gpuId" + INSIGHTSMETRICS_TAGS_CONTAINER_NAME = "containerName" + INSIGHTSMETRICS_TAGS_CONTAINER_ID = "containerName" + INSIGHTSMETRICS_TAGS_K8SNAMESPACE = "k8sNamespace" + INSIGHTSMETRICS_TAGS_CONTROLLER_NAME = "controllerName" + INSIGHTSMETRICS_TAGS_CONTROLLER_KIND = "controllerKind" + INSIGHTSMETRICS_TAGS_POD_UID = "podUid" + INSIGTHTSMETRICS_TAGS_PV_NAMESPACE = "container.azm.ms/pv" + INSIGHTSMETRICS_TAGS_PVC_NAME = "pvcName" + INSIGHTSMETRICS_TAGS_PVC_NAMESPACE = "pvcNamespace" + INSIGHTSMETRICS_TAGS_POD_NAME = "podName" + INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES = "pvCapacityBytes" + INSIGHTSMETRICS_TAGS_VOLUME_NAME = "volumeName" + INSIGHTSMETRICS_FLUENT_TAG = "oms.api.InsightsMetrics" + REASON_OOM_KILLED = "oomkilled" + #Kubestate (common) + INSIGHTSMETRICS_TAGS_KUBESTATE_NAMESPACE = "container.azm.ms/kubestate" + INSIGHTSMETRICS_TAGS_KUBE_STATE_CREATIONTIME = "creationTime" + #Kubestate (deployments) + INSIGHTSMETRICS_METRIC_NAME_KUBE_STATE_DEPLOYMENT_STATE = "kube_deployment_status_replicas_ready" + INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_NAME = "deployment" + INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_CREATIONTIME = "creationTime" + INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_STRATEGY = "deploymentStrategy" + INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_SPEC_REPLICAS = "spec_replicas" + INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_STATUS_REPLICAS_UPDATED = "status_replicas_updated" + INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_STATUS_REPLICAS_AVAILABLE = "status_replicas_available" + #Kubestate (HPA) + INSIGHTSMETRICS_METRIC_NAME_KUBE_STATE_HPA_STATE = "kube_hpa_status_current_replicas" + INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_NAME = "hpa" + INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_MAX_REPLICAS = 
"spec_max_replicas" + INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_MIN_REPLICAS = "spec_min_replicas" + INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_SCALE_TARGET_KIND = "targetKind" + INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_SCALE_TARGET_NAME = "targetName" + INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_STATUS_DESIRED_REPLICAS = "status_desired_replicas" + + INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_STATUS_LAST_SCALE_TIME = "lastScaleTime" + # MDM Metric names + MDM_OOM_KILLED_CONTAINER_COUNT = "oomKilledContainerCount" + MDM_CONTAINER_RESTART_COUNT = "restartingContainerCount" + MDM_POD_READY_PERCENTAGE = "podReadyPercentage" + MDM_STALE_COMPLETED_JOB_COUNT = "completedJobsCount" + MDM_DISK_USED_PERCENTAGE = "diskUsedPercentage" + MDM_CONTAINER_CPU_UTILIZATION_METRIC = "cpuExceededPercentage" + MDM_CONTAINER_MEMORY_RSS_UTILIZATION_METRIC = "memoryRssExceededPercentage" + MDM_CONTAINER_MEMORY_WORKING_SET_UTILIZATION_METRIC = "memoryWorkingSetExceededPercentage" + MDM_PV_UTILIZATION_METRIC = "pvUsageExceededPercentage" + MDM_NODE_CPU_USAGE_PERCENTAGE = "cpuUsagePercentage" + MDM_NODE_MEMORY_RSS_PERCENTAGE = "memoryRssPercentage" + MDM_NODE_MEMORY_WORKING_SET_PERCENTAGE = "memoryWorkingSetPercentage" CONTAINER_TERMINATED_RECENTLY_IN_MINUTES = 5 OBJECT_NAME_K8S_CONTAINER = "K8SContainer" @@ -88,6 +88,8 @@ class Constants KUBE_STATE_TELEMETRY_FLUSH_INTERVAL_IN_MINUTES = 15 ZERO_FILL_METRICS_INTERVAL_IN_MINUTES = 30 MDM_TIME_SERIES_FLUSHED_IN_LAST_HOUR = "MdmTimeSeriesFlushedInLastHour" + MDM_EXCEPTION_TELEMETRY_METRIC = "AKSCustomMetricsMdmExceptions" + MDM_EXCEPTIONS_METRIC_FLUSH_INTERVAL = 30 #Pod Statuses POD_STATUS_TERMINATING = "Terminating" diff --git a/source/plugins/ruby/out_mdm.rb b/source/plugins/ruby/out_mdm.rb index 1c805255a..6238eb51a 100644 --- a/source/plugins/ruby/out_mdm.rb +++ b/source/plugins/ruby/out_mdm.rb @@ -50,6 +50,10 @@ def initialize @cluster_identity = nil @isArcK8sCluster = false @get_access_token_backoff_expiry = Time.now + + @mdm_exceptions_hash = {} + @mdm_exceptions_count = 0 + @mdm_exception_telemetry_time_tracker = DateTime.now.to_time.to_i end def configure(conf) @@ -221,10 +225,49 @@ def format(tag, time, record) end end + def exception_aggregator(error) + begin + errorStr = error.to_s + if (@mdm_exceptions_hash[errorStr].nil?) 
+ @mdm_exceptions_hash[errorStr] = 1 + else + @mdm_exceptions_hash[errorStr] += 1 + end + #Keeping track of all exceptions to send the total in the last flush interval as a metric + @mdm_exceptions_count += 1 + rescue => error + @log.info "Error in MDM exception_aggregator method: #{error}" + ApplicationInsightsUtility.sendExceptionTelemetry(error) + end + end + + def flush_mdm_exception_telemetry + begin + #Flush out exception telemetry as a metric for the last 30 minutes + timeDifference = (DateTime.now.to_time.to_i - @mdm_exception_telemetry_time_tracker).abs + timeDifferenceInMinutes = timeDifference / 60 + if (timeDifferenceInMinutes >= Constants::MDM_EXCEPTIONS_METRIC_FLUSH_INTERVAL) + telemetryProperties = {} + telemetryProperties["ExceptionsHashForFlushInterval"] = @mdm_exceptions_hash.to_json + telemetryProperties["FlushInterval"] = Constants::MDM_EXCEPTIONS_METRIC_FLUSH_INTERVAL + ApplicationInsightsUtility.sendMetricTelemetry(Constants::MDM_EXCEPTION_TELEMETRY_METRIC, @mdm_exceptions_count, telemetryProperties) + # Resetting values after flushing + @mdm_exceptions_count = 0 + @mdm_exceptions_hash = {} + @mdm_exception_telemetry_time_tracker = DateTime.now.to_time.to_i + end + rescue => error + @log.info "Error in flush_mdm_exception_telemetry method: #{error}" + ApplicationInsightsUtility.sendExceptionTelemetry(error) + end + end + # This method is called every flush interval. Send the buffer chunk to MDM. # 'chunk' is a buffer chunk that includes multiple formatted records def write(chunk) begin + # Adding this before trying to flush out metrics, since adding after can lead to metrics never being sent + flush_mdm_exception_telemetry if (!@first_post_attempt_made || (Time.now > @last_post_attempt_time + retry_mdm_post_wait_minutes * 60)) && @can_send_data_to_mdm post_body = [] chunk.msgpack_each { |(tag, record)| @@ -247,7 +290,8 @@ def write(chunk) end end rescue Exception => e - ApplicationInsightsUtility.sendExceptionTelemetry(e) + # Adding exceptions to hash to aggregate and send telemetry for all write errors + exception_aggregator(e) @log.info "Exception when writing to MDM: #{e}" raise e end @@ -282,7 +326,6 @@ def send_to_mdm(post_body) else @log.info "Failed to Post Metrics to MDM : #{e} Response: #{response}" end - #@log.info "MDM request : #{post_body}" @log.debug_backtrace(e.backtrace) if !response.code.empty? 
&& response.code == 403.to_s @log.info "Response Code #{response.code} Updating @last_post_attempt_time" @@ -297,15 +340,15 @@ def send_to_mdm(post_body) @log.info "HTTPServerException when POSTing Metrics to MDM #{e} Response: #{response}" raise e end + # Adding exceptions to hash to aggregate and send telemetry for all 400 error codes + exception_aggregator(e) rescue Errno::ETIMEDOUT => e @log.info "Timed out when POSTing Metrics to MDM : #{e} Response: #{response}" @log.debug_backtrace(e.backtrace) - ApplicationInsightsUtility.sendExceptionTelemetry(e) raise e rescue Exception => e @log.info "Exception POSTing Metrics to MDM : #{e} Response: #{response}" @log.debug_backtrace(e.backtrace) - ApplicationInsightsUtility.sendExceptionTelemetry(e) raise e end end From 18c27dda3e8af3187502f4ecfc9475dea74f3ce5 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Mon, 23 Nov 2020 08:37:38 -0800 Subject: [PATCH 044/194] grwehner/mdm custom metric regions (#471) Remove custom metrics region check for public cloud --- build/linux/installer/conf/container.conf | 2 -- build/linux/installer/conf/kube.conf | 3 --- .../templates/omsagent-rs-configmap.yaml | 3 --- kubernetes/linux/main.sh | 11 +++++++++++ kubernetes/omsagent.yaml | 3 --- kubernetes/windows/main.ps1 | 6 ++++++ .../preview/health/omsagent-template-aks-engine.yaml | 2 -- scripts/preview/health/omsagent-template.yaml | 2 -- source/plugins/ruby/CustomMetricsUtils.rb | 12 +++--------- source/plugins/ruby/filter_cadvisor2mdm.rb | 3 +-- source/plugins/ruby/filter_inventory2mdm.rb | 3 +-- source/plugins/ruby/filter_telegraf2mdm.rb | 3 +-- source/plugins/ruby/in_kube_podinventory.rb | 3 +-- source/plugins/ruby/podinventory_to_mdm.rb | 4 ++-- 14 files changed, 26 insertions(+), 34 deletions(-) diff --git a/build/linux/installer/conf/container.conf b/build/linux/installer/conf/container.conf index f7e6e1da9..958a85eb6 100644 --- a/build/linux/installer/conf/container.conf +++ b/build/linux/installer/conf/container.conf @@ -45,14 +45,12 @@ #custom_metrics_mdm filter plugin type filter_cadvisor2mdm - custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast,eastus2,westus,australiasoutheast,brazilsouth,germanywestcentral,northcentralus,switzerlandnorth metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,memoryRssBytes,pvUsedBytes log_level info type filter_telegraf2mdm - custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast,eastus2,westus,australiasoutheast,brazilsouth,germanywestcentral,northcentralus,switzerlandnorth log_level debug diff --git a/build/linux/installer/conf/kube.conf b/build/linux/installer/conf/kube.conf index dbb4db0da..121472eba 100644 --- a/build/linux/installer/conf/kube.conf +++ b/build/linux/installer/conf/kube.conf @@ -13,7 +13,6 @@ tag oms.containerinsights.KubePodInventory run_interval 60 log_level debug - custom_metrics_azure_regions 
eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast,eastus2,westus,australiasoutheast,brazilsouth,germanywestcentral,northcentralus,switzerlandnorth #Kubernetes events @@ -66,14 +65,12 @@ type filter_inventory2mdm - custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast,eastus2,westus,australiasoutheast,brazilsouth,germanywestcentral,northcentralus,switzerlandnorth log_level info #custom_metrics_mdm filter plugin for perf data from windows nodes type filter_cadvisor2mdm - custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast,eastus2,westus,australiasoutheast,brazilsouth,germanywestcentral,northcentralus,switzerlandnorth metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,pvUsedBytes log_level info diff --git a/charts/azuremonitor-containers/templates/omsagent-rs-configmap.yaml b/charts/azuremonitor-containers/templates/omsagent-rs-configmap.yaml index 475b17a46..e1bc969cb 100644 --- a/charts/azuremonitor-containers/templates/omsagent-rs-configmap.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-rs-configmap.yaml @@ -18,7 +18,6 @@ data: tag oms.containerinsights.KubePodInventory run_interval 60 log_level debug - custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast,eastus2,westus,australiasoutheast,brazilsouth,germanywestcentral,northcentralus,switzerlandnorth #Kubernetes events @@ -70,14 +69,12 @@ data: type filter_inventory2mdm - custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast,eastus2,westus,australiasoutheast,brazilsouth,germanywestcentral,northcentralus,switzerlandnorth log_level info # custom_metrics_mdm filter plugin for perf data from windows nodes type filter_cadvisor2mdm - custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast,eastus2,westus,australiasoutheast,brazilsouth,germanywestcentral,northcentralus,switzerlandnorth metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes log_level info diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index b093eb74b..a2ba6a1d1 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -150,6 +150,17 @@ else echo "LA Onboarding:Workspace Id not mounted, skipping the telemetry check" fi +# Set environment variable for if public cloud by checking the workspace domain. 
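+# domain is empty when no workspace is mounted, so the environment is marked "unknown";
+# opinsights.azure.com is the public-cloud Log Analytics domain, and any other domain is treated as a national cloud.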
+if [ -z $domain ]; then + CLOUD_ENVIRONMENT="unknown" +elif [ $domain == "opinsights.azure.com" ]; then + CLOUD_ENVIRONMENT="public" +else + CLOUD_ENVIRONMENT="national" +fi +export CLOUD_ENVIRONMENT=$CLOUD_ENVIRONMENT +echo "export CLOUD_ENVIRONMENT=$CLOUD_ENVIRONMENT" >> ~/.bashrc + #Parse the configmap to set the right environment variables. /opt/microsoft/omsagent/ruby/bin/ruby tomlparser.rb diff --git a/kubernetes/windows/main.ps1 b/kubernetes/windows/main.ps1 index 2e8659601..d32e5068a 100644 --- a/kubernetes/windows/main.ps1 +++ b/kubernetes/windows/main.ps1 @@ -43,15 +43,21 @@ function Start-FileSystemWatcher { function Set-EnvironmentVariables { $domain = "opinsights.azure.com" + $cloud_environment = "public" if (Test-Path /etc/omsagent-secret/DOMAIN) { # TODO: Change to omsagent-secret before merging $domain = Get-Content /etc/omsagent-secret/DOMAIN + $cloud_environment = "national" } # Set DOMAIN [System.Environment]::SetEnvironmentVariable("DOMAIN", $domain, "Process") [System.Environment]::SetEnvironmentVariable("DOMAIN", $domain, "Machine") + # Set CLOUD_ENVIRONMENT + [System.Environment]::SetEnvironmentVariable("CLOUD_ENVIRONMENT", $cloud_environment, "Process") + [System.Environment]::SetEnvironmentVariable("CLOUD_ENVIRONMENT", $cloud_environment, "Machine") + $wsID = "" if (Test-Path /etc/omsagent-secret/WSID) { # TODO: Change to omsagent-secret before merging diff --git a/scripts/preview/health/omsagent-template-aks-engine.yaml b/scripts/preview/health/omsagent-template-aks-engine.yaml index 5526602c0..5e063fd54 100644 --- a/scripts/preview/health/omsagent-template-aks-engine.yaml +++ b/scripts/preview/health/omsagent-template-aks-engine.yaml @@ -108,14 +108,12 @@ data: type filter_inventory2mdm - custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westEurope log_level info # custom_metrics_mdm filter plugin for perf data from windows nodes type filter_cadvisor2mdm -
custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westEurope metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes log_level info diff --git a/scripts/preview/health/omsagent-template.yaml b/scripts/preview/health/omsagent-template.yaml index 6e3a52020..e58e9c33f 100644 --- a/scripts/preview/health/omsagent-template.yaml +++ b/scripts/preview/health/omsagent-template.yaml @@ -108,14 +108,12 @@ data: type filter_inventory2mdm - custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westEurope log_level info # custom_metrics_mdm filter plugin for perf data from windows nodes type filter_cadvisor2mdm - custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westEurope metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes log_level info diff --git a/source/plugins/ruby/CustomMetricsUtils.rb b/source/plugins/ruby/CustomMetricsUtils.rb index a19580630..220313e6b 100644 --- a/source/plugins/ruby/CustomMetricsUtils.rb +++ b/source/plugins/ruby/CustomMetricsUtils.rb @@ -6,21 +6,15 @@ def initialize end class << self - def check_custom_metrics_availability(custom_metric_regions) + def check_custom_metrics_availability aks_region = ENV['AKS_REGION'] aks_resource_id = ENV['AKS_RESOURCE_ID'] + aks_cloud_environment = ENV['CLOUD_ENVIRONMENT'] if aks_region.to_s.empty? || aks_resource_id.to_s.empty? return false # This will also take care of AKS-Engine Scenario. AKS_REGION/AKS_RESOURCE_ID is not set for AKS-Engine. Only ACS_RESOURCE_NAME is set end - custom_metrics_regions_arr = custom_metric_regions.split(',') - custom_metrics_regions_hash = custom_metrics_regions_arr.map {|x| [x.downcase,true]}.to_h - - if custom_metrics_regions_hash.key?(aks_region.downcase) - true - else - false - end + return aks_cloud_environment.to_s.downcase == 'public' end end end \ No newline at end of file diff --git a/source/plugins/ruby/filter_cadvisor2mdm.rb b/source/plugins/ruby/filter_cadvisor2mdm.rb index 3bc674ea8..2423ad024 100644 --- a/source/plugins/ruby/filter_cadvisor2mdm.rb +++ b/source/plugins/ruby/filter_cadvisor2mdm.rb @@ -15,7 +15,6 @@ class CAdvisor2MdmFilter < Filter config_param :enable_log, :integer, :default => 0 config_param :log_path, :string, :default => "/var/opt/microsoft/docker-cimprov/log/filter_cadvisor2mdm.log" - config_param :custom_metrics_azure_regions, :string config_param :metrics_to_collect, :string, :default => "Constants::CPU_USAGE_NANO_CORES,Constants::MEMORY_WORKING_SET_BYTES,Constants::MEMORY_RSS_BYTES,Constants::PV_USED_BYTES" @@hostName = (OMS::Common.get_hostname) @@ -42,7 +41,7 @@ def configure(conf) def start super begin - @process_incoming_stream = CustomMetricsUtils.check_custom_metrics_availability(@custom_metrics_azure_regions) + @process_incoming_stream = CustomMetricsUtils.check_custom_metrics_availability @metrics_to_collect_hash = build_metrics_hash @log.debug "After check_custom_metrics_availability process_incoming_stream #{@process_incoming_stream}" @@containerResourceUtilTelemetryTimeTracker = DateTime.now.to_time.to_i diff --git a/source/plugins/ruby/filter_inventory2mdm.rb b/source/plugins/ruby/filter_inventory2mdm.rb index b5ef587ff..38ccab885 100644 --- a/source/plugins/ruby/filter_inventory2mdm.rb +++ b/source/plugins/ruby/filter_inventory2mdm.rb @@ -13,7 +13,6 @@ class Inventory2MdmFilter < Filter config_param :enable_log, :integer, :default => 0 config_param :log_path, :string, :default => 
'/var/opt/microsoft/docker-cimprov/log/filter_inventory2mdm.log' - config_param :custom_metrics_azure_regions, :string @@node_count_metric_name = 'nodesCount' @@pod_count_metric_name = 'podCount' @@ -98,7 +97,7 @@ def configure(conf) def start super - @process_incoming_stream = CustomMetricsUtils.check_custom_metrics_availability(@custom_metrics_azure_regions) + @process_incoming_stream = CustomMetricsUtils.check_custom_metrics_availability @log.debug "After check_custom_metrics_availability process_incoming_stream #{@process_incoming_stream}" end diff --git a/source/plugins/ruby/filter_telegraf2mdm.rb b/source/plugins/ruby/filter_telegraf2mdm.rb index 98d258ea5..88ae428d1 100644 --- a/source/plugins/ruby/filter_telegraf2mdm.rb +++ b/source/plugins/ruby/filter_telegraf2mdm.rb @@ -15,7 +15,6 @@ class Telegraf2MdmFilter < Filter config_param :enable_log, :integer, :default => 0 config_param :log_path, :string, :default => "/var/opt/microsoft/docker-cimprov/log/filter_telegraf2mdm.log" - config_param :custom_metrics_azure_regions, :string @process_incoming_stream = true @@ -36,7 +35,7 @@ def configure(conf) def start super begin - @process_incoming_stream = CustomMetricsUtils.check_custom_metrics_availability(@custom_metrics_azure_regions) + @process_incoming_stream = CustomMetricsUtils.check_custom_metrics_availability @log.debug "After check_custom_metrics_availability process_incoming_stream #{@process_incoming_stream}" rescue => errorStr @log.info "Error initializing plugin #{errorStr}" diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 4880d80e7..bba3e920f 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -36,11 +36,10 @@ def initialize config_param :run_interval, :time, :default => 60 config_param :tag, :string, :default => "oms.containerinsights.KubePodInventory" - config_param :custom_metrics_azure_regions, :string def configure(conf) super - @inventoryToMdmConvertor = Inventory2MdmConvertor.new(@custom_metrics_azure_regions) + @inventoryToMdmConvertor = Inventory2MdmConvertor.new() end def start diff --git a/source/plugins/ruby/podinventory_to_mdm.rb b/source/plugins/ruby/podinventory_to_mdm.rb index 834515969..77370e284 100644 --- a/source/plugins/ruby/podinventory_to_mdm.rb +++ b/source/plugins/ruby/podinventory_to_mdm.rb @@ -80,14 +80,14 @@ class Inventory2MdmConvertor @@pod_phase_values = ["Running", "Pending", "Succeeded", "Failed", "Unknown"] @process_incoming_stream = false - def initialize(custom_metrics_azure_regions) + def initialize() @log_path = "/var/opt/microsoft/docker-cimprov/log/mdm_metrics_generator.log" @log = Logger.new(@log_path, 1, 5000000) @pod_count_hash = {} @no_phase_dim_values_hash = {} @pod_count_by_phase = {} @pod_uids = {} - @process_incoming_stream = CustomMetricsUtils.check_custom_metrics_availability(custom_metrics_azure_regions) + @process_incoming_stream = CustomMetricsUtils.check_custom_metrics_availability @log.debug "After check_custom_metrics_availability process_incoming_stream #{@process_incoming_stream}" @log.debug { "Starting podinventory_to_mdm plugin" } end From a5c12e9a5e28dc27b8288d21bc72b5937b93e370 Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Thu, 3 Dec 2020 17:20:51 -0800 Subject: [PATCH 045/194] updating rs limit to 1gb (#474) --- charts/azuremonitor-containers/values.yaml | 2 +- kubernetes/omsagent.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git
a/charts/azuremonitor-containers/values.yaml b/charts/azuremonitor-containers/values.yaml index 76ea0a26d..e8acda20e 100644 --- a/charts/azuremonitor-containers/values.yaml +++ b/charts/azuremonitor-containers/values.yaml @@ -133,4 +133,4 @@ omsagent: memory: 250Mi limits: cpu: 1 - memory: 750Mi + memory: 1Gi diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 2155361e9..296de02bf 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -504,7 +504,7 @@ spec: resources: limits: cpu: 1 - memory: 750Mi + memory: 1Gi requests: cpu: 150m memory: 250Mi From 7453fd4e3d8a918a70683a5a3a8344bd550a5349 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Thu, 10 Dec 2020 10:45:09 -0800 Subject: [PATCH 046/194] grwehner/pv inventory (#455) Add fluentd plugin to request persistent volume info from the kubernetes api and send to LA --- build/linux/installer/conf/kube.conf | 23 ++ .../installer/datafiles/base_container.data | 1 + kubernetes/omsagent.yaml | 24 ++ source/plugins/ruby/constants.rb | 4 + source/plugins/ruby/in_kube_pvinventory.rb | 253 ++++++++++++++++++ 5 files changed, 305 insertions(+) create mode 100644 source/plugins/ruby/in_kube_pvinventory.rb diff --git a/build/linux/installer/conf/kube.conf b/build/linux/installer/conf/kube.conf index 121472eba..fb566c360 100644 --- a/build/linux/installer/conf/kube.conf +++ b/build/linux/installer/conf/kube.conf @@ -15,6 +15,14 @@ log_level debug + #Kubernetes Persistent Volume inventory + + type kubepvinventory + tag oms.containerinsights.KubePVInventory + run_interval 60 + log_level debug + + #Kubernetes events type kubeevents @@ -95,6 +103,21 @@ max_retry_wait 5m + + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 4m + buffer_type file + buffer_path %STATE_DIR_WS%/state/out_oms_kubepv*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 5s + max_retry_wait 5m + + type out_oms log_level debug diff --git a/build/linux/installer/datafiles/base_container.data b/build/linux/installer/datafiles/base_container.data index ca2538b79..ec42d5967 100644 --- a/build/linux/installer/datafiles/base_container.data +++ b/build/linux/installer/datafiles/base_container.data @@ -22,6 +22,7 @@ MAINTAINER: 'Microsoft Corporation' /opt/microsoft/omsagent/plugin/filter_container.rb; source/plugins/ruby/filter_container.rb; 644; root; root /opt/microsoft/omsagent/plugin/in_kube_podinventory.rb; source/plugins/ruby/in_kube_podinventory.rb; 644; root; root +/opt/microsoft/omsagent/plugin/in_kube_pvinventory.rb; source/plugins/ruby/in_kube_pvinventory.rb; 644; root; root /opt/microsoft/omsagent/plugin/in_kube_events.rb; source/plugins/ruby/in_kube_events.rb; 644; root; root /opt/microsoft/omsagent/plugin/KubernetesApiClient.rb; source/plugins/ruby/KubernetesApiClient.rb; 644; root; root diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 296de02bf..26c7ae9a0 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -21,6 +21,7 @@ rules: "nodes/proxy", "namespaces", "services", + "persistentvolumes" ] verbs: ["list", "get", "watch"] - apiGroups: ["apps", "extensions", "autoscaling"] @@ -66,6 +67,14 @@ data: log_level debug + #Kubernetes Persistent Volume inventory + + type kubepvinventory + tag oms.containerinsights.KubePVInventory + run_interval 60 + log_level debug + + #Kubernetes events type kubeevents @@ -146,6 +155,21 @@ data: max_retry_wait 5m + + type out_oms + log_level debug + num_threads 5 + 
buffer_chunk_limit 4m + buffer_type file + buffer_path %STATE_DIR_WS%/state/out_oms_kubepv*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 5s + max_retry_wait 5m + + type out_oms log_level debug diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index 079584c7b..cf41900dc 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -77,6 +77,9 @@ class Constants OMSAGENT_ZERO_FILL = "omsagent" KUBESYSTEM_NAMESPACE_ZERO_FILL = "kube-system" VOLUME_NAME_ZERO_FILL = "-" + PV_TYPES =["awsElasticBlockStore", "azureDisk", "azureFile", "cephfs", "cinder", "csi", "fc", "flexVolume", + "flocker", "gcePersistentDisk", "glusterfs", "hostPath", "iscsi", "local", "nfs", + "photonPersistentDisk", "portworxVolume", "quobyte", "rbd", "scaleIO", "storageos", "vsphereVolume"] #Telemetry constants CONTAINER_METRICS_HEART_BEAT_EVENT = "ContainerMetricsMdmHeartBeatEvent" @@ -84,6 +87,7 @@ class Constants CONTAINER_RESOURCE_UTIL_HEART_BEAT_EVENT = "ContainerResourceUtilMdmHeartBeatEvent" PV_USAGE_HEART_BEAT_EVENT = "PVUsageMdmHeartBeatEvent" PV_KUBE_SYSTEM_METRICS_ENABLED_EVENT = "CollectPVKubeSystemMetricsEnabled" + PV_INVENTORY_HEART_BEAT_EVENT = "KubePVInventoryHeartBeatEvent" TELEMETRY_FLUSH_INTERVAL_IN_MINUTES = 10 KUBE_STATE_TELEMETRY_FLUSH_INTERVAL_IN_MINUTES = 15 ZERO_FILL_METRICS_INTERVAL_IN_MINUTES = 30 diff --git a/source/plugins/ruby/in_kube_pvinventory.rb b/source/plugins/ruby/in_kube_pvinventory.rb new file mode 100644 index 000000000..b0e09c85b --- /dev/null +++ b/source/plugins/ruby/in_kube_pvinventory.rb @@ -0,0 +1,253 @@ +module Fluent + class Kube_PVInventory_Input < Input + Plugin.register_input("kubepvinventory", self) + + @@hostName = (OMS::Common.get_hostname) + + def initialize + super + require "yaml" + require "yajl/json_gem" + require "yajl" + require "time" + require_relative "KubernetesApiClient" + require_relative "ApplicationInsightsUtility" + require_relative "oms_common" + require_relative "omslog" + require_relative "constants" + + # Response size is around 1500 bytes per PV + @PV_CHUNK_SIZE = "5000" + @pvTypeToCountHash = {} + end + + config_param :run_interval, :time, :default => 60 + config_param :tag, :string, :default => "oms.containerinsights.KubePVInventory" + + def configure(conf) + super + end + + def start + if @run_interval + @finished = false + @condition = ConditionVariable.new + @mutex = Mutex.new + @thread = Thread.new(&method(:run_periodic)) + @@pvTelemetryTimeTracker = DateTime.now.to_time.to_i + end + end + + def shutdown + if @run_interval + @mutex.synchronize { + @finished = true + @condition.signal + } + @thread.join + end + end + + def enumerate + begin + pvInventory = nil + telemetryFlush = false + @pvTypeToCountHash = {} + currentTime = Time.now + batchTime = currentTime.utc.iso8601 + + continuationToken = nil + $log.info("in_kube_pvinventory::enumerate : Getting PVs from Kube API @ #{Time.now.utc.iso8601}") + continuationToken, pvInventory = KubernetesApiClient.getResourcesAndContinuationToken("persistentvolumes?limit=#{@PV_CHUNK_SIZE}") + $log.info("in_kube_pvinventory::enumerate : Done getting PVs from Kube API @ #{Time.now.utc.iso8601}") + + if (!pvInventory.nil? && !pvInventory.empty? && pvInventory.key?("items") && !pvInventory["items"].nil? && !pvInventory["items"].empty?) 
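+        # A minimal sketch of the paging contract assumed here: getResourcesAndContinuationToken
+        # returns a (token, parsed-response) pair, and the token is non-empty only while more
+        # pages remain. Illustrative shapes (example values, not from a live cluster):
+        #   continuationToken, pvInventory = KubernetesApiClient.getResourcesAndContinuationToken("persistentvolumes?limit=#{@PV_CHUNK_SIZE}")
+        #   pvInventory["items"]  # => array of up to @PV_CHUNK_SIZE PV objects
+        #   continuationToken     # => "eyJ2IjoibWV0YS5rOHMu..." while paging; nil/empty on the last page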
+ parse_and_emit_records(pvInventory, batchTime) + else + $log.warn "in_kube_pvinventory::enumerate:Received empty pvInventory" + end + + # If we receive a continuation token, make calls, process and flush data until we have processed all data + while (!continuationToken.nil? && !continuationToken.empty?) + continuationToken, pvInventory = KubernetesApiClient.getResourcesAndContinuationToken("persistentvolumes?limit=#{@PV_CHUNK_SIZE}&continue=#{continuationToken}") + if (!pvInventory.nil? && !pvInventory.empty? && pvInventory.key?("items") && !pvInventory["items"].nil? && !pvInventory["items"].empty?) + parse_and_emit_records(pvInventory, batchTime) + else + $log.warn "in_kube_pvinventory::enumerate:Received empty pvInventory" + end + end + + # Setting this to nil so that we don't hold memory until GC kicks in + pvInventory = nil + + # Adding telemetry to send PV telemetry every 10 minutes + timeDifference = (DateTime.now.to_time.to_i - @@pvTelemetryTimeTracker).abs + timeDifferenceInMinutes = timeDifference / 60 + if (timeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) + telemetryFlush = true + end + + # Flush AppInsights telemetry once all the processing is done + if telemetryFlush == true + telemetryProperties = {} + telemetryProperties["CountsOfPVTypes"] = @pvTypeToCountHash + ApplicationInsightsUtility.sendCustomEvent(Constants::PV_INVENTORY_HEART_BEAT_EVENT, telemetryProperties) + @@pvTelemetryTimeTracker = DateTime.now.to_time.to_i + end + + rescue => errorStr + $log.warn "in_kube_pvinventory::enumerate:Failed in enumerate: #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + end # end enumerate + + def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) + currentTime = Time.now + emitTime = currentTime.to_f + eventStream = MultiEventStream.new + + begin + records = [] + pvInventory["items"].each do |item| + + # Node, pod, & usage info can be found by joining with pvUsedBytes metric using PVCNamespace/PVCName + record = {} + record["CollectionTime"] = batchTime + record["ClusterId"] = KubernetesApiClient.getClusterId + record["ClusterName"] = KubernetesApiClient.getClusterName + record["PVName"] = item["metadata"]["name"] + record["PVStatus"] = item["status"]["phase"] + record["PVAccessModes"] = item["spec"]["accessModes"].join(', ') + record["PVStorageClassName"] = item["spec"]["storageClassName"] + record["PVCapacityBytes"] = KubernetesApiClient.getMetricNumericValue("memory", item["spec"]["capacity"]["storage"]) + record["PVCreationTimeStamp"] = item["metadata"]["creationTimestamp"] + + # Optional values + pvcNamespace, pvcName = getPVCInfo(item) + type, typeInfo = getTypeInfo(item) + record["PVCNamespace"] = pvcNamespace + record["PVCName"] = pvcName + record["PVType"] = type + record["PVTypeInfo"] = typeInfo + + records.push(record) + + # Record telemetry + if type == nil + type = "empty" + end + if (@pvTypeToCountHash.has_key? type) + @pvTypeToCountHash[type] += 1 + else + @pvTypeToCountHash[type] = 1 + end + end + + records.each do |record| + if !record.nil?
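+            # Illustrative shape of the wrapper emitted below (example values assumed;
+            # the real DataItems entry is the record built above):
+            #   {
+            #     "DataType"  => "KUBE_PV_INVENTORY_BLOB",
+            #     "IPName"    => "ContainerInsights",
+            #     "DataItems" => [{ "PVName" => "pvc-0b1c...", "PVType" => "azureDisk", "PVCapacityBytes" => 34359738368, ... }]
+            #   }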
+ wrapper = { + "DataType" => "KUBE_PV_INVENTORY_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [record.each { |k, v| record[k] = v }], + } + eventStream.add(emitTime, wrapper) if wrapper + end + end + + router.emit_stream(@tag, eventStream) if eventStream + + rescue => errorStr + $log.warn "Failed in parse_and_emit_record for in_kube_pvinventory: #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + end + + def getPVCInfo(item) + begin + if !item["spec"].nil? && !item["spec"]["claimRef"].nil? + claimRef = item["spec"]["claimRef"] + pvcNamespace = claimRef["namespace"] + pvcName = claimRef["name"] + return pvcNamespace, pvcName + end + rescue => errorStr + $log.warn "Failed in getPVCInfo for in_kube_pvinventory: #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + + # No PVC or an error + return nil, nil + end + + def getTypeInfo(item) + begin + if !item["spec"].nil? + (Constants::PV_TYPES).each do |pvType| + + # PV is this type + if !item["spec"][pvType].nil? + + # Get additional info if azure disk/file + typeInfo = {} + if pvType == "azureDisk" + azureDisk = item["spec"]["azureDisk"] + typeInfo["DiskName"] = azureDisk["diskName"] + typeInfo["DiskUri"] = azureDisk["diskURI"] + elsif pvType == "azureFile" + typeInfo["FileShareName"] = item["spec"]["azureFile"]["shareName"] + end + + # Can only have one type: return right away when found + return pvType, typeInfo + + end + end + end + rescue => errorStr + $log.warn "Failed in getTypeInfo for in_kube_pvinventory: #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + + # No matches from list of types or an error + return nil, {} + end + + + def run_periodic + @mutex.lock + done = @finished + @nextTimeToRun = Time.now + @waitTimeout = @run_interval + until done + @nextTimeToRun = @nextTimeToRun + @run_interval + @now = Time.now + if @nextTimeToRun <= @now + @waitTimeout = 1 + @nextTimeToRun = @now + else + @waitTimeout = @nextTimeToRun - @now + end + @condition.wait(@mutex, @waitTimeout) + done = @finished + @mutex.unlock + if !done + begin + $log.info("in_kube_pvinventory::run_periodic.enumerate.start #{Time.now.utc.iso8601}") + enumerate + $log.info("in_kube_pvinventory::run_periodic.enumerate.end #{Time.now.utc.iso8601}") + rescue => errorStr + $log.warn "in_kube_pvinventory::run_periodic: enumerate Failed to retrieve PV inventory: #{errorStr}" + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + end + @mutex.lock + end + @mutex.unlock + end + + end # Kube_PVInventory_Input end # module \ No newline at end of file From 24b709f9e3c3b18779102b491fc98b87a99d1335 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 15 Dec 2020 09:42:52 -0800 Subject: [PATCH 047/194] Gangams/fix for build release pipeline issue (#476) * use isolated cdpx acr * correct comment --- .pipelines/get-aad-app-creds-from-kv.sh | 14 ++++++++++++++ ...ll-from-cdpx-and-push-to-ci-acr-linux-image.sh | 15 ++++++++++++--- ...-from-cdpx-and-push-to-ci-acr-windows-image.sh | 14 +++++++++++--- 3 files changed, 37 insertions(+), 6 deletions(-) diff --git a/.pipelines/get-aad-app-creds-from-kv.sh b/.pipelines/get-aad-app-creds-from-kv.sh index 8ef56cddb..a0ba464cc 100755 --- a/.pipelines/get-aad-app-creds-from-kv.sh +++ b/.pipelines/get-aad-app-creds-from-kv.sh @@ -11,6 +11,8 @@ do KV) KV=$VALUE ;; KVSECRETNAMEAPPID)
AppId=$VALUE ;; KVSECRETNAMEAPPSECRET) AppSecret=$VALUE ;; + KVSECRETNAMECDPXAPPID) CdpxAppId=$VALUE ;; + KVSECRETNAMECDPXAPPSECRET) CdpxAppSecret=$VALUE ;; *) esac done @@ -27,4 +29,16 @@ az keyvault secret download --file ~/acrappsecret --vault-name ${KV} --name ${A echo "downloaded the appsecret from KV:${KV} and KV secret:${AppSecret}" +echo "key vault secret name for cdpx appid:${KVSECRETNAMECDPXAPPID}" + +echo "key vault secret name for cdpx appsecret:${KVSECRETNAMECDPXAPPSECRET}" + +az keyvault secret download --file ~/cdpxacrappid --vault-name ${KV} --name ${CdpxAppId} + +echo "downloaded the appid from KV:${KV} and KV secret:${CdpxAppId}" + +az keyvault secret download --file ~/cdpxacrappsecret --vault-name ${KV} --name ${CdpxAppSecret} + +echo "downloaded the appsecret from KV:${KV} and KV secret:${CdpxAppSecret}" + echo "end: get app id and secret from specified key vault" diff --git a/.pipelines/pull-from-cdpx-and-push-to-ci-acr-linux-image.sh b/.pipelines/pull-from-cdpx-and-push-to-ci-acr-linux-image.sh index 638d3a937..3844ea185 100755 --- a/.pipelines/pull-from-cdpx-and-push-to-ci-acr-linux-image.sh +++ b/.pipelines/pull-from-cdpx-and-push-to-ci-acr-linux-image.sh @@ -25,12 +25,21 @@ ACR_APP_ID=$(cat ~/acrappid) ACR_APP_SECRET=$(cat ~/acrappsecret) echo "end: read appid and appsecret" +echo "start: read appid and appsecret for cdpx" +CDPX_ACR_APP_ID=$(cat ~/cdpxacrappid) +CDPX_ACR_APP_SECRET=$(cat ~/cdpxacrappsecret) +echo "end: read appid and appsecret which have read access on the cdpx acr" + + +# Naming convention for CDPX_ACR: 'cdpx' + service tree id without '-' + a two digit suffix like '00'/'01'. +# Suffix 00 is the primary and 01 the secondary; we only use the primary. +# This is configured via a pipeline variable. echo "login to cdpxlinux acr:${CDPX_ACR}" -docker login $CDPX_ACR --username $ACR_APP_ID --password $ACR_APP_SECRET +docker login $CDPX_ACR --username $CDPX_ACR_APP_ID --password $CDPX_ACR_APP_SECRET echo "login to cdpxlinux acr completed: ${CDPX_ACR}" echo "pull agent image from cdpxlinux acr: ${CDPX_ACR}" -docker pull ${CDPX_ACR}/artifact/3170cdd2-19f0-4027-912b-1027311691a2/official/${CDPX_REPO_NAME}:${CDPX_AGENT_IMAGE_TAG} +docker pull ${CDPX_ACR}/official/${CDPX_REPO_NAME}:${CDPX_AGENT_IMAGE_TAG} echo "pull image from cdpxlinux acr completed: ${CDPX_ACR}" echo "CI Release name is:"$CI_RELEASE @@ -41,7 +50,7 @@ echo "CI ACR : ${CI_ACR}" echo "CI AGENT REPOSITORY NAME : ${CI_AGENT_REPO}" echo "tag linux agent image" -docker tag ${CDPX_ACR}/artifact/3170cdd2-19f0-4027-912b-1027311691a2/official/${CDPX_REPO_NAME}:${CDPX_AGENT_IMAGE_TAG} ${CI_ACR}/public/azuremonitor/containerinsights/${CI_AGENT_REPO}:${imagetag} +docker tag ${CDPX_ACR}/official/${CDPX_REPO_NAME}:${CDPX_AGENT_IMAGE_TAG} ${CI_ACR}/public/azuremonitor/containerinsights/${CI_AGENT_REPO}:${imagetag} echo "login ciprod acr":$CI_ACR docker login $CI_ACR --username $ACR_APP_ID --password $ACR_APP_SECRET diff --git a/.pipelines/pull-from-cdpx-and-push-to-ci-acr-windows-image.sh b/.pipelines/pull-from-cdpx-and-push-to-ci-acr-windows-image.sh index 066410af5..095a00039 100755 --- a/.pipelines/pull-from-cdpx-and-push-to-ci-acr-windows-image.sh +++ b/.pipelines/pull-from-cdpx-and-push-to-ci-acr-windows-image.sh @@ -25,12 +25,20 @@ ACR_APP_ID=$(cat ~/acrappid ) ACR_APP_SECRET=$(cat ~/acrappsecret) echo "end: read appid and appsecret" +echo "start: read appid and appsecret for cdpx" +CDPX_ACR_APP_ID=$(cat ~/cdpxacrappid) +CDPX_ACR_APP_SECRET=$(cat ~/cdpxacrappsecret) +echo "end: read appid and
appsecret which have read access on the cdpx acr" + +# Naming convention for CDPX_ACR: 'cdpx' + service tree id without '-' + a two digit suffix like '00'/'01'. +# Suffix 00 is the primary and 01 the secondary; we only use the primary. +# This is configured via a pipeline variable. echo "login to cdpxwindows acr:${CDPX_ACR}" -docker login $CDPX_ACR --username $ACR_APP_ID --password $ACR_APP_SECRET +docker login $CDPX_ACR --username $CDPX_ACR_APP_ID --password $CDPX_ACR_APP_SECRET echo "login to cdpxwindows acr:${CDPX_ACR} completed" echo "pull image from cdpxwin acr: ${CDPX_ACR}" -docker pull ${CDPX_ACR}/artifact/3170cdd2-19f0-4027-912b-1027311691a2/official/${CDPX_REPO_NAME}:${CDPX_AGENT_IMAGE_TAG} +docker pull ${CDPX_ACR}/official/${CDPX_REPO_NAME}:${CDPX_AGENT_IMAGE_TAG} echo "pull image from cdpxwin acr completed: ${CDPX_ACR}" echo "CI Release name:"$CI_RELEASE @@ -40,7 +48,7 @@ imagetag="win-"$CI_RELEASE$CI_IMAGE_TAG_SUFFIX echo "agentimagetag="$imagetag echo "tag windows agent image" -docker tag ${CDPX_ACR}/artifact/3170cdd2-19f0-4027-912b-1027311691a2/official/${CDPX_REPO_NAME}:${CDPX_AGENT_IMAGE_TAG} ${CI_ACR}/public/azuremonitor/containerinsights/${CI_AGENT_REPO}:${imagetag} +docker tag ${CDPX_ACR}/official/${CDPX_REPO_NAME}:${CDPX_AGENT_IMAGE_TAG} ${CI_ACR}/public/azuremonitor/containerinsights/${CI_AGENT_REPO}:${imagetag} echo "login to ${CI_ACR} acr" docker login $CI_ACR --username $ACR_APP_ID --password $ACR_APP_SECRET From 9061201be9b7578057479abf6e612a05ca412778 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Tue, 15 Dec 2020 12:26:25 -0800 Subject: [PATCH 048/194] add pv fluentd plugin config to helm rs config (#477) * add pv fluentd plugin to helm rs config * helm rbac permissions for pv api calls --- .../templates/omsagent-rbac.yaml | 2 +- .../templates/omsagent-rs-configmap.yaml | 23 +++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/charts/azuremonitor-containers/templates/omsagent-rbac.yaml b/charts/azuremonitor-containers/templates/omsagent-rbac.yaml index 4f7408e7c..bd4e9baf3 100644 --- a/charts/azuremonitor-containers/templates/omsagent-rbac.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-rbac.yaml @@ -19,7 +19,7 @@ metadata: heritage: {{ .Release.Service }} rules: - apiGroups: [""] - resources: ["pods", "events", "nodes", "nodes/stats", "nodes/metrics", "nodes/spec", "nodes/proxy", "namespaces", "services"] + resources: ["pods", "events", "nodes", "nodes/stats", "nodes/metrics", "nodes/spec", "nodes/proxy", "namespaces", "services", "persistentvolumes"] verbs: ["list", "get", "watch"] - apiGroups: ["apps", "extensions", "autoscaling"] resources: ["replicasets", "deployments", "horizontalpodautoscalers"] verbs: ["list", "get", "watch"] diff --git a/charts/azuremonitor-containers/templates/omsagent-rs-configmap.yaml b/charts/azuremonitor-containers/templates/omsagent-rs-configmap.yaml index e1bc969cb..baeedf1be 100644 --- a/charts/azuremonitor-containers/templates/omsagent-rs-configmap.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-rs-configmap.yaml @@ -20,6 +20,14 @@ data: log_level debug + #Kubernetes Persistent Volume inventory + + type kubepvinventory + tag oms.containerinsights.KubePVInventory + run_interval 60 + log_level debug + + #Kubernetes events type kubeevents @@ -99,6 +107,21 @@ data: max_retry_wait 5m + + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 4m + buffer_type file + buffer_path %STATE_DIR_WS%/state/out_oms_kubepv*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk +
flush_interval 20s + retry_limit 10 + retry_wait 5s + max_retry_wait 5m + + type out_oms log_level debug From 064bc068f70bacec13af02f6ab74180186a98356 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Wed, 16 Dec 2020 15:22:13 -0800 Subject: [PATCH 049/194] Gangams/fix rs ooming (#473) * optimize kpi * optimize kube node inventory * add flags for events, deployments and hpa * have separate function parseNodeLimits * refactor code * fix crash * fix bug with service name * fix bugs related to get service name * update oom fix test agent * debug logs * fix service label issue * update to latest agent and enable ephemeral annotation * change stream size to 200 from 250 * update yaml * adjust chunksizes * add ruby gc env * yaml changes for cioomtest11282020-3 * telemetry to track pods latency * service count telemetry * rename variables * wip * nodes inventory telemetry * configmap changes * add emit streams in configmap * yaml updates * fix copy and paste bug * add todo comments * fix node latency telemetry bug * update yaml with latest test image * fix bug * upping rs memory change * fix mdm bug with final emit stream * update to latest image * fix pr feedback * fix pr feedback * rename health config to agent config * fix max allowed hpa chunk size * update to use 1k pod chunk since validated on 1.18+ * remove debug logs * minor updates * move defaults to common place * chart updates * final oomfix agent * update to use prod image so that can be validated with build pipeline * fix typo in comment --- .../installer/datafiles/base_container.data | 2 +- .../scripts/tomlparser-agent-config.rb | 172 +++++ .../scripts/tomlparser-health-config.rb | 73 -- .../templates/omsagent-rs-configmap.yaml | 32 +- charts/azuremonitor-containers/values.yaml | 9 + kubernetes/linux/Dockerfile | 1 + kubernetes/linux/main.sh | 16 +- kubernetes/omsagent.yaml | 18 +- source/plugins/ruby/KubernetesApiClient.rb | 387 +++++----- source/plugins/ruby/in_kube_events.rb | 18 +- source/plugins/ruby/in_kube_nodes.rb | 410 ++++++---- source/plugins/ruby/in_kube_podinventory.rb | 717 ++++++++++-------- .../plugins/ruby/in_kubestate_deployments.rb | 424 ++++++----- source/plugins/ruby/in_kubestate_hpa.rb | 421 +++++----- 14 files changed, 1534 insertions(+), 1166 deletions(-) create mode 100644 build/linux/installer/scripts/tomlparser-agent-config.rb delete mode 100644 build/linux/installer/scripts/tomlparser-health-config.rb diff --git a/build/linux/installer/datafiles/base_container.data b/build/linux/installer/datafiles/base_container.data index ec42d5967..c680f0eea 100644 --- a/build/linux/installer/datafiles/base_container.data +++ b/build/linux/installer/datafiles/base_container.data @@ -123,7 +123,7 @@ MAINTAINER: 'Microsoft Corporation' /opt/tomlparser-mdm-metrics-config.rb; build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb; 755; root; root /opt/tomlparser-metric-collection-config.rb; build/linux/installer/scripts/tomlparser-metric-collection-config.rb; 755; root; root -/opt/tomlparser-health-config.rb; build/linux/installer/scripts/tomlparser-health-config.rb; 755; root; root +/opt/tomlparser-agent-config.rb; build/linux/installer/scripts/tomlparser-agent-config.rb; 755; root; root /opt/tomlparser.rb; build/common/installer/scripts/tomlparser.rb; 755; root; root /opt/td-agent-bit-conf-customizer.rb; build/common/installer/scripts/td-agent-bit-conf-customizer.rb; 755; root; root /opt/ConfigParseErrorLogger.rb; build/common/installer/scripts/ConfigParseErrorLogger.rb; 755; root; root diff --git 
a/build/linux/installer/scripts/tomlparser-agent-config.rb b/build/linux/installer/scripts/tomlparser-agent-config.rb new file mode 100644 index 000000000..87c5194ed --- /dev/null +++ b/build/linux/installer/scripts/tomlparser-agent-config.rb @@ -0,0 +1,172 @@ +#!/usr/local/bin/ruby + +#this should be require relative in Linux and require in windows, since it is a gem install on windows +@os_type = ENV["OS_TYPE"] +if !@os_type.nil? && !@os_type.empty? && @os_type.strip.casecmp("windows") == 0 + require "tomlrb" +else + require_relative "tomlrb" +end + +require_relative "ConfigParseErrorLogger" + +@configMapMountPath = "/etc/config/settings/agent-settings" +@configSchemaVersion = "" +@enable_health_model = false + +# 250 Node items (15KB per node) amount to approximately 4MB +@nodesChunkSize = 250 +# 1000 pods (10KB per pod) amount to approximately 10MB +@podsChunkSize = 1000 +# 4000 events (1KB per event) amount to approximately 4MB +@eventsChunkSize = 4000 +# roughly each deployment is 8k +# 500 deployments amount to approximately 4MB +@deploymentsChunkSize = 500 +# roughly each HPA is 3k +# 2000 HPAs amount to approximately 6-7MB +@hpaChunkSize = 2000 +# stream batch sizes to avoid large file writes +# too low a value will consume higher disk iops +@podsEmitStreamBatchSize = 200 +@nodesEmitStreamBatchSize = 100 + +# the higher the chunk size, the higher the rs pod memory consumption and the lower the api latency +# similarly, a lower value helps on memory consumption but incurs additional round trip latency +# these need to be tuned based on the workload +# nodes +@nodesChunkSizeMin = 100 +@nodesChunkSizeMax = 400 +# pods +@podsChunkSizeMin = 250 +@podsChunkSizeMax = 1500 +# events +@eventsChunkSizeMin = 2000 +@eventsChunkSizeMax = 10000 +# deployments +@deploymentsChunkSizeMin = 500 +@deploymentsChunkSizeMax = 1000 +# hpa +@hpaChunkSizeMin = 500 +@hpaChunkSizeMax = 2000 + +# emit stream sizes to prevent lower values, which cost disk i/o +# max will be up to the chunk size +@podsEmitStreamBatchSizeMin = 50 +@nodesEmitStreamBatchSizeMin = 50 + +def is_number?(value) + true if Integer(value) rescue false +end + +# Use parser to parse the configmap toml file to a ruby structure +def parseConfigMap + begin + # Check to see if config map is created + if (File.file?(@configMapMountPath)) + puts "config::configmap container-azm-ms-agentconfig for agent settings mounted, parsing values" + parsedConfig = Tomlrb.load_file(@configMapMountPath, symbolize_keys: true) + puts "config::Successfully parsed mounted config map" + return parsedConfig + else + puts "config::configmap container-azm-ms-agentconfig for agent settings not mounted, using defaults" + return nil + end + rescue => errorStr + ConfigParseErrorLogger.logError("Exception while parsing config map for agent settings : #{errorStr}, using defaults, please check config map for errors") + return nil + end +end + +# Use the ruby structure created after config parsing to set the right values to be used as environment variables +def populateSettingValuesFromConfigMap(parsedConfig) + begin + if !parsedConfig.nil? && !parsedConfig[:agent_settings].nil? + if !parsedConfig[:agent_settings][:health_model].nil? && !parsedConfig[:agent_settings][:health_model][:enabled].nil? + @enable_health_model = parsedConfig[:agent_settings][:health_model][:enabled] + puts "enable_health_model = #{@enable_health_model}" + end + chunk_config = parsedConfig[:agent_settings][:chunk_config] + if !chunk_config.nil?
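+      # A minimal sketch of the agent-settings TOML this block reads, assuming the
+      # [agent_settings.chunk_config] table implied by the lookups below (values illustrative):
+      #   [agent_settings.chunk_config]
+      #     NODES_CHUNK_SIZE = 200
+      #     PODS_CHUNK_SIZE = 1000
+      #     PODS_EMIT_STREAM_BATCH_SIZE = 200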
+ nodesChunkSize = chunk_config[:NODES_CHUNK_SIZE] + if !nodesChunkSize.nil? && is_number?(nodesChunkSize) && (@nodesChunkSizeMin..@nodesChunkSizeMax) === nodesChunkSize.to_i + @nodesChunkSize = nodesChunkSize.to_i + puts "Using config map value: NODES_CHUNK_SIZE = #{@nodesChunkSize}" + end + + podsChunkSize = chunk_config[:PODS_CHUNK_SIZE] + if !podsChunkSize.nil? && is_number?(podsChunkSize) && (@podsChunkSizeMin..@podsChunkSizeMax) === podsChunkSize.to_i + @podsChunkSize = podsChunkSize.to_i + puts "Using config map value: PODS_CHUNK_SIZE = #{@podsChunkSize}" + end + + eventsChunkSize = chunk_config[:EVENTS_CHUNK_SIZE] + if !eventsChunkSize.nil? && is_number?(eventsChunkSize) && (@eventsChunkSizeMin..@eventsChunkSizeMax) === eventsChunkSize.to_i + @eventsChunkSize = eventsChunkSize.to_i + puts "Using config map value: EVENTS_CHUNK_SIZE = #{@eventsChunkSize}" + end + + deploymentsChunkSize = chunk_config[:DEPLOYMENTS_CHUNK_SIZE] + if !deploymentsChunkSize.nil? && is_number?(deploymentsChunkSize) && (@deploymentsChunkSizeMin..@deploymentsChunkSizeMax) === deploymentsChunkSize.to_i + @deploymentsChunkSize = deploymentsChunkSize.to_i + puts "Using config map value: DEPLOYMENTS_CHUNK_SIZE = #{@deploymentsChunkSize}" + end + + hpaChunkSize = chunk_config[:HPA_CHUNK_SIZE] + if !hpaChunkSize.nil? && is_number?(hpaChunkSize) && (@hpaChunkSizeMin..@hpaChunkSizeMax) === hpaChunkSize.to_i + @hpaChunkSize = hpaChunkSize.to_i + puts "Using config map value: HPA_CHUNK_SIZE = #{@hpaChunkSize}" + end + + podsEmitStreamBatchSize = chunk_config[:PODS_EMIT_STREAM_BATCH_SIZE] + if !podsEmitStreamBatchSize.nil? && is_number?(podsEmitStreamBatchSize) && + podsEmitStreamBatchSize.to_i <= @podsChunkSize && podsEmitStreamBatchSize.to_i >= @podsEmitStreamBatchSizeMin + @podsEmitStreamBatchSize = podsEmitStreamBatchSize.to_i + puts "Using config map value: PODS_EMIT_STREAM_BATCH_SIZE = #{@podsEmitStreamBatchSize}" + end + nodesEmitStreamBatchSize = chunk_config[:NODES_EMIT_STREAM_BATCH_SIZE] + if !nodesEmitStreamBatchSize.nil? && is_number?(nodesEmitStreamBatchSize) && + nodesEmitStreamBatchSize.to_i <= @nodesChunkSize && nodesEmitStreamBatchSize.to_i >= @nodesEmitStreamBatchSizeMin + @nodesEmitStreamBatchSize = nodesEmitStreamBatchSize.to_i + puts "Using config map value: NODES_EMIT_STREAM_BATCH_SIZE = #{@nodesEmitStreamBatchSize}" + end + end + end + rescue => errorStr + puts "config::error:Exception while reading config settings for agent configuration setting - #{errorStr}, using defaults" + @enable_health_model = false + end +end + +@configSchemaVersion = ENV["AZMON_AGENT_CFG_SCHEMA_VERSION"] +puts "****************Start Config Processing********************" +if !@configSchemaVersion.nil? && !@configSchemaVersion.empty? && @configSchemaVersion.strip.casecmp("v1") == 0 #note v1 is the only supported schema version , so hardcoding it + configMapSettings = parseConfigMap + if !configMapSettings.nil? + populateSettingValuesFromConfigMap(configMapSettings) + end +else + if (File.file?(@configMapMountPath)) + ConfigParseErrorLogger.logError("config::unsupported/missing config schema version - '#{@configSchemaVersion}' , using defaults, please use supported schema version") + end + @enable_health_model = false +end + +# Write the settings to file, so that they can be set as environment variables +file = File.open("agent_config_env_var", "w") + +if !file.nil? 
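+  # With the defaults above, the generated agent_config_env_var file would contain
+  # lines like the following (illustrative of the writes below):
+  #   export AZMON_CLUSTER_ENABLE_HEALTH_MODEL=false
+  #   export PODS_CHUNK_SIZE=1000
+  #   export PODS_EMIT_STREAM_BATCH_SIZE=200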
+ file.write("export AZMON_CLUSTER_ENABLE_HEALTH_MODEL=#{@enable_health_model}\n") + file.write("export NODES_CHUNK_SIZE=#{@nodesChunkSize}\n") + file.write("export PODS_CHUNK_SIZE=#{@podsChunkSize}\n") + file.write("export EVENTS_CHUNK_SIZE=#{@eventsChunkSize}\n") + file.write("export DEPLOYMENTS_CHUNK_SIZE=#{@deploymentsChunkSize}\n") + file.write("export HPA_CHUNK_SIZE=#{@hpaChunkSize}\n") + file.write("export PODS_EMIT_STREAM_BATCH_SIZE=#{@podsEmitStreamBatchSize}\n") + file.write("export NODES_EMIT_STREAM_BATCH_SIZE=#{@nodesEmitStreamBatchSize}\n") + # Close file after writing all environment variables + file.close +else + puts "Exception while opening file for writing config environment variables" + puts "****************End Config Processing********************" +end diff --git a/build/linux/installer/scripts/tomlparser-health-config.rb b/build/linux/installer/scripts/tomlparser-health-config.rb deleted file mode 100644 index 14c8bdb44..000000000 --- a/build/linux/installer/scripts/tomlparser-health-config.rb +++ /dev/null @@ -1,73 +0,0 @@ -#!/usr/local/bin/ruby - -#this should be require relative in Linux and require in windows, since it is a gem install on windows -@os_type = ENV["OS_TYPE"] -if !@os_type.nil? && !@os_type.empty? && @os_type.strip.casecmp("windows") == 0 - require "tomlrb" -else - require_relative "tomlrb" -end - -require_relative "ConfigParseErrorLogger" - -@configMapMountPath = "/etc/config/settings/agent-settings" -@configSchemaVersion = "" -@enable_health_model = false - -# Use parser to parse the configmap toml file to a ruby structure -def parseConfigMap - begin - # Check to see if config map is created - if (File.file?(@configMapMountPath)) - puts "config::configmap container-azm-ms-agentconfig for agent health settings mounted, parsing values" - parsedConfig = Tomlrb.load_file(@configMapMountPath, symbolize_keys: true) - puts "config::Successfully parsed mounted config map" - return parsedConfig - else - puts "config::configmap container-azm-ms-agentconfig for agent health settings not mounted, using defaults" - return nil - end - rescue => errorStr - ConfigParseErrorLogger.logError("Exception while parsing config map for enabling health: #{errorStr}, using defaults, please check config map for errors") - return nil - end -end - -# Use the ruby structure created after config parsing to set the right values to be used as environment variables -def populateSettingValuesFromConfigMap(parsedConfig) - begin - if !parsedConfig.nil? && !parsedConfig[:agent_settings].nil? && !parsedConfig[:agent_settings][:health_model].nil? && !parsedConfig[:agent_settings][:health_model][:enabled].nil? - @enable_health_model = parsedConfig[:agent_settings][:health_model][:enabled] - puts "enable_health_model = #{@enable_health_model}" - end - rescue => errorStr - puts "config::error:Exception while reading config settings for health_model enabled setting - #{errorStr}, using defaults" - @enable_health_model = false - end -end - -@configSchemaVersion = ENV["AZMON_AGENT_CFG_SCHEMA_VERSION"] -puts "****************Start Config Processing********************" -if !@configSchemaVersion.nil? && !@configSchemaVersion.empty? && @configSchemaVersion.strip.casecmp("v1") == 0 #note v1 is the only supported schema version , so hardcoding it - configMapSettings = parseConfigMap - if !configMapSettings.nil? 
- populateSettingValuesFromConfigMap(configMapSettings) - end -else - if (File.file?(@configMapMountPath)) - ConfigParseErrorLogger.logError("config::unsupported/missing config schema version - '#{@configSchemaVersion}' , using defaults, please use supported schema version") - end - @enable_health_model = false -end - -# Write the settings to file, so that they can be set as environment variables -file = File.open("health_config_env_var", "w") - -if !file.nil? - file.write("export AZMON_CLUSTER_ENABLE_HEALTH_MODEL=#{@enable_health_model}\n") - # Close file after writing all environment variables - file.close -else - puts "Exception while opening file for writing config environment variables" - puts "****************End Config Processing********************" -end \ No newline at end of file diff --git a/charts/azuremonitor-containers/templates/omsagent-rs-configmap.yaml b/charts/azuremonitor-containers/templates/omsagent-rs-configmap.yaml index baeedf1be..fc7c471f8 100644 --- a/charts/azuremonitor-containers/templates/omsagent-rs-configmap.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-rs-configmap.yaml @@ -95,7 +95,7 @@ data: type out_oms log_level debug - num_threads 5 + num_threads 2 buffer_chunk_limit 4m buffer_type file buffer_path %STATE_DIR_WS%/out_oms_kubepods*.buffer @@ -108,24 +108,24 @@ data: - type out_oms - log_level debug - num_threads 5 - buffer_chunk_limit 4m - buffer_type file - buffer_path %STATE_DIR_WS%/state/out_oms_kubepv*.buffer - buffer_queue_limit 20 - buffer_queue_full_action drop_oldest_chunk - flush_interval 20s - retry_limit 10 - retry_wait 5s - max_retry_wait 5m + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 4m + buffer_type file + buffer_path %STATE_DIR_WS%/state/out_oms_kubepv*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 5s + max_retry_wait 5m type out_oms log_level debug - num_threads 5 + num_threads 2 buffer_chunk_limit 4m buffer_type file buffer_path %STATE_DIR_WS%/out_oms_kubeevents*.buffer @@ -155,7 +155,7 @@ data: type out_oms log_level debug - num_threads 5 + num_threads 2 buffer_chunk_limit 4m buffer_type file buffer_path %STATE_DIR_WS%/state/out_oms_kubenodes*.buffer @@ -184,7 +184,7 @@ data: type out_oms log_level debug - num_threads 5 + num_threads 2 buffer_chunk_limit 4m buffer_type file buffer_path %STATE_DIR_WS%/out_oms_kubeperf*.buffer diff --git a/charts/azuremonitor-containers/values.yaml b/charts/azuremonitor-containers/values.yaml index e8acda20e..907e315d1 100644 --- a/charts/azuremonitor-containers/values.yaml +++ b/charts/azuremonitor-containers/values.yaml @@ -81,6 +81,15 @@ omsagent: deployment: affinity: nodeAffinity: + # affinity to schedule on to ephemeral os node if its available + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 1 + preference: + matchExpressions: + - key: storageprofile + operator: NotIn + values: + - managed requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: - labelSelector: diff --git a/kubernetes/linux/Dockerfile b/kubernetes/linux/Dockerfile index d04e86128..34ab133da 100644 --- a/kubernetes/linux/Dockerfile +++ b/kubernetes/linux/Dockerfile @@ -15,6 +15,7 @@ ENV HOST_VAR /hostfs/var ENV AZMON_COLLECT_ENV False ENV KUBE_CLIENT_BACKOFF_BASE 1 ENV KUBE_CLIENT_BACKOFF_DURATION 0 +ENV RUBY_GC_HEAP_OLDOBJECT_LIMIT_FACTOR 0.9 RUN /usr/bin/apt-get update && /usr/bin/apt-get install -y libc-bin wget openssl curl sudo python-ctypes init-system-helpers net-tools rsyslog cron 
vim dmidecode apt-transport-https gnupg && rm -rf /var/lib/apt/lists/* COPY setup.sh main.sh defaultpromenvvariables defaultpromenvvariables-rs mdsd.xml envmdsd $tmpdir/ WORKDIR ${tmpdir} diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index a2ba6a1d1..ed16d3e32 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -171,14 +171,14 @@ done source config_env_var -#Parse the configmap to set the right environment variables for health feature. -/opt/microsoft/omsagent/ruby/bin/ruby tomlparser-health-config.rb +#Parse the configmap to set the right environment variables for agent config. +/opt/microsoft/omsagent/ruby/bin/ruby tomlparser-agent-config.rb -cat health_config_env_var | while read line; do +cat agent_config_env_var | while read line; do #echo $line echo $line >> ~/.bashrc done -source health_config_env_var +source agent_config_env_var #Parse the configmap to set the right environment variables for network policy manager (npm) integration. /opt/microsoft/omsagent/ruby/bin/ruby tomlparser-npm-config.rb @@ -429,7 +429,7 @@ echo "export DOCKER_CIMPROV_VERSION=$DOCKER_CIMPROV_VERSION" >> ~/.bashrc #region check to auto-activate oneagent, to route container logs, #Intent is to activate one agent routing for all managed clusters with region in the regionllist, unless overridden by configmap -# AZMON_CONTAINER_LOGS_ROUTE will have route (if any) specified in the config map +# AZMON_CONTAINER_LOGS_ROUTE will have route (if any) specified in the config map # AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE will have the final route that we compute & set, based on our region list logic echo "************start oneagent log routing checks************" # by default, use configmap route for safer side @@ -462,9 +462,9 @@ else echo "current region is not in oneagent regions..." fi -if [ "$isoneagentregion" = true ]; then +if [ "$isoneagentregion" = true ]; then #if configmap has a routing for logs, but current region is in the oneagent region list, take the configmap route - if [ ! -z $AZMON_CONTAINER_LOGS_ROUTE ]; then + if [ ! -z $AZMON_CONTAINER_LOGS_ROUTE ]; then AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE=$AZMON_CONTAINER_LOGS_ROUTE echo "oneagent region is true for current region:$currentregion and config map logs route is not empty. so using config map logs route as effective route:$AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE" else #there is no configmap route, so route thru oneagent @@ -511,7 +511,7 @@ if [ ! -e "/etc/config/kube.conf" ]; then echo "starting mdsd ..." 
mdsd -l -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos & - + touch /opt/AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE_V2 fi fi diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 26c7ae9a0..013e2a6c0 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -143,7 +143,7 @@ data: type out_oms log_level debug - num_threads 5 + num_threads 2 buffer_chunk_limit 4m buffer_type file buffer_path %STATE_DIR_WS%/out_oms_kubepods*.buffer @@ -173,7 +173,7 @@ data: type out_oms log_level debug - num_threads 5 + num_threads 2 buffer_chunk_limit 4m buffer_type file buffer_path %STATE_DIR_WS%/out_oms_kubeevents*.buffer @@ -203,7 +203,7 @@ data: type out_oms log_level debug - num_threads 5 + num_threads 2 buffer_chunk_limit 4m buffer_type file buffer_path %STATE_DIR_WS%/state/out_oms_kubenodes*.buffer @@ -232,7 +232,7 @@ data: type out_oms log_level debug - num_threads 5 + num_threads 2 buffer_chunk_limit 4m buffer_type file buffer_path %STATE_DIR_WS%/out_oms_kubeperf*.buffer @@ -533,7 +533,6 @@ spec: cpu: 150m memory: 250Mi env: - # azure devops pipeline uses AKS_RESOURCE_ID and AKS_REGION hence ensure to uncomment these - name: AKS_RESOURCE_ID value: "VALUE_AKS_RESOURCE_ID_VALUE" - name: AKS_REGION @@ -588,6 +587,15 @@ spec: periodSeconds: 60 affinity: nodeAffinity: + # affinity to schedule on to ephemeral os node if its available + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 1 + preference: + matchExpressions: + - key: storageprofile + operator: NotIn + values: + - managed requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: - labelSelector: diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index 073eb0417..aca2142a0 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -172,6 +172,10 @@ def isAROV3Cluster return @@IsAROV3Cluster end + def isAROv3MasterOrInfraPod(nodeName) + return isAROV3Cluster() && (!nodeName.nil? && (nodeName.downcase.start_with?("infra-") || nodeName.downcase.start_with?("master-"))) + end + def isNodeMaster return @@IsNodeMaster if !@@IsNodeMaster.nil? @@IsNodeMaster = false @@ -276,7 +280,8 @@ def getPods(namespace) def getWindowsNodes winNodes = [] begin - resourceUri = getNodesResourceUri("nodes") + # get only windows nodes + resourceUri = getNodesResourceUri("nodes?labelSelector=kubernetes.io%2Fos%3Dwindows") nodeInventory = JSON.parse(getKubeResourceInfo(resourceUri).body) @Log.info "KubernetesAPIClient::getWindowsNodes : Got nodes from kube api" # Resetting the windows node cache @@ -396,42 +401,67 @@ def getPodUid(podNameSpace, podMetadata) return podUid end - def getContainerResourceRequestsAndLimits(metricJSON, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601) + def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601) metricItems = [] begin clusterId = getClusterId - metricInfo = metricJSON - metricInfo["items"].each do |pod| - podNameSpace = pod["metadata"]["namespace"] - podUid = getPodUid(podNameSpace, pod["metadata"]) - if podUid.nil? - next - end - - # For ARO, skip the pods scheduled on to master or infra nodes to ingest - if isAROV3Cluster() && !pod["spec"].nil? && !pod["spec"]["nodeName"].nil? 
&& - (pod["spec"]["nodeName"].downcase.start_with?("infra-") || - pod["spec"]["nodeName"].downcase.start_with?("master-")) - next - end + podNameSpace = pod["metadata"]["namespace"] + podUid = getPodUid(podNameSpace, pod["metadata"]) + if podUid.nil? + return metricItems + end - podContainers = [] - if !pod["spec"]["containers"].nil? && !pod["spec"]["containers"].empty? - podContainers = podContainers + pod["spec"]["containers"] - end - # Adding init containers to the record list as well. - if !pod["spec"]["initContainers"].nil? && !pod["spec"]["initContainers"].empty? - podContainers = podContainers + pod["spec"]["initContainers"] - end + nodeName = "" + #for unscheduled (non-started) pods nodeName does NOT exist + if !pod["spec"]["nodeName"].nil? + nodeName = pod["spec"]["nodeName"] + end + # For ARO, skip the pods scheduled on to master or infra nodes to ingest + if isAROv3MasterOrInfraPod(nodeName) + return metricItems + end - if (!podContainers.nil? && !podContainers.empty? && !pod["spec"]["nodeName"].nil?) - nodeName = pod["spec"]["nodeName"] - podContainers.each do |container| - containerName = container["name"] - #metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z - if (!container["resources"].nil? && !container["resources"].empty? && !container["resources"][metricCategory].nil? && !container["resources"][metricCategory][metricNameToCollect].nil?) - metricValue = getMetricNumericValue(metricNameToCollect, container["resources"][metricCategory][metricNameToCollect]) + podContainers = [] + if !pod["spec"]["containers"].nil? && !pod["spec"]["containers"].empty? + podContainers = podContainers + pod["spec"]["containers"] + end + # Adding init containers to the record list as well. + if !pod["spec"]["initContainers"].nil? && !pod["spec"]["initContainers"].empty? + podContainers = podContainers + pod["spec"]["initContainers"] + end + if (!podContainers.nil? && !podContainers.empty? && !pod["spec"]["nodeName"].nil?) + podContainers.each do |container| + containerName = container["name"] + #metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z + if (!container["resources"].nil? && !container["resources"].empty? && !container["resources"][metricCategory].nil? && !container["resources"][metricCategory][metricNameToCollect].nil?) 
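+            # Illustrative pod spec fragment this condition matches, e.g. for
+            # metricCategory = "limits" and metricNameToCollect = "cpu" (values are examples):
+            #   container["resources"] => { "requests" => { "cpu" => "100m", "memory" => "250Mi" },
+            #                               "limits"   => { "cpu" => "1",    "memory" => "1Gi" } }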
+ metricValue = getMetricNumericValue(metricNameToCollect, container["resources"][metricCategory][metricNameToCollect]) + + metricItem = {} + metricItem["DataItems"] = [] + + metricProps = {} + metricProps["Timestamp"] = metricTime + metricProps["Host"] = nodeName + # Adding this so that it is not set by base omsagent since it was not set earlier and being set by base omsagent + metricProps["Computer"] = nodeName + metricProps["ObjectName"] = "K8SContainer" + metricProps["InstanceName"] = clusterId + "/" + podUid + "/" + containerName + + metricProps["Collections"] = [] + metricCollections = {} + metricCollections["CounterName"] = metricNametoReturn + metricCollections["Value"] = metricValue + + metricProps["Collections"].push(metricCollections) + metricItem["DataItems"].push(metricProps) + metricItems.push(metricItem) + #No container level limit for the given metric, so default to node level limit + else + nodeMetricsHashKey = clusterId + "/" + nodeName + "_" + "allocatable" + "_" + metricNameToCollect + if (metricCategory == "limits" && @@NodeMetrics.has_key?(nodeMetricsHashKey)) + metricValue = @@NodeMetrics[nodeMetricsHashKey] + #@Log.info("Limits not set for container #{clusterId + "/" + podUid + "/" + containerName} using node level limits: #{nodeMetricsHashKey}=#{metricValue} ") metricItem = {} metricItem["DataItems"] = [] @@ -451,32 +481,6 @@ def getContainerResourceRequestsAndLimits(metricJSON, metricCategory, metricName metricProps["Collections"].push(metricCollections) metricItem["DataItems"].push(metricProps) metricItems.push(metricItem) - #No container level limit for the given metric, so default to node level limit - else - nodeMetricsHashKey = clusterId + "/" + nodeName + "_" + "allocatable" + "_" + metricNameToCollect - if (metricCategory == "limits" && @@NodeMetrics.has_key?(nodeMetricsHashKey)) - metricValue = @@NodeMetrics[nodeMetricsHashKey] - #@Log.info("Limits not set for container #{clusterId + "/" + podUid + "/" + containerName} using node level limits: #{nodeMetricsHashKey}=#{metricValue} ") - metricItem = {} - metricItem["DataItems"] = [] - - metricProps = {} - metricProps["Timestamp"] = metricTime - metricProps["Host"] = nodeName - # Adding this so that it is not set by base omsagent since it was not set earlier and being set by base omsagent - metricProps["Computer"] = nodeName - metricProps["ObjectName"] = "K8SContainer" - metricProps["InstanceName"] = clusterId + "/" + podUid + "/" + containerName - - metricProps["Collections"] = [] - metricCollections = {} - metricCollections["CounterName"] = metricNametoReturn - metricCollections["Value"] = metricValue - - metricProps["Collections"].push(metricCollections) - metricItem["DataItems"].push(metricProps) - metricItems.push(metricItem) - end end end end @@ -488,78 +492,74 @@ def getContainerResourceRequestsAndLimits(metricJSON, metricCategory, metricName return metricItems end #getContainerResourceRequestAndLimits - def getContainerResourceRequestsAndLimitsAsInsightsMetrics(metricJSON, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601) + def getContainerResourceRequestsAndLimitsAsInsightsMetrics(pod, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601) metricItems = [] begin clusterId = getClusterId clusterName = getClusterName - - metricInfo = metricJSON - metricInfo["items"].each do |pod| - podNameSpace = pod["metadata"]["namespace"] - if podNameSpace.eql?("kube-system") && !pod["metadata"].key?("ownerReferences") - # The above case 
seems to be the only case where you have horizontal scaling of pods - # but no controller, in which case cAdvisor picks up kubernetes.io/config.hash - # instead of the actual poduid. Since this uid is not being surface into the UX - # its ok to use this. - # Use kubernetes.io/config.hash to be able to correlate with cadvisor data - if pod["metadata"]["annotations"].nil? - next - else - podUid = pod["metadata"]["annotations"]["kubernetes.io/config.hash"] - end + podNameSpace = pod["metadata"]["namespace"] + if podNameSpace.eql?("kube-system") && !pod["metadata"].key?("ownerReferences") + # The above case seems to be the only case where you have horizontal scaling of pods + # but no controller, in which case cAdvisor picks up kubernetes.io/config.hash + # instead of the actual poduid. Since this uid is not being surface into the UX + # its ok to use this. + # Use kubernetes.io/config.hash to be able to correlate with cadvisor data + if pod["metadata"]["annotations"].nil? + return metricItems else - podUid = pod["metadata"]["uid"] + podUid = pod["metadata"]["annotations"]["kubernetes.io/config.hash"] end + else + podUid = pod["metadata"]["uid"] + end - podContainers = [] - if !pod["spec"]["containers"].nil? && !pod["spec"]["containers"].empty? - podContainers = podContainers + pod["spec"]["containers"] - end - # Adding init containers to the record list as well. - if !pod["spec"]["initContainers"].nil? && !pod["spec"]["initContainers"].empty? - podContainers = podContainers + pod["spec"]["initContainers"] - end + podContainers = [] + if !pod["spec"]["containers"].nil? && !pod["spec"]["containers"].empty? + podContainers = podContainers + pod["spec"]["containers"] + end + # Adding init containers to the record list as well. + if !pod["spec"]["initContainers"].nil? && !pod["spec"]["initContainers"].empty? + podContainers = podContainers + pod["spec"]["initContainers"] + end - if (!podContainers.nil? && !podContainers.empty?) - if (!pod["spec"]["nodeName"].nil?) - nodeName = pod["spec"]["nodeName"] + if (!podContainers.nil? && !podContainers.empty?) + if (!pod["spec"]["nodeName"].nil?) + nodeName = pod["spec"]["nodeName"] + else + nodeName = "" #unscheduled pod. We still want to collect limits & requests for GPU + end + podContainers.each do |container| + metricValue = nil + containerName = container["name"] + #metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z + if (!container["resources"].nil? && !container["resources"].empty? && !container["resources"][metricCategory].nil? && !container["resources"][metricCategory][metricNameToCollect].nil?) + metricValue = getMetricNumericValue(metricNameToCollect, container["resources"][metricCategory][metricNameToCollect]) else - nodeName = "" #unscheduled pod. We still want to collect limits & requests for GPU - end - podContainers.each do |container| - metricValue = nil - containerName = container["name"] - #metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z - if (!container["resources"].nil? && !container["resources"].empty? && !container["resources"][metricCategory].nil? && !container["resources"][metricCategory][metricNameToCollect].nil?) 
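+            # Same lookup for GPU metrics: a pod requesting one GPU would carry, illustratively,
+            # container["resources"]["limits"]["nvidia.com/gpu"] => "1". Note the else branch
+            # below deliberately skips the node-level fallback for the gpu metric names.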
- metricValue = getMetricNumericValue(metricNameToCollect, container["resources"][metricCategory][metricNameToCollect]) - else - #No container level limit for the given metric, so default to node level limit for non-gpu metrics - if (metricNameToCollect.downcase != "nvidia.com/gpu") && (metricNameToCollect.downcase != "amd.com/gpu") - nodeMetricsHashKey = clusterId + "/" + nodeName + "_" + "allocatable" + "_" + metricNameToCollect - metricValue = @@NodeMetrics[nodeMetricsHashKey] - end - end - if (!metricValue.nil?) - metricItem = {} - metricItem["CollectionTime"] = metricTime - metricItem["Computer"] = nodeName - metricItem["Name"] = metricNametoReturn - metricItem["Value"] = metricValue - metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN - metricItem["Namespace"] = Constants::INSIGHTSMETRICS_TAGS_GPU_NAMESPACE - - metricTags = {} - metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID] = clusterId - metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = clusterName - metricTags[Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] = podUid + "/" + containerName - #metricTags[Constants::INSIGHTSMETRICS_TAGS_K8SNAMESPACE] = podNameSpace - - metricItem["Tags"] = metricTags - - metricItems.push(metricItem) + #No container level limit for the given metric, so default to node level limit for non-gpu metrics + if (metricNameToCollect.downcase != "nvidia.com/gpu") && (metricNameToCollect.downcase != "amd.com/gpu") + nodeMetricsHashKey = clusterId + "/" + nodeName + "_" + "allocatable" + "_" + metricNameToCollect + metricValue = @@NodeMetrics[nodeMetricsHashKey] end end + if (!metricValue.nil?) + metricItem = {} + metricItem["CollectionTime"] = metricTime + metricItem["Computer"] = nodeName + metricItem["Name"] = metricNametoReturn + metricItem["Value"] = metricValue + metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN + metricItem["Namespace"] = Constants::INSIGHTSMETRICS_TAGS_GPU_NAMESPACE + + metricTags = {} + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID] = clusterId + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = clusterName + metricTags[Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] = podUid + "/" + containerName + #metricTags[Constants::INSIGHTSMETRICS_TAGS_K8SNAMESPACE] = podNameSpace + + metricItem["Tags"] = metricTags + + metricItems.push(metricItem) + end end end rescue => error @@ -578,32 +578,9 @@ def parseNodeLimits(metricJSON, metricCategory, metricNameToCollect, metricNamet #if we are coming up with the time it should be same for all nodes #metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z metricInfo["items"].each do |node| - if (!node["status"][metricCategory].nil?) 
- - # metricCategory can be "capacity" or "allocatable" and metricNameToCollect can be "cpu" or "memory" - metricValue = getMetricNumericValue(metricNameToCollect, node["status"][metricCategory][metricNameToCollect]) - - metricItem = {} - metricItem["DataItems"] = [] - metricProps = {} - metricProps["Timestamp"] = metricTime - metricProps["Host"] = node["metadata"]["name"] - # Adding this so that it is not set by base omsagent since it was not set earlier and being set by base omsagent - metricProps["Computer"] = node["metadata"]["name"] - metricProps["ObjectName"] = "K8SNode" - metricProps["InstanceName"] = clusterId + "/" + node["metadata"]["name"] - metricProps["Collections"] = [] - metricCollections = {} - metricCollections["CounterName"] = metricNametoReturn - metricCollections["Value"] = metricValue - - metricProps["Collections"].push(metricCollections) - metricItem["DataItems"].push(metricProps) + metricItem = parseNodeLimitsFromNodeItem(node, metricCategory, metricNameToCollect, metricNametoReturn, metricTime) + if !metricItem.nil? && !metricItem.empty? metricItems.push(metricItem) - #push node level metrics to a inmem hash so that we can use it looking up at container level. - #Currently if container level cpu & memory limits are not defined we default to node level limits - @@NodeMetrics[clusterId + "/" + node["metadata"]["name"] + "_" + metricCategory + "_" + metricNameToCollect] = metricValue - #@Log.info ("Node metric hash: #{@@NodeMetrics}") end end rescue => error @@ -612,49 +589,82 @@ def parseNodeLimits(metricJSON, metricCategory, metricNameToCollect, metricNamet return metricItems end #parseNodeLimits - def parseNodeLimitsAsInsightsMetrics(metricJSON, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601) - metricItems = [] + def parseNodeLimitsFromNodeItem(node, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601) + metricItem = {} begin - metricInfo = metricJSON clusterId = getClusterId - clusterName = getClusterName #Since we are getting all node data at the same time and kubernetes doesnt specify a timestamp for the capacity and allocation metrics, #if we are coming up with the time it should be same for all nodes #metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z - metricInfo["items"].each do |node| - if (!node["status"][metricCategory].nil?) && (!node["status"][metricCategory][metricNameToCollect].nil?) - - # metricCategory can be "capacity" or "allocatable" and metricNameToCollect can be "cpu" or "memory" or "amd.com/gpu" or "nvidia.com/gpu" - metricValue = getMetricNumericValue(metricNameToCollect, node["status"][metricCategory][metricNameToCollect]) - - metricItem = {} - metricItem["CollectionTime"] = metricTime - metricItem["Computer"] = node["metadata"]["name"] - metricItem["Name"] = metricNametoReturn - metricItem["Value"] = metricValue - metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN - metricItem["Namespace"] = Constants::INSIGHTSMETRICS_TAGS_GPU_NAMESPACE - - metricTags = {} - metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID] = clusterId - metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = clusterName - metricTags[Constants::INSIGHTSMETRICS_TAGS_GPU_VENDOR] = metricNameToCollect - - metricItem["Tags"] = metricTags + if (!node["status"][metricCategory].nil?) && (!node["status"][metricCategory][metricNameToCollect].nil?) 
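+      # Illustrative node status fragment this condition matches (metricCategory is
+      # "capacity" or "allocatable"; values are examples):
+      #   node["status"]["allocatable"] => { "cpu" => "3860m", "memory" => "12Gi", "pods" => "110" }
+      # getMetricNumericValue is then expected to normalize quantity strings such as
+      # "3860m" or "12Gi" into plain numbers before they are pushed into @@NodeMetrics.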
+ # metricCategory can be "capacity" or "allocatable" and metricNameToCollect can be "cpu" or "memory" + metricValue = getMetricNumericValue(metricNameToCollect, node["status"][metricCategory][metricNameToCollect]) + + metricItem["DataItems"] = [] + metricProps = {} + metricProps["Timestamp"] = metricTime + metricProps["Host"] = node["metadata"]["name"] + # Adding this so that it is not set by base omsagent since it was not set earlier and being set by base omsagent + metricProps["Computer"] = node["metadata"]["name"] + metricProps["ObjectName"] = "K8SNode" + metricProps["InstanceName"] = clusterId + "/" + node["metadata"]["name"] + metricProps["Collections"] = [] + metricCollections = {} + metricCollections["CounterName"] = metricNametoReturn + metricCollections["Value"] = metricValue + + metricProps["Collections"].push(metricCollections) + metricItem["DataItems"].push(metricProps) + + #push node level metrics to a inmem hash so that we can use it looking up at container level. + #Currently if container level cpu & memory limits are not defined we default to node level limits + @@NodeMetrics[clusterId + "/" + node["metadata"]["name"] + "_" + metricCategory + "_" + metricNameToCollect] = metricValue + #@Log.info ("Node metric hash: #{@@NodeMetrics}") + end + rescue => error + @Log.warn("parseNodeLimitsFromNodeItem failed: #{error} for metric #{metricCategory} #{metricNameToCollect}") + end + return metricItem + end #parseNodeLimitsFromNodeItem - metricItems.push(metricItem) - #push node level metrics (except gpu ones) to a inmem hash so that we can use it looking up at container level. - #Currently if container level cpu & memory limits are not defined we default to node level limits - if (metricNameToCollect.downcase != "nvidia.com/gpu") && (metricNameToCollect.downcase != "amd.com/gpu") - @@NodeMetrics[clusterId + "/" + node["metadata"]["name"] + "_" + metricCategory + "_" + metricNameToCollect] = metricValue - #@Log.info ("Node metric hash: #{@@NodeMetrics}") - end + def parseNodeLimitsAsInsightsMetrics(node, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601) + metricItem = {} + begin + #Since we are getting all node data at the same time and kubernetes doesnt specify a timestamp for the capacity and allocation metrics, + #if we are coming up with the time it should be same for all nodes + #metricTime = Time.now.utc.iso8601 #2018-01-30T19:36:14Z + if (!node["status"][metricCategory].nil?) && (!node["status"][metricCategory][metricNameToCollect].nil?) 
+ clusterId = getClusterId + clusterName = getClusterName + + # metricCategory can be "capacity" or "allocatable" and metricNameToCollect can be "cpu" or "memory" or "amd.com/gpu" or "nvidia.com/gpu" + metricValue = getMetricNumericValue(metricNameToCollect, node["status"][metricCategory][metricNameToCollect]) + + metricItem["CollectionTime"] = metricTime + metricItem["Computer"] = node["metadata"]["name"] + metricItem["Name"] = metricNametoReturn + metricItem["Value"] = metricValue + metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN + metricItem["Namespace"] = Constants::INSIGHTSMETRICS_TAGS_GPU_NAMESPACE + + metricTags = {} + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID] = clusterId + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = clusterName + metricTags[Constants::INSIGHTSMETRICS_TAGS_GPU_VENDOR] = metricNameToCollect + + metricItem["Tags"] = metricTags + + #push node level metrics (except gpu ones) to a inmem hash so that we can use it looking up at container level. + #Currently if container level cpu & memory limits are not defined we default to node level limits + if (metricNameToCollect.downcase != "nvidia.com/gpu") && (metricNameToCollect.downcase != "amd.com/gpu") + @@NodeMetrics[clusterId + "/" + node["metadata"]["name"] + "_" + metricCategory + "_" + metricNameToCollect] = metricValue + #@Log.info ("Node metric hash: #{@@NodeMetrics}") end end rescue => error @Log.warn("parseNodeLimitsAsInsightsMetrics failed: #{error} for metric #{metricCategory} #{metricNameToCollect}") end - return metricItems + return metricItem end def getMetricNumericValue(metricName, metricVal) @@ -777,5 +787,32 @@ def getKubeAPIServerUrl end return apiServerUrl end + + def getKubeServicesInventoryRecords(serviceList, batchTime = Time.utc.iso8601) + kubeServiceRecords = [] + begin + if (!serviceList.nil? && !serviceList.empty?) 
+          servicesCount = serviceList["items"].length
+          @Log.info("KubernetesApiClient::getKubeServicesInventoryRecords : number of services in serviceList #{servicesCount} @ #{Time.now.utc.iso8601}")
+          serviceList["items"].each do |item|
+            kubeServiceRecord = {}
+            kubeServiceRecord["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated
+            kubeServiceRecord["ServiceName"] = item["metadata"]["name"]
+            kubeServiceRecord["Namespace"] = item["metadata"]["namespace"]
+            kubeServiceRecord["SelectorLabels"] = [item["spec"]["selector"]]
+            # ClusterId and ClusterName are added just before emit to reduce memory footprint
+            # kubeServiceRecord["ClusterId"] = KubernetesApiClient.getClusterId
+            # kubeServiceRecord["ClusterName"] = KubernetesApiClient.getClusterName
+            kubeServiceRecord["ClusterIP"] = item["spec"]["clusterIP"]
+            kubeServiceRecord["ServiceType"] = item["spec"]["type"]
+            kubeServiceRecords.push(kubeServiceRecord.dup)
+          end
+        end
+      rescue => errorStr
+        @Log.warn "KubernetesApiClient::getKubeServicesInventoryRecords:Failed with an error : #{errorStr}"
+        ApplicationInsightsUtility.sendExceptionTelemetry(errorStr)
+      end
+      return kubeServiceRecords
+    end
  end
end
diff --git a/source/plugins/ruby/in_kube_events.rb b/source/plugins/ruby/in_kube_events.rb
index 6f59a3fc1..4f6017cc5 100644
--- a/source/plugins/ruby/in_kube_events.rb
+++ b/source/plugins/ruby/in_kube_events.rb
@@ -17,8 +17,9 @@ def initialize
     require_relative "omslog"
     require_relative "ApplicationInsightsUtility"

-    # 30000 events account to approximately 5MB
-    @EVENTS_CHUNK_SIZE = 30000
+    # refer to tomlparser-agent-config for defaults
+    # this is configurable via configmap
+    @EVENTS_CHUNK_SIZE = 0

     # Initializing events count for telemetry
     @eventsCount = 0
@@ -36,6 +37,15 @@ def configure(conf)

   def start
     if @run_interval
+      if !ENV["EVENTS_CHUNK_SIZE"].nil? && !ENV["EVENTS_CHUNK_SIZE"].empty? && ENV["EVENTS_CHUNK_SIZE"].to_i > 0
+        @EVENTS_CHUNK_SIZE = ENV["EVENTS_CHUNK_SIZE"].to_i
+      else
+        # this shouldn't happen; just setting the default here as a safeguard
+        $log.warn("in_kube_events::start: setting to default value since got EVENTS_CHUNK_SIZE nil or empty")
+        @EVENTS_CHUNK_SIZE = 4000
+      end
+      $log.info("in_kube_events::start : EVENTS_CHUNK_SIZE @ #{@EVENTS_CHUNK_SIZE}")
+
       @finished = false
       @condition = ConditionVariable.new
       @mutex = Mutex.new
@@ -82,6 +92,8 @@ def enumerate
       end
       $log.info("in_kube_events::enumerate : Done getting events from Kube API @ #{Time.now.utc.iso8601}")
       if (!eventList.nil? && !eventList.empty? && eventList.key?("items") && !eventList["items"].nil? && !eventList["items"].empty?)
+        eventsCount = eventList["items"].length
+        $log.info "in_kube_events::enumerate:Received number of events in eventList is #{eventsCount} @ #{Time.now.utc.iso8601}"
         newEventQueryState = parse_and_emit_records(eventList, eventQueryState, newEventQueryState, batchTime)
       else
         $log.warn "in_kube_events::enumerate:Received empty eventList"
@@ -91,6 +103,8 @@ def enumerate
       while (!continuationToken.nil? && !continuationToken.empty?)
         continuationToken, eventList = KubernetesApiClient.getResourcesAndContinuationToken("events?fieldSelector=type!=Normal&limit=#{@EVENTS_CHUNK_SIZE}&continue=#{continuationToken}")
         if (!eventList.nil? && !eventList.empty? && eventList.key?("items") && !eventList["items"].nil? && !eventList["items"].empty?)
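The EVENTS_CHUNK_SIZE block above is the first of several identical environment-variable overrides in this series; nodes, pods, and the emit batch sizes get the same treatment below. A minimal standalone sketch of the pattern, using a hypothetical helper name (the PR itself inlines this logic in each plugin's start method):

    # Hypothetical helper illustrating the override pattern; not part of the PR.
    def chunk_size_from_env(name, fallback)
      raw = ENV[name]
      # nil, empty, non-numeric, and non-positive values all fall back to the default
      return raw.to_i if !raw.nil? && !raw.empty? && raw.to_i > 0
      fallback
    end

    # e.g. EVENTS_CHUNK_SIZE=2000 ruby sketch.rb prints 2000; unset prints 4000
    puts chunk_size_from_env("EVENTS_CHUNK_SIZE", 4000)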
+          eventsCount = eventList["items"].length
+          $log.info "in_kube_events::enumerate:Received number of events in eventList is #{eventsCount} @ #{Time.now.utc.iso8601}"
           newEventQueryState = parse_and_emit_records(eventList, eventQueryState, newEventQueryState, batchTime)
         else
           $log.warn "in_kube_events::enumerate:Received empty eventList"
diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb
index 4d58382f5..e7c5060a5 100644
--- a/source/plugins/ruby/in_kube_nodes.rb
+++ b/source/plugins/ruby/in_kube_nodes.rb
@@ -32,7 +32,12 @@ def initialize
      require_relative "ApplicationInsightsUtility"
      require_relative "oms_common"
      require_relative "omslog"
-      @NODES_CHUNK_SIZE = "400"
+      # refer to tomlparser-agent-config for the defaults
+      @NODES_CHUNK_SIZE = 0
+      @NODES_EMIT_STREAM_BATCH_SIZE = 0
+
+      @nodeInventoryE2EProcessingLatencyMs = 0
+      @nodesAPIE2ELatencyMs = 0
      require_relative "constants"
    end

@@ -45,11 +50,30 @@ def configure(conf)

    def start
      if @run_interval
+        if !ENV["NODES_CHUNK_SIZE"].nil? && !ENV["NODES_CHUNK_SIZE"].empty? && ENV["NODES_CHUNK_SIZE"].to_i > 0
+          @NODES_CHUNK_SIZE = ENV["NODES_CHUNK_SIZE"].to_i
+        else
+          # this shouldn't happen; just setting the default here as a safeguard
+          $log.warn("in_kube_nodes::start: setting to default value since got NODES_CHUNK_SIZE nil or empty")
+          @NODES_CHUNK_SIZE = 250
+        end
+        $log.info("in_kube_nodes::start : NODES_CHUNK_SIZE @ #{@NODES_CHUNK_SIZE}")
+
+        if !ENV["NODES_EMIT_STREAM_BATCH_SIZE"].nil? && !ENV["NODES_EMIT_STREAM_BATCH_SIZE"].empty? && ENV["NODES_EMIT_STREAM_BATCH_SIZE"].to_i > 0
+          @NODES_EMIT_STREAM_BATCH_SIZE = ENV["NODES_EMIT_STREAM_BATCH_SIZE"].to_i
+        else
+          # this shouldn't happen; just setting the default here as a safeguard
+          $log.warn("in_kube_nodes::start: setting to default value since got NODES_EMIT_STREAM_BATCH_SIZE nil or empty")
+          @NODES_EMIT_STREAM_BATCH_SIZE = 100
+        end
+        $log.info("in_kube_nodes::start : NODES_EMIT_STREAM_BATCH_SIZE @ #{@NODES_EMIT_STREAM_BATCH_SIZE}")
+
        @finished = false
        @condition = ConditionVariable.new
        @mutex = Mutex.new
        @thread = Thread.new(&method(:run_periodic))
        @@nodeTelemetryTimeTracker = DateTime.now.to_time.to_i
+        @@nodeInventoryLatencyTelemetryTimeTracker = DateTime.now.to_time.to_i
      end
    end

@@ -69,14 +93,20 @@ def enumerate
        currentTime = Time.now
        batchTime = currentTime.utc.iso8601
+        @nodesAPIE2ELatencyMs = 0
+        @nodeInventoryE2EProcessingLatencyMs = 0
+        nodeInventoryStartTime = (Time.now.to_f * 1000).to_i
+        nodesAPIChunkStartTime = (Time.now.to_f * 1000).to_i
        # Initializing continuation token to nil
        continuationToken = nil
        $log.info("in_kube_nodes::enumerate : Getting nodes from Kube API @ #{Time.now.utc.iso8601}")
        resourceUri = KubernetesApiClient.getNodesResourceUri("nodes?limit=#{@NODES_CHUNK_SIZE}")
        continuationToken, nodeInventory = KubernetesApiClient.getResourcesAndContinuationToken(resourceUri)
-        $log.info("in_kube_nodes::enumerate : Done getting nodes from Kube API @ #{Time.now.utc.iso8601}")
+        nodesAPIChunkEndTime = (Time.now.to_f * 1000).to_i
+        @nodesAPIE2ELatencyMs = (nodesAPIChunkEndTime - nodesAPIChunkStartTime)
        if (!nodeInventory.nil? && !nodeInventory.empty? && nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?)
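The latency counters introduced above all rely on the same epoch-millisecond conversion, (Time.now.to_f * 1000).to_i, sampled before and after each API call. A small sketch of that accumulation, with sleep standing in for a chunked Kube API fetch:

    # Accumulating wall-clock latency across chunked calls, as @nodesAPIE2ELatencyMs does.
    def now_ms
      (Time.now.to_f * 1000).to_i
    end

    nodes_api_latency_ms = 0
    3.times do
      chunk_start = now_ms
      sleep(0.01) # stand-in for one getResourcesAndContinuationToken call
      nodes_api_latency_ms += now_ms - chunk_start
    end
    puts "NodesAPIE2ELatencyMs: #{nodes_api_latency_ms}"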
+ $log.info("in_kube_nodes::enumerate : number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") parse_and_emit_records(nodeInventory, batchTime) else $log.warn "in_kube_nodes::enumerate:Received empty nodeInventory" @@ -84,14 +114,26 @@ def enumerate #If we receive a continuation token, make calls, process and flush data until we have processed all data while (!continuationToken.nil? && !continuationToken.empty?) + nodesAPIChunkStartTime = (Time.now.to_f * 1000).to_i continuationToken, nodeInventory = KubernetesApiClient.getResourcesAndContinuationToken(resourceUri + "&continue=#{continuationToken}") + nodesAPIChunkEndTime = (Time.now.to_f * 1000).to_i + @nodesAPIE2ELatencyMs = @nodesAPIE2ELatencyMs + (nodesAPIChunkEndTime - nodesAPIChunkStartTime) if (!nodeInventory.nil? && !nodeInventory.empty? && nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) + $log.info("in_kube_nodes::enumerate : number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") parse_and_emit_records(nodeInventory, batchTime) else $log.warn "in_kube_nodes::enumerate:Received empty nodeInventory" end end + @nodeInventoryE2EProcessingLatencyMs = ((Time.now.to_f * 1000).to_i - nodeInventoryStartTime) + timeDifference = (DateTime.now.to_time.to_i - @@nodeInventoryLatencyTelemetryTimeTracker).abs + timeDifferenceInMinutes = timeDifference / 60 + if (timeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) + ApplicationInsightsUtility.sendMetricTelemetry("NodeInventoryE2EProcessingLatencyMs", @nodeInventoryE2EProcessingLatencyMs, {}) + ApplicationInsightsUtility.sendMetricTelemetry("NodesAPIE2ELatencyMs", @nodesAPIE2ELatencyMs, {}) + @@nodeInventoryLatencyTelemetryTimeTracker = DateTime.now.to_time.to_i + end # Setting this to nil so that we dont hold memory until GC kicks in nodeInventory = nil rescue => errorStr @@ -109,77 +151,32 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) eventStream = MultiEventStream.new containerNodeInventoryEventStream = MultiEventStream.new insightsMetricsEventStream = MultiEventStream.new + kubePerfEventStream = MultiEventStream.new @@istestvar = ENV["ISTEST"] #get node inventory - nodeInventory["items"].each do |items| - record = {} - # Sending records for ContainerNodeInventory - containerNodeInventoryRecord = {} - containerNodeInventoryRecord["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated - containerNodeInventoryRecord["Computer"] = items["metadata"]["name"] - - record["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated - record["Computer"] = items["metadata"]["name"] - record["ClusterName"] = KubernetesApiClient.getClusterName - record["ClusterId"] = KubernetesApiClient.getClusterId - record["CreationTimeStamp"] = items["metadata"]["creationTimestamp"] - record["Labels"] = [items["metadata"]["labels"]] - record["Status"] = "" - - if !items["spec"]["providerID"].nil? && !items["spec"]["providerID"].empty? 
- if File.file?(@@AzStackCloudFileName) # existence of this file indicates agent running on azstack - record["KubernetesProviderID"] = "azurestack" - else - #Multicluster kusto query is filtering after splitting by ":" to the left, so do the same here - #https://msazure.visualstudio.com/One/_git/AzureUX-Monitoring?path=%2Fsrc%2FMonitoringExtension%2FClient%2FInfraInsights%2FData%2FQueryTemplates%2FMultiClusterKustoQueryTemplate.ts&_a=contents&version=GBdev - provider = items["spec"]["providerID"].split(":")[0] - if !provider.nil? && !provider.empty? - record["KubernetesProviderID"] = provider - else - record["KubernetesProviderID"] = items["spec"]["providerID"] - end - end - else - record["KubernetesProviderID"] = "onprem" - end - - # Refer to https://kubernetes.io/docs/concepts/architecture/nodes/#condition for possible node conditions. - # We check the status of each condition e.g. {"type": "OutOfDisk","status": "False"} . Based on this we - # populate the KubeNodeInventory Status field. A possible value for this field could be "Ready OutofDisk" - # implying that the node is ready for hosting pods, however its out of disk. - - if items["status"].key?("conditions") && !items["status"]["conditions"].empty? - allNodeConditions = "" - items["status"]["conditions"].each do |condition| - if condition["status"] == "True" - if !allNodeConditions.empty? - allNodeConditions = allNodeConditions + "," + condition["type"] - else - allNodeConditions = condition["type"] - end - end - #collect last transition to/from ready (no matter ready is true/false) - if condition["type"] == "Ready" && !condition["lastTransitionTime"].nil? - record["LastTransitionTimeReady"] = condition["lastTransitionTime"] - end - end - if !allNodeConditions.empty? - record["Status"] = allNodeConditions + nodeInventory["items"].each do |item| + # node inventory + nodeInventoryRecord = getNodeInventoryRecord(item, batchTime) + wrapper = { + "DataType" => "KUBE_NODE_INVENTORY_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [nodeInventoryRecord.each { |k, v| nodeInventoryRecord[k] = v }], + } + eventStream.add(emitTime, wrapper) if wrapper + if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && eventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE + $log.info("in_kube_node::parse_and_emit_records: number of node inventory records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + router.emit_stream(@tag, eventStream) if eventStream + $log.info("in_kube_node::parse_and_emit_records: number of mdm node inventory records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + router.emit_stream(@@MDMKubeNodeInventoryTag, eventStream) if eventStream + + if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) + $log.info("kubeNodeInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") end + eventStream = MultiEventStream.new end - nodeInfo = items["status"]["nodeInfo"] - record["KubeletVersion"] = nodeInfo["kubeletVersion"] - record["KubeProxyVersion"] = nodeInfo["kubeProxyVersion"] - containerNodeInventoryRecord["OperatingSystem"] = nodeInfo["osImage"] - containerRuntimeVersion = nodeInfo["containerRuntimeVersion"] - if containerRuntimeVersion.downcase.start_with?("docker://") - containerNodeInventoryRecord["DockerVersion"] = containerRuntimeVersion.split("//")[1] - else - # using containerRuntimeVersion as DockerVersion as is for non docker runtimes - containerNodeInventoryRecord["DockerVersion"] = containerRuntimeVersion - end - # ContainerNodeInventory data for docker version and operating system. + # container node inventory + containerNodeInventoryRecord = getContainerNodeInventoryRecord(item, batchTime) containerNodeInventoryWrapper = { "DataType" => "CONTAINER_NODE_INVENTORY_BLOB", "IPName" => "ContainerInsights", @@ -187,33 +184,81 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) } containerNodeInventoryEventStream.add(emitTime, containerNodeInventoryWrapper) if containerNodeInventoryWrapper - wrapper = { - "DataType" => "KUBE_NODE_INVENTORY_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [record.each { |k, v| record[k] = v }], - } - eventStream.add(emitTime, wrapper) if wrapper + if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && containerNodeInventoryEventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE + $log.info("in_kube_node::parse_and_emit_records: number of container node inventory records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + router.emit_stream(@@ContainerNodeInventoryTag, containerNodeInventoryEventStream) if containerNodeInventoryEventStream + containerNodeInventoryEventStream = MultiEventStream.new + end + + # node metrics records + nodeMetricRecords = [] + nodeMetricRecord = KubernetesApiClient.parseNodeLimitsFromNodeItem(item, "allocatable", "cpu", "cpuAllocatableNanoCores", batchTime) + if !nodeMetricRecord.nil? && !nodeMetricRecord.empty? + nodeMetricRecords.push(nodeMetricRecord) + end + nodeMetricRecord = KubernetesApiClient.parseNodeLimitsFromNodeItem(item, "allocatable", "memory", "memoryAllocatableBytes", batchTime) + if !nodeMetricRecord.nil? && !nodeMetricRecord.empty? + nodeMetricRecords.push(nodeMetricRecord) + end + nodeMetricRecord = KubernetesApiClient.parseNodeLimitsFromNodeItem(item, "capacity", "cpu", "cpuCapacityNanoCores", batchTime) + if !nodeMetricRecord.nil? && !nodeMetricRecord.empty? + nodeMetricRecords.push(nodeMetricRecord) + end + nodeMetricRecord = KubernetesApiClient.parseNodeLimitsFromNodeItem(item, "capacity", "memory", "memoryCapacityBytes", batchTime) + if !nodeMetricRecord.nil? && !nodeMetricRecord.empty? 
+ nodeMetricRecords.push(nodeMetricRecord) + end + nodeMetricRecords.each do |metricRecord| + metricRecord["DataType"] = "LINUX_PERF_BLOB" + metricRecord["IPName"] = "LogManagement" + kubePerfEventStream.add(emitTime, metricRecord) if metricRecord + end + if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && kubePerfEventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE + $log.info("in_kube_nodes::parse_and_emit_records: number of node perf metric records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + router.emit_stream(@@kubeperfTag, kubePerfEventStream) if kubePerfEventStream + kubePerfEventStream = MultiEventStream.new + end + + # node GPU metrics record + nodeGPUInsightsMetricsRecords = [] + insightsMetricsRecord = KubernetesApiClient.parseNodeLimitsAsInsightsMetrics(item, "allocatable", "nvidia.com/gpu", "nodeGpuAllocatable", batchTime) + if !insightsMetricsRecord.nil? && !insightsMetricsRecord.empty? + nodeGPUInsightsMetricsRecords.push(insightsMetricsRecord) + end + insightsMetricsRecord = KubernetesApiClient.parseNodeLimitsAsInsightsMetrics(item, "capacity", "nvidia.com/gpu", "nodeGpuCapacity", batchTime) + if !insightsMetricsRecord.nil? && !insightsMetricsRecord.empty? + nodeGPUInsightsMetricsRecords.push(insightsMetricsRecord) + end + insightsMetricsRecord = KubernetesApiClient.parseNodeLimitsAsInsightsMetrics(item, "allocatable", "amd.com/gpu", "nodeGpuAllocatable", batchTime) + if !insightsMetricsRecord.nil? && !insightsMetricsRecord.empty? + nodeGPUInsightsMetricsRecords.push(insightsMetricsRecord) + end + insightsMetricsRecord = KubernetesApiClient.parseNodeLimitsAsInsightsMetrics(item, "capacity", "amd.com/gpu", "nodeGpuCapacity", batchTime) + if !insightsMetricsRecord.nil? && !insightsMetricsRecord.empty? + nodeGPUInsightsMetricsRecords.push(insightsMetricsRecord) + end + nodeGPUInsightsMetricsRecords.each do |insightsMetricsRecord| + wrapper = { + "DataType" => "INSIGHTS_METRICS_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [insightsMetricsRecord.each { |k, v| insightsMetricsRecord[k] = v }], + } + insightsMetricsEventStream.add(emitTime, wrapper) if wrapper + end + if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && insightsMetricsEventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE + $log.info("in_kube_nodes::parse_and_emit_records: number of GPU node perf metric records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream + insightsMetricsEventStream = MultiEventStream.new + end # Adding telemetry to send node telemetry every 10 minutes timeDifference = (DateTime.now.to_time.to_i - @@nodeTelemetryTimeTracker).abs timeDifferenceInMinutes = timeDifference / 60 if (timeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) - properties = {} - properties["Computer"] = record["Computer"] - properties["KubeletVersion"] = record["KubeletVersion"] - properties["OperatingSystem"] = nodeInfo["operatingSystem"] - # DockerVersion field holds docker version if runtime is docker/moby else :// - if containerRuntimeVersion.downcase.start_with?("docker://") - properties["DockerVersion"] = containerRuntimeVersion.split("//")[1] - else - properties["DockerVersion"] = containerRuntimeVersion - end - properties["KubernetesProviderID"] = record["KubernetesProviderID"] - properties["KernelVersion"] = nodeInfo["kernelVersion"] - properties["OSImage"] = nodeInfo["osImage"] + properties = getNodeTelemetryProps(item) + 
properties["KubernetesProviderID"] = nodeInventoryRecord["KubernetesProviderID"] + capacityInfo = item["status"]["capacity"] - capacityInfo = items["status"]["capacity"] ApplicationInsightsUtility.sendMetricTelemetry("NodeMemory", capacityInfo["memory"], properties) - begin if (!capacityInfo["nvidia.com/gpu"].nil?) && (!capacityInfo["nvidia.com/gpu"].empty?) properties["nvigpus"] = capacityInfo["nvidia.com/gpu"] @@ -247,72 +292,32 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) telemetrySent = true end end - router.emit_stream(@tag, eventStream) if eventStream - router.emit_stream(@@MDMKubeNodeInventoryTag, eventStream) if eventStream - router.emit_stream(@@ContainerNodeInventoryTag, containerNodeInventoryEventStream) if containerNodeInventoryEventStream if telemetrySent == true @@nodeTelemetryTimeTracker = DateTime.now.to_time.to_i end - - if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && eventStream.count > 0) - $log.info("kubeNodeInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") + if eventStream.count > 0 + $log.info("in_kube_node::parse_and_emit_records: number of node inventory records emitted #{eventStream.count} @ #{Time.now.utc.iso8601}") + router.emit_stream(@tag, eventStream) if eventStream + $log.info("in_kube_node::parse_and_emit_records: number of mdm node inventory records emitted #{eventStream.count} @ #{Time.now.utc.iso8601}") + router.emit_stream(@@MDMKubeNodeInventoryTag, eventStream) if eventStream + eventStream = nil end - #:optimize:kubeperf merge - begin - #if(!nodeInventory.empty?) - nodeMetricDataItems = [] - #allocatable metrics @ node level - nodeMetricDataItems.concat(KubernetesApiClient.parseNodeLimits(nodeInventory, "allocatable", "cpu", "cpuAllocatableNanoCores", batchTime)) - nodeMetricDataItems.concat(KubernetesApiClient.parseNodeLimits(nodeInventory, "allocatable", "memory", "memoryAllocatableBytes", batchTime)) - #capacity metrics @ node level - nodeMetricDataItems.concat(KubernetesApiClient.parseNodeLimits(nodeInventory, "capacity", "cpu", "cpuCapacityNanoCores", batchTime)) - nodeMetricDataItems.concat(KubernetesApiClient.parseNodeLimits(nodeInventory, "capacity", "memory", "memoryCapacityBytes", batchTime)) - - kubePerfEventStream = MultiEventStream.new - - nodeMetricDataItems.each do |record| - record["DataType"] = "LINUX_PERF_BLOB" - record["IPName"] = "LogManagement" - kubePerfEventStream.add(emitTime, record) if record - end - #end - router.emit_stream(@@kubeperfTag, kubePerfEventStream) if kubePerfEventStream - - #start GPU InsightsMetrics items - begin - nodeGPUInsightsMetricsDataItems = [] - nodeGPUInsightsMetricsDataItems.concat(KubernetesApiClient.parseNodeLimitsAsInsightsMetrics(nodeInventory, "allocatable", "nvidia.com/gpu", "nodeGpuAllocatable", batchTime)) - nodeGPUInsightsMetricsDataItems.concat(KubernetesApiClient.parseNodeLimitsAsInsightsMetrics(nodeInventory, "capacity", "nvidia.com/gpu", "nodeGpuCapacity", batchTime)) - - nodeGPUInsightsMetricsDataItems.concat(KubernetesApiClient.parseNodeLimitsAsInsightsMetrics(nodeInventory, "allocatable", "amd.com/gpu", "nodeGpuAllocatable", batchTime)) - nodeGPUInsightsMetricsDataItems.concat(KubernetesApiClient.parseNodeLimitsAsInsightsMetrics(nodeInventory, "capacity", "amd.com/gpu", "nodeGpuCapacity", batchTime)) - - nodeGPUInsightsMetricsDataItems.each do |insightsMetricsRecord| - wrapper = { - "DataType" => "INSIGHTS_METRICS_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [insightsMetricsRecord.each { |k, v| 
insightsMetricsRecord[k] = v }], - } - insightsMetricsEventStream.add(emitTime, wrapper) if wrapper - end - - router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream - if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0) - $log.info("kubeNodeInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") - end - rescue => errorStr - $log.warn "Failed when processing GPU metrics in_kube_nodes : #{errorStr}" - $log.debug_backtrace(errorStr.backtrace) - ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) - end - #end GPU InsightsMetrics items - rescue => errorStr - $log.warn "Failed in enumerate for KubePerf from in_kube_nodes : #{errorStr}" - $log.debug_backtrace(errorStr.backtrace) - ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + if containerNodeInventoryEventStream.count > 0 + $log.info("in_kube_node::parse_and_emit_records: number of container node inventory records emitted #{containerNodeInventoryEventStream.count} @ #{Time.now.utc.iso8601}") + router.emit_stream(@@ContainerNodeInventoryTag, containerNodeInventoryEventStream) if containerNodeInventoryEventStream + containerNodeInventoryEventStream = nil end - #:optimize:end kubeperf merge + if kubePerfEventStream.count > 0 + $log.info("in_kube_nodes::parse_and_emit_records: number of node perf metric records emitted #{kubePerfEventStream.count} @ #{Time.now.utc.iso8601}") + router.emit_stream(@@kubeperfTag, kubePerfEventStream) if kubePerfEventStream + kubePerfEventStream = nil + end + if insightsMetricsEventStream.count > 0 + $log.info("in_kube_nodes::parse_and_emit_records: number of GPU node perf metric records emitted #{insightsMetricsEventStream.count} @ #{Time.now.utc.iso8601}") + router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream + insightsMetricsEventStream = nil + end rescue => errorStr $log.warn "Failed to retrieve node inventory: #{errorStr}" $log.debug_backtrace(errorStr.backtrace) @@ -352,5 +357,112 @@ def run_periodic end @mutex.unlock end + + # TODO - move this method to KubernetesClient or helper class + def getNodeInventoryRecord(item, batchTime = Time.utc.iso8601) + record = {} + begin + record["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated + record["Computer"] = item["metadata"]["name"] + record["ClusterName"] = KubernetesApiClient.getClusterName + record["ClusterId"] = KubernetesApiClient.getClusterId + record["CreationTimeStamp"] = item["metadata"]["creationTimestamp"] + record["Labels"] = [item["metadata"]["labels"]] + record["Status"] = "" + + if !item["spec"]["providerID"].nil? && !item["spec"]["providerID"].empty? + if File.file?(@@AzStackCloudFileName) # existence of this file indicates agent running on azstack + record["KubernetesProviderID"] = "azurestack" + else + #Multicluster kusto query is filtering after splitting by ":" to the left, so do the same here + #https://msazure.visualstudio.com/One/_git/AzureUX-Monitoring?path=%2Fsrc%2FMonitoringExtension%2FClient%2FInfraInsights%2FData%2FQueryTemplates%2FMultiClusterKustoQueryTemplate.ts&_a=contents&version=GBdev + provider = item["spec"]["providerID"].split(":")[0] + if !provider.nil? && !provider.empty? 
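getNodeInventoryRecord above keeps only the segment of spec.providerID to the left of the first ':', mirroring the split the multicluster Kusto query performs. A few illustrative inputs (the providerID values here are examples, not taken from the PR):

    # Example providerID values and the prefix the split keeps.
    ["azure:///subscriptions/sub0/resourceGroups/rg0/virtualMachines/vm0",
     "aws:///us-west-2a/i-0123456789abcdef0"].each do |provider_id|
      puts "#{provider_id} => #{provider_id.split(":")[0]}"
    end
    # nodes with no providerID are recorded as "onprem"; the presence of the
    # Azure Stack marker file forces "azurestack" regardless of the value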
+ record["KubernetesProviderID"] = provider + else + record["KubernetesProviderID"] = item["spec"]["providerID"] + end + end + else + record["KubernetesProviderID"] = "onprem" + end + + # Refer to https://kubernetes.io/docs/concepts/architecture/nodes/#condition for possible node conditions. + # We check the status of each condition e.g. {"type": "OutOfDisk","status": "False"} . Based on this we + # populate the KubeNodeInventory Status field. A possible value for this field could be "Ready OutofDisk" + # implying that the node is ready for hosting pods, however its out of disk. + if item["status"].key?("conditions") && !item["status"]["conditions"].empty? + allNodeConditions = "" + item["status"]["conditions"].each do |condition| + if condition["status"] == "True" + if !allNodeConditions.empty? + allNodeConditions = allNodeConditions + "," + condition["type"] + else + allNodeConditions = condition["type"] + end + end + #collect last transition to/from ready (no matter ready is true/false) + if condition["type"] == "Ready" && !condition["lastTransitionTime"].nil? + record["LastTransitionTimeReady"] = condition["lastTransitionTime"] + end + end + if !allNodeConditions.empty? + record["Status"] = allNodeConditions + end + end + nodeInfo = item["status"]["nodeInfo"] + record["KubeletVersion"] = nodeInfo["kubeletVersion"] + record["KubeProxyVersion"] = nodeInfo["kubeProxyVersion"] + rescue => errorStr + $log.warn "in_kube_nodes::getNodeInventoryRecord:Failed: #{errorStr}" + end + return record + end + + # TODO - move this method to KubernetesClient or helper class + def getContainerNodeInventoryRecord(item, batchTime = Time.utc.iso8601) + containerNodeInventoryRecord = {} + begin + containerNodeInventoryRecord["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated + containerNodeInventoryRecord["Computer"] = item["metadata"]["name"] + nodeInfo = item["status"]["nodeInfo"] + containerNodeInventoryRecord["OperatingSystem"] = nodeInfo["osImage"] + containerRuntimeVersion = nodeInfo["containerRuntimeVersion"] + if containerRuntimeVersion.downcase.start_with?("docker://") + containerNodeInventoryRecord["DockerVersion"] = containerRuntimeVersion.split("//")[1] + else + # using containerRuntimeVersion as DockerVersion as is for non docker runtimes + containerNodeInventoryRecord["DockerVersion"] = containerRuntimeVersion + end + rescue => errorStr + $log.warn "in_kube_nodes::getContainerNodeInventoryRecord:Failed: #{errorStr}" + end + return containerNodeInventoryRecord + end + + # TODO - move this method to KubernetesClient or helper class + def getNodeTelemetryProps(item) + properties = {} + begin + properties["Computer"] = item["metadata"]["name"] + nodeInfo = item["status"]["nodeInfo"] + properties["KubeletVersion"] = nodeInfo["kubeletVersion"] + properties["OperatingSystem"] = nodeInfo["osImage"] + properties["KernelVersion"] = nodeInfo["kernelVersion"] + properties["OSImage"] = nodeInfo["osImage"] + containerRuntimeVersion = nodeInfo["containerRuntimeVersion"] + if containerRuntimeVersion.downcase.start_with?("docker://") + properties["DockerVersion"] = containerRuntimeVersion.split("//")[1] + else + # using containerRuntimeVersion as DockerVersion as is for non docker runtimes + properties["DockerVersion"] = containerRuntimeVersion + end + properties["NODES_CHUNK_SIZE"] = @NODES_CHUNK_SIZE + properties["NODES_EMIT_STREAM_BATCH_SIZE"] = @NODES_EMIT_STREAM_BATCH_SIZE + rescue => errorStr + $log.warn 
"in_kube_nodes::getContainerNodeIngetNodeTelemetryPropsventoryRecord:Failed: #{errorStr}" + end + return properties + end end # Kube_Node_Input end # module diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index bba3e920f..0cff2eefe 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -2,7 +2,7 @@ # frozen_string_literal: true module Fluent - require_relative "podinventory_to_mdm" + require_relative "podinventory_to_mdm" class Kube_PodInventory_Input < Input Plugin.register_input("kubepodinventory", self) @@ -19,7 +19,7 @@ def initialize require "yajl" require "set" require "time" - + require_relative "kubernetes_container_inventory" require_relative "KubernetesApiClient" require_relative "ApplicationInsightsUtility" @@ -27,11 +27,18 @@ def initialize require_relative "omslog" require_relative "constants" - @PODS_CHUNK_SIZE = "1500" + # refer tomlparser-agent-config for updating defaults + # this configurable via configmap + @PODS_CHUNK_SIZE = 0 + @PODS_EMIT_STREAM_BATCH_SIZE = 0 + @podCount = 0 + @serviceCount = 0 @controllerSet = Set.new [] @winContainerCount = 0 @controllerData = {} + @podInventoryE2EProcessingLatencyMs = 0 + @podsAPIE2ELatencyMs = 0 end config_param :run_interval, :time, :default => 60 @@ -44,6 +51,24 @@ def configure(conf) def start if @run_interval + if !ENV["PODS_CHUNK_SIZE"].nil? && !ENV["PODS_CHUNK_SIZE"].empty? && ENV["PODS_CHUNK_SIZE"].to_i > 0 + @PODS_CHUNK_SIZE = ENV["PODS_CHUNK_SIZE"].to_i + else + # this shouldnt happen just setting default here as safe guard + $log.warn("in_kube_podinventory::start: setting to default value since got PODS_CHUNK_SIZE nil or empty") + @PODS_CHUNK_SIZE = 1000 + end + $log.info("in_kube_podinventory::start : PODS_CHUNK_SIZE @ #{@PODS_CHUNK_SIZE}") + + if !ENV["PODS_EMIT_STREAM_BATCH_SIZE"].nil? && !ENV["PODS_EMIT_STREAM_BATCH_SIZE"].empty? 
&& ENV["PODS_EMIT_STREAM_BATCH_SIZE"].to_i > 0 + @PODS_EMIT_STREAM_BATCH_SIZE = ENV["PODS_EMIT_STREAM_BATCH_SIZE"].to_i + else + # this shouldnt happen just setting default here as safe guard + $log.warn("in_kube_podinventory::start: setting to default value since got PODS_EMIT_STREAM_BATCH_SIZE nil or empty") + @PODS_EMIT_STREAM_BATCH_SIZE = 200 + end + $log.info("in_kube_podinventory::start : PODS_EMIT_STREAM_BATCH_SIZE @ #{@PODS_EMIT_STREAM_BATCH_SIZE}") + @finished = false @condition = ConditionVariable.new @mutex = Mutex.new @@ -67,12 +92,15 @@ def enumerate(podList = nil) podInventory = podList telemetryFlush = false @podCount = 0 + @serviceCount = 0 @controllerSet = Set.new [] @winContainerCount = 0 @controllerData = {} currentTime = Time.now batchTime = currentTime.utc.iso8601 - + serviceRecords = [] + @podInventoryE2EProcessingLatencyMs = 0 + podInventoryStartTime = (Time.now.to_f * 1000).to_i # Get services first so that we dont need to make a call for very chunk $log.info("in_kube_podinventory::enumerate : Getting services from Kube API @ #{Time.now.utc.iso8601}") serviceInfo = KubernetesApiClient.getKubeResourceInfo("services") @@ -84,32 +112,48 @@ def enumerate(podList = nil) serviceList = Yajl::Parser.parse(StringIO.new(serviceInfo.body)) $log.info("in_kube_podinventory::enumerate:End:Parsing services data using yajl @ #{Time.now.utc.iso8601}") serviceInfo = nil + # service inventory records much smaller and fixed size compared to serviceList + serviceRecords = KubernetesApiClient.getKubeServicesInventoryRecords(serviceList, batchTime) + # updating for telemetry + @serviceCount += serviceRecords.length + serviceList = nil end + # to track e2e processing latency + @podsAPIE2ELatencyMs = 0 + podsAPIChunkStartTime = (Time.now.to_f * 1000).to_i # Initializing continuation token to nil continuationToken = nil $log.info("in_kube_podinventory::enumerate : Getting pods from Kube API @ #{Time.now.utc.iso8601}") continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}") $log.info("in_kube_podinventory::enumerate : Done getting pods from Kube API @ #{Time.now.utc.iso8601}") + podsAPIChunkEndTime = (Time.now.to_f * 1000).to_i + @podsAPIE2ELatencyMs = (podsAPIChunkEndTime - podsAPIChunkStartTime) if (!podInventory.nil? && !podInventory.empty? && podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) - parse_and_emit_records(podInventory, serviceList, continuationToken, batchTime) + $log.info("in_kube_podinventory::enumerate : number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + parse_and_emit_records(podInventory, serviceRecords, continuationToken, batchTime) else $log.warn "in_kube_podinventory::enumerate:Received empty podInventory" end #If we receive a continuation token, make calls, process and flush data until we have processed all data while (!continuationToken.nil? && !continuationToken.empty?) + podsAPIChunkStartTime = (Time.now.to_f * 1000).to_i continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}&continue=#{continuationToken}") + podsAPIChunkEndTime = (Time.now.to_f * 1000).to_i + @podsAPIE2ELatencyMs = @podsAPIE2ELatencyMs + (podsAPIChunkEndTime - podsAPIChunkStartTime) if (!podInventory.nil? && !podInventory.empty? && podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) 
- parse_and_emit_records(podInventory, serviceList, continuationToken, batchTime) + $log.info("in_kube_podinventory::enumerate : number of pod items :#{podInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + parse_and_emit_records(podInventory, serviceRecords, continuationToken, batchTime) else $log.warn "in_kube_podinventory::enumerate:Received empty podInventory" end end + @podInventoryE2EProcessingLatencyMs = ((Time.now.to_f * 1000).to_i - podInventoryStartTime) # Setting these to nil so that we dont hold memory until GC kicks in podInventory = nil - serviceList = nil + serviceRecords = nil # Adding telemetry to send pod telemetry every 5 minutes timeDifference = (DateTime.now.to_time.to_i - @@podTelemetryTimeTracker).abs @@ -122,14 +166,19 @@ def enumerate(podList = nil) if telemetryFlush == true telemetryProperties = {} telemetryProperties["Computer"] = @@hostName + telemetryProperties["PODS_CHUNK_SIZE"] = @PODS_CHUNK_SIZE + telemetryProperties["PODS_EMIT_STREAM_BATCH_SIZE"] = @PODS_EMIT_STREAM_BATCH_SIZE ApplicationInsightsUtility.sendCustomEvent("KubePodInventoryHeartBeatEvent", telemetryProperties) ApplicationInsightsUtility.sendMetricTelemetry("PodCount", @podCount, {}) + ApplicationInsightsUtility.sendMetricTelemetry("ServiceCount", @serviceCount, {}) telemetryProperties["ControllerData"] = @controllerData.to_json ApplicationInsightsUtility.sendMetricTelemetry("ControllerCount", @controllerSet.length, telemetryProperties) if @winContainerCount > 0 telemetryProperties["ClusterWideWindowsContainersCount"] = @winContainerCount ApplicationInsightsUtility.sendCustomEvent("WindowsContainerInventoryEvent", telemetryProperties) end + ApplicationInsightsUtility.sendMetricTelemetry("PodInventoryE2EProcessingLatencyMs", @podInventoryE2EProcessingLatencyMs, telemetryProperties) + ApplicationInsightsUtility.sendMetricTelemetry("PodsAPIE2ELatencyMs", @podsAPIE2ELatencyMs, telemetryProperties) @@podTelemetryTimeTracker = DateTime.now.to_time.to_i end rescue => errorStr @@ -137,260 +186,138 @@ def enumerate(podList = nil) $log.debug_backtrace(errorStr.backtrace) ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) end - end + end - def parse_and_emit_records(podInventory, serviceList, continuationToken, batchTime = Time.utc.iso8601) + def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batchTime = Time.utc.iso8601) currentTime = Time.now emitTime = currentTime.to_f #batchTime = currentTime.utc.iso8601 eventStream = MultiEventStream.new + kubePerfEventStream = MultiEventStream.new + insightsMetricsEventStream = MultiEventStream.new @@istestvar = ENV["ISTEST"] begin #begin block start # Getting windows nodes from kubeapi winNodes = KubernetesApiClient.getWindowsNodesArray - - podInventory["items"].each do |items| #podInventory block start - containerInventoryRecords = [] - records = [] - record = {} - record["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated - record["Name"] = items["metadata"]["name"] - podNameSpace = items["metadata"]["namespace"] - - # For ARO v3 cluster, skip the pods scheduled on to master or infra nodes - if KubernetesApiClient.isAROV3Cluster && !items["spec"].nil? && !items["spec"]["nodeName"].nil? && - (items["spec"]["nodeName"].downcase.start_with?("infra-") || - items["spec"]["nodeName"].downcase.start_with?("master-")) - next - end - - podUid = KubernetesApiClient.getPodUid(podNameSpace, items["metadata"]) - if podUid.nil? 
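The per-pod parsing being deleted here reappears below as the getPodInventoryRecords helper, and the caller shrinks to wrapping each returned record in the ingestion envelope used throughout these plugins. A sketch of that envelope contract; emitTime handling and MDM processing are omitted:

    # DataType/IPName envelope placed around every record before it is added to a stream.
    def wrap_pod_inventory(record)
      {
        "DataType" => "KUBE_POD_INVENTORY_BLOB",
        "IPName" => "ContainerInsights",
        "DataItems" => [record],
      }
    end

    p wrap_pod_inventory({ "Name" => "nginx-abc12", "Namespace" => "default" })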
- next - end - record["PodUid"] = podUid - record["PodLabel"] = [items["metadata"]["labels"]] - record["Namespace"] = podNameSpace - record["PodCreationTimeStamp"] = items["metadata"]["creationTimestamp"] - #for unscheduled (non-started) pods startTime does NOT exist - if !items["status"]["startTime"].nil? - record["PodStartTime"] = items["status"]["startTime"] - else - record["PodStartTime"] = "" - end - #podStatus - # the below is for accounting 'NodeLost' scenario, where-in the pod(s) in the lost node is still being reported as running - podReadyCondition = true - if !items["status"]["reason"].nil? && items["status"]["reason"] == "NodeLost" && !items["status"]["conditions"].nil? - items["status"]["conditions"].each do |condition| - if condition["type"] == "Ready" && condition["status"] == "False" - podReadyCondition = false - break - end + podInventory["items"].each do |item| #podInventory block start + # pod inventory records + podInventoryRecords = getPodInventoryRecords(item, serviceRecords, batchTime) + podInventoryRecords.each do |record| + if !record.nil? + wrapper = { + "DataType" => "KUBE_POD_INVENTORY_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [record.each { |k, v| record[k] = v }], + } + eventStream.add(emitTime, wrapper) if wrapper + @inventoryToMdmConvertor.process_pod_inventory_record(wrapper) end end - - if podReadyCondition == false - record["PodStatus"] = "Unknown" - # ICM - https://portal.microsofticm.com/imp/v3/incidents/details/187091803/home - elsif !items["metadata"]["deletionTimestamp"].nil? && !items["metadata"]["deletionTimestamp"].empty? - record["PodStatus"] = Constants::POD_STATUS_TERMINATING - else - record["PodStatus"] = items["status"]["phase"] - end - #for unscheduled (non-started) pods podIP does NOT exist - if !items["status"]["podIP"].nil? - record["PodIp"] = items["status"]["podIP"] - else - record["PodIp"] = "" - end - #for unscheduled (non-started) pods nodeName does NOT exist - if !items["spec"]["nodeName"].nil? - record["Computer"] = items["spec"]["nodeName"] - else - record["Computer"] = "" - end - # Setting this flag to true so that we can send ContainerInventory records for containers # on windows nodes and parse environment variables for these containers if winNodes.length > 0 - if (!record["Computer"].empty? && (winNodes.include? record["Computer"])) + nodeName = "" + if !item["spec"]["nodeName"].nil? + nodeName = item["spec"]["nodeName"] + end + if (!nodeName.empty? && (winNodes.include? nodeName)) clusterCollectEnvironmentVar = ENV["AZMON_CLUSTER_COLLECT_ENV_VAR"] #Generate ContainerInventory records for windows nodes so that we can get image and image tag in property panel - containerInventoryRecordsInPodItem = KubernetesContainerInventory.getContainerInventoryRecords(items, batchTime, clusterCollectEnvironmentVar, true) - containerInventoryRecordsInPodItem.each do |containerRecord| - containerInventoryRecords.push(containerRecord) - end + containerInventoryRecords = KubernetesContainerInventory.getContainerInventoryRecords(item, batchTime, clusterCollectEnvironmentVar, true) + # Send container inventory records for containers on windows nodes + @winContainerCount += containerInventoryRecords.length + containerInventoryRecords.each do |cirecord| + if !cirecord.nil? 
+ ciwrapper = { + "DataType" => "CONTAINER_INVENTORY_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [cirecord.each { |k, v| cirecord[k] = v }], + } + eventStream.add(emitTime, ciwrapper) if ciwrapper + end + end end end - record["ClusterId"] = KubernetesApiClient.getClusterId - record["ClusterName"] = KubernetesApiClient.getClusterName - record["ServiceName"] = getServiceNameFromLabels(items["metadata"]["namespace"], items["metadata"]["labels"], serviceList) - - if !items["metadata"]["ownerReferences"].nil? - record["ControllerKind"] = items["metadata"]["ownerReferences"][0]["kind"] - record["ControllerName"] = items["metadata"]["ownerReferences"][0]["name"] - @controllerSet.add(record["ControllerKind"] + record["ControllerName"]) - #Adding controller kind to telemetry ro information about customer workload - if (@controllerData[record["ControllerKind"]].nil?) - @controllerData[record["ControllerKind"]] = 1 - else - controllerValue = @controllerData[record["ControllerKind"]] - @controllerData[record["ControllerKind"]] += 1 + if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && eventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE + $log.info("in_kube_podinventory::parse_and_emit_records: number of pod inventory records emitted #{@PODS_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) + $log.info("kubePodInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") end + router.emit_stream(@tag, eventStream) if eventStream + eventStream = MultiEventStream.new end - podRestartCount = 0 - record["PodRestartCount"] = 0 - #Invoke the helper method to compute ready/not ready mdm metric - @inventoryToMdmConvertor.process_record_for_pods_ready_metric(record["ControllerName"], record["Namespace"], items["status"]["conditions"]) + #container perf records + containerMetricDataItems = [] + containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "requests", "cpu", "cpuRequestNanoCores", batchTime)) + containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "requests", "memory", "memoryRequestBytes", batchTime)) + containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "limits", "cpu", "cpuLimitNanoCores", batchTime)) + containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "limits", "memory", "memoryLimitBytes", batchTime)) - podContainers = [] - if items["status"].key?("containerStatuses") && !items["status"]["containerStatuses"].empty? - podContainers = podContainers + items["status"]["containerStatuses"] - end - # Adding init containers to the record list as well. - if items["status"].key?("initContainerStatuses") && !items["status"]["initContainerStatuses"].empty? - podContainers = podContainers + items["status"]["initContainerStatuses"] + containerMetricDataItems.each do |record| + record["DataType"] = "LINUX_PERF_BLOB" + record["IPName"] = "LogManagement" + kubePerfEventStream.add(emitTime, record) if record end - # if items["status"].key?("containerStatuses") && !items["status"]["containerStatuses"].empty? #container status block start - if !podContainers.empty? 
#container status block start - podContainers.each do |container| - containerRestartCount = 0 - lastFinishedTime = nil - # Need this flag to determine if we need to process container data for mdm metrics like oomkilled and container restart - #container Id is of the form - #docker://dfd9da983f1fd27432fb2c1fe3049c0a1d25b1c697b2dc1a530c986e58b16527 - if !container["containerID"].nil? - record["ContainerID"] = container["containerID"].split("//")[1] - else - # for containers that have image issues (like invalid image/tag etc..) this will be empty. do not make it all 0 - record["ContainerID"] = "" - end - #keeping this as which is same as InstanceName in perf table - if podUid.nil? || container["name"].nil? - next - else - record["ContainerName"] = podUid + "/" + container["name"] - end - #Pod restart count is a sumtotal of restart counts of individual containers - #within the pod. The restart count of a container is maintained by kubernetes - #itself in the form of a container label. - containerRestartCount = container["restartCount"] - record["ContainerRestartCount"] = containerRestartCount - - containerStatus = container["state"] - record["ContainerStatusReason"] = "" - # state is of the following form , so just picking up the first key name - # "state": { - # "waiting": { - # "reason": "CrashLoopBackOff", - # "message": "Back-off 5m0s restarting failed container=metrics-server pod=metrics-server-2011498749-3g453_kube-system(5953be5f-fcae-11e7-a356-000d3ae0e432)" - # } - # }, - # the below is for accounting 'NodeLost' scenario, where-in the containers in the lost node/pod(s) is still being reported as running - if podReadyCondition == false - record["ContainerStatus"] = "Unknown" - else - record["ContainerStatus"] = containerStatus.keys[0] - end - #TODO : Remove ContainerCreationTimeStamp from here since we are sending it as a metric - #Picking up both container and node start time from cAdvisor to be consistent - if containerStatus.keys[0] == "running" - record["ContainerCreationTimeStamp"] = container["state"]["running"]["startedAt"] - else - if !containerStatus[containerStatus.keys[0]]["reason"].nil? && !containerStatus[containerStatus.keys[0]]["reason"].empty? - record["ContainerStatusReason"] = containerStatus[containerStatus.keys[0]]["reason"] - end - # Process the record to see if job was completed 6 hours ago. If so, send metric to mdm - if !record["ControllerKind"].nil? && record["ControllerKind"].downcase == Constants::CONTROLLER_KIND_JOB - @inventoryToMdmConvertor.process_record_for_terminated_job_metric(record["ControllerName"], record["Namespace"], containerStatus) - end - end - - # Record the last state of the container. This may have information on why a container was killed. - begin - if !container["lastState"].nil? && container["lastState"].keys.length == 1 - lastStateName = container["lastState"].keys[0] - lastStateObject = container["lastState"][lastStateName] - if !lastStateObject.is_a?(Hash) - raise "expected a hash object. 
This could signify a bug or a kubernetes API change" - end - - if lastStateObject.key?("reason") && lastStateObject.key?("startedAt") && lastStateObject.key?("finishedAt") - newRecord = Hash.new - newRecord["lastState"] = lastStateName # get the name of the last state (ex: terminated) - lastStateReason = lastStateObject["reason"] - # newRecord["reason"] = lastStateObject["reason"] # (ex: OOMKilled) - newRecord["reason"] = lastStateReason # (ex: OOMKilled) - newRecord["startedAt"] = lastStateObject["startedAt"] # (ex: 2019-07-02T14:58:51Z) - lastFinishedTime = lastStateObject["finishedAt"] - newRecord["finishedAt"] = lastFinishedTime # (ex: 2019-07-02T14:58:52Z) - - # only write to the output field if everything previously ran without error - record["ContainerLastStatus"] = newRecord - - #Populate mdm metric for OOMKilled container count if lastStateReason is OOMKilled - if lastStateReason.downcase == Constants::REASON_OOM_KILLED - @inventoryToMdmConvertor.process_record_for_oom_killed_metric(record["ControllerName"], record["Namespace"], lastFinishedTime) - end - lastStateReason = nil - else - record["ContainerLastStatus"] = Hash.new - end - else - record["ContainerLastStatus"] = Hash.new - end - - #Populate mdm metric for container restart count if greater than 0 - if (!containerRestartCount.nil? && (containerRestartCount.is_a? Integer) && containerRestartCount > 0) - @inventoryToMdmConvertor.process_record_for_container_restarts_metric(record["ControllerName"], record["Namespace"], lastFinishedTime) - end - rescue => errorStr - $log.warn "Failed in parse_and_emit_record pod inventory while processing ContainerLastStatus: #{errorStr}" - $log.debug_backtrace(errorStr.backtrace) - ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) - record["ContainerLastStatus"] = Hash.new - end + if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && kubePerfEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE + $log.info("in_kube_podinventory::parse_and_emit_records: number of container perf records emitted #{@PODS_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + router.emit_stream(@@kubeperfTag, kubePerfEventStream) if kubePerfEventStream + kubePerfEventStream = MultiEventStream.new + end - podRestartCount += containerRestartCount - records.push(record.dup) - end - else # for unscheduled pods there are no status.containerStatuses, in this case we still want the pod - records.push(record) - end #container status block end - records.each do |record| - if !record.nil? 
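The container GPU block above gathers one extractor call per (dimension, vendor) pair and concats the results into a single array before wrapping. The same fan-out shape in miniature, with extract standing in for KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics:

    # Fan-out sketch; extract is a stand-in, not the real extractor.
    extract = ->(dimension, vendor) do
      [{ "Name" => "containerGpu#{dimension.capitalize}", "Tags" => { "gpuVendor" => vendor } }]
    end

    records = []
    [["requests", "nvidia.com/gpu"], ["limits", "nvidia.com/gpu"],
     ["requests", "amd.com/gpu"], ["limits", "amd.com/gpu"]].each do |dimension, vendor|
      records.concat(extract.call(dimension, vendor))
    end
    puts records.length # => 4, one record set per dimension/vendor pair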
- record["PodRestartCount"] = podRestartCount - wrapper = { - "DataType" => "KUBE_POD_INVENTORY_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [record.each { |k, v| record[k] = v }], - } - eventStream.add(emitTime, wrapper) if wrapper - @inventoryToMdmConvertor.process_pod_inventory_record(wrapper) - end + # container GPU records + containerGPUInsightsMetricsDataItems = [] + containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "requests", "nvidia.com/gpu", "containerGpuRequests", batchTime)) + containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "limits", "nvidia.com/gpu", "containerGpuLimits", batchTime)) + containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "requests", "amd.com/gpu", "containerGpuRequests", batchTime)) + containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "limits", "amd.com/gpu", "containerGpuLimits", batchTime)) + containerGPUInsightsMetricsDataItems.each do |insightsMetricsRecord| + wrapper = { + "DataType" => "INSIGHTS_METRICS_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [insightsMetricsRecord.each { |k, v| insightsMetricsRecord[k] = v }], + } + insightsMetricsEventStream.add(emitTime, wrapper) if wrapper end - # Send container inventory records for containers on windows nodes - @winContainerCount += containerInventoryRecords.length - containerInventoryRecords.each do |cirecord| - if !cirecord.nil? - ciwrapper = { - "DataType" => "CONTAINER_INVENTORY_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [cirecord.each { |k, v| cirecord[k] = v }], - } - eventStream.add(emitTime, ciwrapper) if ciwrapper + + if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && insightsMetricsEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE + $log.info("in_kube_podinventory::parse_and_emit_records: number of GPU insights metrics records emitted #{@PODS_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) + $log.info("kubePodInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") end + router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream + insightsMetricsEventStream = MultiEventStream.new end end #podInventory block end - router.emit_stream(@tag, eventStream) if eventStream + if eventStream.count > 0 + $log.info("in_kube_podinventory::parse_and_emit_records: number of pod inventory records emitted #{eventStream.count} @ #{Time.now.utc.iso8601}") + router.emit_stream(@tag, eventStream) if eventStream + if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) + $log.info("kubePodInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end + eventStream = nil + end + + if kubePerfEventStream.count > 0 + $log.info("in_kube_podinventory::parse_and_emit_records: number of perf records emitted #{kubePerfEventStream.count} @ #{Time.now.utc.iso8601}") + router.emit_stream(@@kubeperfTag, kubePerfEventStream) if kubePerfEventStream + kubePerfEventStream = nil + end + + if insightsMetricsEventStream.count > 0 + $log.info("in_kube_podinventory::parse_and_emit_records: number of insights metrics records emitted #{insightsMetricsEventStream.count} @ #{Time.now.utc.iso8601}") + router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) + $log.info("kubePodInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end + insightsMetricsEventStream = nil + end - if continuationToken.nil? #no more chunks in this batch to be sent, get all pod inventory records to send + if continuationToken.nil? #no more chunks in this batch to be sent, get all mdm pod inventory records to send @log.info "Sending pod inventory mdm records to out_mdm" pod_inventory_mdm_records = @inventoryToMdmConvertor.get_pod_inventory_mdm_records(batchTime) @log.info "pod_inventory_mdm_records.size #{pod_inventory_mdm_records.size}" @@ -401,101 +328,36 @@ def parse_and_emit_records(podInventory, serviceList, continuationToken, batchTi router.emit_stream(@@MDMKubePodInventoryTag, mdm_pod_inventory_es) if mdm_pod_inventory_es end - #:optimize:kubeperf merge - begin - #if(!podInventory.empty?) - containerMetricDataItems = [] - #hostName = (OMS::Common.get_hostname) - containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(podInventory, "requests", "cpu", "cpuRequestNanoCores", batchTime)) - containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(podInventory, "requests", "memory", "memoryRequestBytes", batchTime)) - containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(podInventory, "limits", "cpu", "cpuLimitNanoCores", batchTime)) - containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(podInventory, "limits", "memory", "memoryLimitBytes", batchTime)) - - kubePerfEventStream = MultiEventStream.new - insightsMetricsEventStream = MultiEventStream.new - - containerMetricDataItems.each do |record| - record["DataType"] = "LINUX_PERF_BLOB" - record["IPName"] = "LogManagement" - kubePerfEventStream.add(emitTime, record) if record - end - #end - router.emit_stream(@@kubeperfTag, kubePerfEventStream) if kubePerfEventStream - - begin - #start GPU InsightsMetrics items - - containerGPUInsightsMetricsDataItems = [] - containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(podInventory, "requests", "nvidia.com/gpu", "containerGpuRequests", batchTime)) - containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(podInventory, "limits", "nvidia.com/gpu", "containerGpuLimits", batchTime)) - - containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(podInventory, "requests", "amd.com/gpu", "containerGpuRequests", batchTime)) - 
containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(podInventory, "limits", "amd.com/gpu", "containerGpuLimits", batchTime)) - - containerGPUInsightsMetricsDataItems.each do |insightsMetricsRecord| - wrapper = { - "DataType" => "INSIGHTS_METRICS_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [insightsMetricsRecord.each { |k, v| insightsMetricsRecord[k] = v }], - } - insightsMetricsEventStream.add(emitTime, wrapper) if wrapper - - if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0) - $log.info("kubePodInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") - end - end - - router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream - #end GPU InsightsMetrics items - rescue => errorStr - $log.warn "Failed when processing GPU metrics in_kube_podinventory : #{errorStr}" - $log.debug_backtrace(errorStr.backtrace) - ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) - end - rescue => errorStr - $log.warn "Failed in parse_and_emit_record for KubePerf from in_kube_podinventory : #{errorStr}" - $log.debug_backtrace(errorStr.backtrace) - ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) - end - #:optimize:end kubeperf merge - - #:optimize:start kubeservices merge - begin - if (!serviceList.nil? && !serviceList.empty?) - kubeServicesEventStream = MultiEventStream.new - serviceList["items"].each do |items| - kubeServiceRecord = {} - kubeServiceRecord["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated - kubeServiceRecord["ServiceName"] = items["metadata"]["name"] - kubeServiceRecord["Namespace"] = items["metadata"]["namespace"] - kubeServiceRecord["SelectorLabels"] = [items["spec"]["selector"]] + if continuationToken.nil? # sending kube services inventory records + kubeServicesEventStream = MultiEventStream.new + serviceRecords.each do |kubeServiceRecord| + if !kubeServiceRecord.nil? 
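For reference, a minimal, self-contained sketch of the batch-and-flush pattern used for these event streams (the service-record flush appears just below); a plain Array stands in for Fluent's MultiEventStream, `emit` for router.emit_stream, and the batch size is illustrative:

    # Flush whenever the in-memory batch reaches the configured threshold,
    # then start a fresh batch so already-emitted records can be GC'd.
    BATCH_SIZE = 3
    def emit(batch)
      puts "emitting #{batch.length} records"
    end
    batch = []
    (1..7).each do |record|
      batch << record
      if BATCH_SIZE > 0 && batch.length >= BATCH_SIZE
        emit(batch)
        batch = []
      end
    end
    emit(batch) if batch.length > 0 # flush the remainder
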
+ # adding before emit to reduce memory foot print kubeServiceRecord["ClusterId"] = KubernetesApiClient.getClusterId kubeServiceRecord["ClusterName"] = KubernetesApiClient.getClusterName - kubeServiceRecord["ClusterIP"] = items["spec"]["clusterIP"] - kubeServiceRecord["ServiceType"] = items["spec"]["type"] - # : Add ports and status fields kubeServicewrapper = { "DataType" => "KUBE_SERVICES_BLOB", "IPName" => "ContainerInsights", "DataItems" => [kubeServiceRecord.each { |k, v| kubeServiceRecord[k] = v }], } kubeServicesEventStream.add(emitTime, kubeServicewrapper) if kubeServicewrapper + if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && kubeServicesEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE + $log.info("in_kube_podinventory::parse_and_emit_records: number of service records emitted #{@PODS_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + router.emit_stream(@@kubeservicesTag, kubeServicesEventStream) if kubeServicesEventStream + kubeServicesEventStream = MultiEventStream.new + end end + end + + if kubeServicesEventStream.count > 0 + $log.info("in_kube_podinventory::parse_and_emit_records : number of service records emitted #{kubeServicesEventStream.count} @ #{Time.now.utc.iso8601}") router.emit_stream(@@kubeservicesTag, kubeServicesEventStream) if kubeServicesEventStream end - rescue => errorStr - $log.warn "Failed in parse_and_emit_record for KubeServices from in_kube_podinventory : #{errorStr}" - $log.debug_backtrace(errorStr.backtrace) - ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + kubeServicesEventStream = nil end - #:optimize:end kubeservices merge #Updating value for AppInsights telemetry @podCount += podInventory["items"].length - - if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && eventStream.count > 0) - $log.info("kubePodInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") - end rescue => errorStr $log.warn "Failed in parse_and_emit_record pod inventory: #{errorStr}" $log.debug_backtrace(errorStr.backtrace) @@ -535,25 +397,238 @@ def run_periodic @mutex.unlock end - def getServiceNameFromLabels(namespace, labels, serviceList) + # TODO - move this method to KubernetesClient or helper class + def getPodInventoryRecords(item, serviceRecords, batchTime = Time.utc.iso8601) + records = [] + record = {} + + begin + record["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated + record["Name"] = item["metadata"]["name"] + podNameSpace = item["metadata"]["namespace"] + podUid = KubernetesApiClient.getPodUid(podNameSpace, item["metadata"]) + if podUid.nil? + return records + end + + nodeName = "" + #for unscheduled (non-started) pods nodeName does NOT exist + if !item["spec"]["nodeName"].nil? + nodeName = item["spec"]["nodeName"] + end + # For ARO v3 cluster, skip the pods scheduled on to master or infra nodes + if KubernetesApiClient.isAROv3MasterOrInfraPod(nodeName) + return records + end + + record["PodUid"] = podUid + record["PodLabel"] = [item["metadata"]["labels"]] + record["Namespace"] = podNameSpace + record["PodCreationTimeStamp"] = item["metadata"]["creationTimestamp"] + #for unscheduled (non-started) pods startTime does NOT exist + if !item["status"]["startTime"].nil? + record["PodStartTime"] = item["status"]["startTime"] + else + record["PodStartTime"] = "" + end + #podStatus + # the below is for accounting 'NodeLost' scenario, where-in the pod(s) in the lost node is still being reported as running + podReadyCondition = true + if !item["status"]["reason"].nil? 
&& item["status"]["reason"] == "NodeLost" && !item["status"]["conditions"].nil? + item["status"]["conditions"].each do |condition| + if condition["type"] == "Ready" && condition["status"] == "False" + podReadyCondition = false + break + end + end + end + if podReadyCondition == false + record["PodStatus"] = "Unknown" + # ICM - https://portal.microsofticm.com/imp/v3/incidents/details/187091803/home + elsif !item["metadata"]["deletionTimestamp"].nil? && !item["metadata"]["deletionTimestamp"].empty? + record["PodStatus"] = Constants::POD_STATUS_TERMINATING + else + record["PodStatus"] = item["status"]["phase"] + end + #for unscheduled (non-started) pods podIP does NOT exist + if !item["status"]["podIP"].nil? + record["PodIp"] = item["status"]["podIP"] + else + record["PodIp"] = "" + end + + record["Computer"] = nodeName + record["ClusterId"] = KubernetesApiClient.getClusterId + record["ClusterName"] = KubernetesApiClient.getClusterName + record["ServiceName"] = getServiceNameFromLabels(item["metadata"]["namespace"], item["metadata"]["labels"], serviceRecords) + + if !item["metadata"]["ownerReferences"].nil? + record["ControllerKind"] = item["metadata"]["ownerReferences"][0]["kind"] + record["ControllerName"] = item["metadata"]["ownerReferences"][0]["name"] + @controllerSet.add(record["ControllerKind"] + record["ControllerName"]) + #Adding controller kind to telemetry ro information about customer workload + if (@controllerData[record["ControllerKind"]].nil?) + @controllerData[record["ControllerKind"]] = 1 + else + controllerValue = @controllerData[record["ControllerKind"]] + @controllerData[record["ControllerKind"]] += 1 + end + end + podRestartCount = 0 + record["PodRestartCount"] = 0 + + #Invoke the helper method to compute ready/not ready mdm metric + @inventoryToMdmConvertor.process_record_for_pods_ready_metric(record["ControllerName"], record["Namespace"], item["status"]["conditions"]) + + podContainers = [] + if item["status"].key?("containerStatuses") && !item["status"]["containerStatuses"].empty? + podContainers = podContainers + item["status"]["containerStatuses"] + end + # Adding init containers to the record list as well. + if item["status"].key?("initContainerStatuses") && !item["status"]["initContainerStatuses"].empty? + podContainers = podContainers + item["status"]["initContainerStatuses"] + end + # if items["status"].key?("containerStatuses") && !items["status"]["containerStatuses"].empty? #container status block start + if !podContainers.empty? #container status block start + podContainers.each do |container| + containerRestartCount = 0 + lastFinishedTime = nil + # Need this flag to determine if we need to process container data for mdm metrics like oomkilled and container restart + #container Id is of the form + #docker://dfd9da983f1fd27432fb2c1fe3049c0a1d25b1c697b2dc1a530c986e58b16527 + if !container["containerID"].nil? + record["ContainerID"] = container["containerID"].split("//")[1] + else + # for containers that have image issues (like invalid image/tag etc..) this will be empty. do not make it all 0 + record["ContainerID"] = "" + end + #keeping this as which is same as InstanceName in perf table + if podUid.nil? || container["name"].nil? + next + else + record["ContainerName"] = podUid + "/" + container["name"] + end + #Pod restart count is a sumtotal of restart counts of individual containers + #within the pod. The restart count of a container is maintained by kubernetes + #itself in the form of a container label. 
+ containerRestartCount = container["restartCount"] + record["ContainerRestartCount"] = containerRestartCount + + containerStatus = container["state"] + record["ContainerStatusReason"] = "" + # state is of the following form , so just picking up the first key name + # "state": { + # "waiting": { + # "reason": "CrashLoopBackOff", + # "message": "Back-off 5m0s restarting failed container=metrics-server pod=metrics-server-2011498749-3g453_kube-system(5953be5f-fcae-11e7-a356-000d3ae0e432)" + # } + # }, + # the below is for accounting 'NodeLost' scenario, where-in the containers in the lost node/pod(s) is still being reported as running + if podReadyCondition == false + record["ContainerStatus"] = "Unknown" + else + record["ContainerStatus"] = containerStatus.keys[0] + end + #TODO : Remove ContainerCreationTimeStamp from here since we are sending it as a metric + #Picking up both container and node start time from cAdvisor to be consistent + if containerStatus.keys[0] == "running" + record["ContainerCreationTimeStamp"] = container["state"]["running"]["startedAt"] + else + if !containerStatus[containerStatus.keys[0]]["reason"].nil? && !containerStatus[containerStatus.keys[0]]["reason"].empty? + record["ContainerStatusReason"] = containerStatus[containerStatus.keys[0]]["reason"] + end + # Process the record to see if job was completed 6 hours ago. If so, send metric to mdm + if !record["ControllerKind"].nil? && record["ControllerKind"].downcase == Constants::CONTROLLER_KIND_JOB + @inventoryToMdmConvertor.process_record_for_terminated_job_metric(record["ControllerName"], record["Namespace"], containerStatus) + end + end + + # Record the last state of the container. This may have information on why a container was killed. + begin + if !container["lastState"].nil? && container["lastState"].keys.length == 1 + lastStateName = container["lastState"].keys[0] + lastStateObject = container["lastState"][lastStateName] + if !lastStateObject.is_a?(Hash) + raise "expected a hash object. This could signify a bug or a kubernetes API change" + end + + if lastStateObject.key?("reason") && lastStateObject.key?("startedAt") && lastStateObject.key?("finishedAt") + newRecord = Hash.new + newRecord["lastState"] = lastStateName # get the name of the last state (ex: terminated) + lastStateReason = lastStateObject["reason"] + # newRecord["reason"] = lastStateObject["reason"] # (ex: OOMKilled) + newRecord["reason"] = lastStateReason # (ex: OOMKilled) + newRecord["startedAt"] = lastStateObject["startedAt"] # (ex: 2019-07-02T14:58:51Z) + lastFinishedTime = lastStateObject["finishedAt"] + newRecord["finishedAt"] = lastFinishedTime # (ex: 2019-07-02T14:58:52Z) + + # only write to the output field if everything previously ran without error + record["ContainerLastStatus"] = newRecord + + #Populate mdm metric for OOMKilled container count if lastStateReason is OOMKilled + if lastStateReason.downcase == Constants::REASON_OOM_KILLED + @inventoryToMdmConvertor.process_record_for_oom_killed_metric(record["ControllerName"], record["Namespace"], lastFinishedTime) + end + lastStateReason = nil + else + record["ContainerLastStatus"] = Hash.new + end + else + record["ContainerLastStatus"] = Hash.new + end + + #Populate mdm metric for container restart count if greater than 0 + if (!containerRestartCount.nil? && (containerRestartCount.is_a? 
Integer) && containerRestartCount > 0) + @inventoryToMdmConvertor.process_record_for_container_restarts_metric(record["ControllerName"], record["Namespace"], lastFinishedTime) + end + rescue => errorStr + $log.warn "Failed in parse_and_emit_record pod inventory while processing ContainerLastStatus: #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + record["ContainerLastStatus"] = Hash.new + end + + podRestartCount += containerRestartCount + records.push(record.dup) + end + else # for unscheduled pods there are no status.containerStatuses, in this case we still want the pod + records.push(record) + end #container status block end + + records.each do |record| + if !record.nil? + record["PodRestartCount"] = podRestartCount + end + end + rescue => error + $log.warn("getPodInventoryRecords failed: #{error}") + end + return records + end + + # TODO - move this method to KubernetesClient or helper class + def getServiceNameFromLabels(namespace, labels, serviceRecords) serviceName = "" begin if !labels.nil? && !labels.empty? - if (!serviceList.nil? && !serviceList.empty? && serviceList.key?("items") && !serviceList["items"].empty?) - serviceList["items"].each do |item| - found = 0 - if !item["spec"].nil? && !item["spec"]["selector"].nil? && item["metadata"]["namespace"] == namespace - selectorLabels = item["spec"]["selector"] - if !selectorLabels.empty? - selectorLabels.each do |key, value| - if !(labels.select { |k, v| k == key && v == value }.length > 0) - break - end - found = found + 1 + serviceRecords.each do |kubeServiceRecord| + found = 0 + if kubeServiceRecord["Namespace"] == namespace + selectorLabels = {} + # selector labels wrapped in array in kube service records so unwrapping here + if !kubeServiceRecord["SelectorLabels"].nil? && kubeServiceRecord["SelectorLabels"].length > 0 + selectorLabels = kubeServiceRecord["SelectorLabels"][0] + end + if !selectorLabels.nil? && !selectorLabels.empty? 
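The loop below counts selector labels that match the pod's labels and breaks on the first mismatch, so a pod is attributed to a service only when every selector label matches. A compact, self-contained equivalent of that match rule (the sample labels are illustrative):

    podLabels = { "app" => "web", "tier" => "frontend", "track" => "stable" }
    selectorLabels = { "app" => "web", "tier" => "frontend" }
    # every selector key/value must be present in the pod's labels
    matched = selectorLabels.all? { |k, v| podLabels[k] == v }
    puts matched # => true; a service with an empty selector matches nothing here
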
+ selectorLabels.each do |key, value| + if !(labels.select { |k, v| k == key && v == value }.length > 0) + break end + found = found + 1 end + # service can have no selectors if found == selectorLabels.length - return item["metadata"]["name"] + return kubeServiceRecord["ServiceName"] end end end diff --git a/source/plugins/ruby/in_kubestate_deployments.rb b/source/plugins/ruby/in_kubestate_deployments.rb index bcf397150..27e4709a2 100644 --- a/source/plugins/ruby/in_kubestate_deployments.rb +++ b/source/plugins/ruby/in_kubestate_deployments.rb @@ -2,230 +2,238 @@ # frozen_string_literal: true module Fluent - class Kube_Kubestate_Deployments_Input < Input - Plugin.register_input("kubestatedeployments", self) - @@istestvar = ENV["ISTEST"] - # telemetry - To keep telemetry cost reasonable, we keep track of the max deployments over a period of 15m - @@deploymentsCount = 0 - - - - def initialize - super - require "yajl/json_gem" - require "yajl" - require "date" - require "time" - - require_relative "KubernetesApiClient" - require_relative "oms_common" - require_relative "omslog" - require_relative "ApplicationInsightsUtility" - require_relative "constants" - - # roughly each deployment is 8k - # 1000 deployments account to approximately 8MB - @DEPLOYMENTS_CHUNK_SIZE = 1000 - @DEPLOYMENTS_API_GROUP = "apps" - @@telemetryLastSentTime = DateTime.now.to_time.to_i - - - @deploymentsRunningTotal = 0 - - @NodeName = OMS::Common.get_hostname - @ClusterId = KubernetesApiClient.getClusterId - @ClusterName = KubernetesApiClient.getClusterName - end - - config_param :run_interval, :time, :default => 60 - config_param :tag, :string, :default => Constants::INSIGHTSMETRICS_FLUENT_TAG - - def configure(conf) - super - end - - def start - if @run_interval - @finished = false - @condition = ConditionVariable.new - @mutex = Mutex.new - @thread = Thread.new(&method(:run_periodic)) + class Kube_Kubestate_Deployments_Input < Input + Plugin.register_input("kubestatedeployments", self) + @@istestvar = ENV["ISTEST"] + # telemetry - To keep telemetry cost reasonable, we keep track of the max deployments over a period of 15m + @@deploymentsCount = 0 + + def initialize + super + require "yajl/json_gem" + require "yajl" + require "date" + require "time" + + require_relative "KubernetesApiClient" + require_relative "oms_common" + require_relative "omslog" + require_relative "ApplicationInsightsUtility" + require_relative "constants" + + # refer tomlparser-agent-config for defaults + # this configurable via configmap + @DEPLOYMENTS_CHUNK_SIZE = 0 + + @DEPLOYMENTS_API_GROUP = "apps" + @@telemetryLastSentTime = DateTime.now.to_time.to_i + + @deploymentsRunningTotal = 0 + + @NodeName = OMS::Common.get_hostname + @ClusterId = KubernetesApiClient.getClusterId + @ClusterName = KubernetesApiClient.getClusterName + end + + config_param :run_interval, :time, :default => 60 + config_param :tag, :string, :default => Constants::INSIGHTSMETRICS_FLUENT_TAG + + def configure(conf) + super + end + + def start + if @run_interval + if !ENV["DEPLOYMENTS_CHUNK_SIZE"].nil? && !ENV["DEPLOYMENTS_CHUNK_SIZE"].empty? 
&& ENV["DEPLOYMENTS_CHUNK_SIZE"].to_i > 0 + @DEPLOYMENTS_CHUNK_SIZE = ENV["DEPLOYMENTS_CHUNK_SIZE"].to_i + else + # this shouldnt happen just setting default here as safe guard + $log.warn("in_kubestate_deployments::start: setting to default value since got DEPLOYMENTS_CHUNK_SIZE nil or empty") + @DEPLOYMENTS_CHUNK_SIZE = 500 end + $log.info("in_kubestate_deployments::start : DEPLOYMENTS_CHUNK_SIZE @ #{@DEPLOYMENTS_CHUNK_SIZE}") + + @finished = false + @condition = ConditionVariable.new + @mutex = Mutex.new + @thread = Thread.new(&method(:run_periodic)) end - - def shutdown - if @run_interval - @mutex.synchronize { - @finished = true - @condition.signal - } - @thread.join - end + end + + def shutdown + if @run_interval + @mutex.synchronize { + @finished = true + @condition.signal + } + @thread.join end - - def enumerate - begin - deploymentList = nil - currentTime = Time.now - batchTime = currentTime.utc.iso8601 - - #set the running total for this batch to 0 - @deploymentsRunningTotal = 0 - - # Initializing continuation token to nil - continuationToken = nil - $log.info("in_kubestate_deployments::enumerate : Getting deployments from Kube API @ #{Time.now.utc.iso8601}") - continuationToken, deploymentList = KubernetesApiClient.getResourcesAndContinuationToken("deployments?limit=#{@DEPLOYMENTS_CHUNK_SIZE}", api_group: @DEPLOYMENTS_API_GROUP) - $log.info("in_kubestate_deployments::enumerate : Done getting deployments from Kube API @ #{Time.now.utc.iso8601}") + end + + def enumerate + begin + deploymentList = nil + currentTime = Time.now + batchTime = currentTime.utc.iso8601 + + #set the running total for this batch to 0 + @deploymentsRunningTotal = 0 + + # Initializing continuation token to nil + continuationToken = nil + $log.info("in_kubestate_deployments::enumerate : Getting deployments from Kube API @ #{Time.now.utc.iso8601}") + continuationToken, deploymentList = KubernetesApiClient.getResourcesAndContinuationToken("deployments?limit=#{@DEPLOYMENTS_CHUNK_SIZE}", api_group: @DEPLOYMENTS_API_GROUP) + $log.info("in_kubestate_deployments::enumerate : Done getting deployments from Kube API @ #{Time.now.utc.iso8601}") + if (!deploymentList.nil? && !deploymentList.empty? && deploymentList.key?("items") && !deploymentList["items"].nil? && !deploymentList["items"].empty?) + $log.info("in_kubestate_deployments::enumerate : number of deployment items :#{deploymentList["items"].length} from Kube API @ #{Time.now.utc.iso8601}") + parse_and_emit_records(deploymentList, batchTime) + else + $log.warn "in_kubestate_deployments::enumerate:Received empty deploymentList" + end + + #If we receive a continuation token, make calls, process and flush data until we have processed all data + while (!continuationToken.nil? && !continuationToken.empty?) + continuationToken, deploymentList = KubernetesApiClient.getResourcesAndContinuationToken("deployments?limit=#{@DEPLOYMENTS_CHUNK_SIZE}&continue=#{continuationToken}", api_group: @DEPLOYMENTS_API_GROUP) if (!deploymentList.nil? && !deploymentList.empty? && deploymentList.key?("items") && !deploymentList["items"].nil? && !deploymentList["items"].empty?) 
+ $log.info("in_kubestate_deployments::enumerate : number of deployment items :#{deploymentList["items"].length} from Kube API @ #{Time.now.utc.iso8601}") parse_and_emit_records(deploymentList, batchTime) else $log.warn "in_kubestate_deployments::enumerate:Received empty deploymentList" end - - #If we receive a continuation token, make calls, process and flush data until we have processed all data - while (!continuationToken.nil? && !continuationToken.empty?) - continuationToken, deploymentList = KubernetesApiClient.getResourcesAndContinuationToken("deployments?limit=#{@DEPLOYMENTS_CHUNK_SIZE}&continue=#{continuationToken}", api_group: @DEPLOYMENTS_API_GROUP) - if (!deploymentList.nil? && !deploymentList.empty? && deploymentList.key?("items") && !deploymentList["items"].nil? && !deploymentList["items"].empty?) - parse_and_emit_records(deploymentList, batchTime) - else - $log.warn "in_kubestate_deployments::enumerate:Received empty deploymentList" - end + end + + # Setting this to nil so that we dont hold memory until GC kicks in + deploymentList = nil + + $log.info("successfully emitted a total of #{@deploymentsRunningTotal} kube_state_deployment metrics") + # Flush AppInsights telemetry once all the processing is done, only if the number of events flushed is greater than 0 + if (@deploymentsRunningTotal > @@deploymentsCount) + @@deploymentsCount = @deploymentsRunningTotal + end + if (((DateTime.now.to_time.to_i - @@telemetryLastSentTime).abs) / 60) >= Constants::KUBE_STATE_TELEMETRY_FLUSH_INTERVAL_IN_MINUTES + #send telemetry + $log.info "sending deployemt telemetry..." + ApplicationInsightsUtility.sendMetricTelemetry("MaxDeploymentCount", @@deploymentsCount, {}) + #reset last sent value & time + @@deploymentsCount = 0 + @@telemetryLastSentTime = DateTime.now.to_time.to_i + end + rescue => errorStr + $log.warn "in_kubestate_deployments::enumerate:Failed in enumerate: #{errorStr}" + ApplicationInsightsUtility.sendExceptionTelemetry("in_kubestate_deployments::enumerate:Failed in enumerate: #{errorStr}") + end + end # end enumerate + + def parse_and_emit_records(deployments, batchTime = Time.utc.iso8601) + metricItems = [] + insightsMetricsEventStream = MultiEventStream.new + begin + metricInfo = deployments + metricInfo["items"].each do |deployment| + deploymentName = deployment["metadata"]["name"] + deploymentNameSpace = deployment["metadata"]["namespace"] + deploymentCreatedTime = "" + if !deployment["metadata"]["creationTimestamp"].nil? + deploymentCreatedTime = deployment["metadata"]["creationTimestamp"] + end + deploymentStrategy = "RollingUpdate" #default when not specified as per spec + if !deployment["spec"]["strategy"].nil? && !deployment["spec"]["strategy"]["type"].nil? + deploymentStrategy = deployment["spec"]["strategy"]["type"] end - - # Setting this to nil so that we dont hold memory until GC kicks in - deploymentList = nil - - $log.info("successfully emitted a total of #{@deploymentsRunningTotal} kube_state_deployment metrics") - # Flush AppInsights telemetry once all the processing is done, only if the number of events flushed is greater than 0 - if (@deploymentsRunningTotal > @@deploymentsCount) - @@deploymentsCount = @deploymentsRunningTotal + deploymentSpecReplicas = 1 #default is 1 as per k8s spec + if !deployment["spec"]["replicas"].nil? 
+ deploymentSpecReplicas = deployment["spec"]["replicas"] end - if (((DateTime.now.to_time.to_i - @@telemetryLastSentTime).abs)/60 ) >= Constants::KUBE_STATE_TELEMETRY_FLUSH_INTERVAL_IN_MINUTES - #send telemetry - $log.info "sending deployemt telemetry..." - ApplicationInsightsUtility.sendMetricTelemetry("MaxDeploymentCount", @@deploymentsCount, {}) - #reset last sent value & time - @@deploymentsCount = 0 - @@telemetryLastSentTime = DateTime.now.to_time.to_i + deploymentStatusReadyReplicas = 0 + if !deployment["status"]["readyReplicas"].nil? + deploymentStatusReadyReplicas = deployment["status"]["readyReplicas"] end - rescue => errorStr - $log.warn "in_kubestate_deployments::enumerate:Failed in enumerate: #{errorStr}" - ApplicationInsightsUtility.sendExceptionTelemetry("in_kubestate_deployments::enumerate:Failed in enumerate: #{errorStr}") + deploymentStatusUpToDateReplicas = 0 + if !deployment["status"]["updatedReplicas"].nil? + deploymentStatusUpToDateReplicas = deployment["status"]["updatedReplicas"] + end + deploymentStatusAvailableReplicas = 0 + if !deployment["status"]["availableReplicas"].nil? + deploymentStatusAvailableReplicas = deployment["status"]["availableReplicas"] + end + + metricItem = {} + metricItem["CollectionTime"] = batchTime + metricItem["Computer"] = @NodeName + metricItem["Name"] = Constants::INSIGHTSMETRICS_METRIC_NAME_KUBE_STATE_DEPLOYMENT_STATE + metricItem["Value"] = deploymentStatusReadyReplicas + metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN + metricItem["Namespace"] = Constants::INSIGHTSMETRICS_TAGS_KUBESTATE_NAMESPACE + + metricTags = {} + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID] = @ClusterId + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = @ClusterName + metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_NAME] = deploymentName + metricTags[Constants::INSIGHTSMETRICS_TAGS_K8SNAMESPACE] = deploymentNameSpace + metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_STRATEGY] = deploymentStrategy + metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_CREATIONTIME] = deploymentCreatedTime + metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_SPEC_REPLICAS] = deploymentSpecReplicas + metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_STATUS_REPLICAS_UPDATED] = deploymentStatusUpToDateReplicas + metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_STATUS_REPLICAS_AVAILABLE] = deploymentStatusAvailableReplicas + + metricItem["Tags"] = metricTags + + metricItems.push(metricItem) end - end # end enumerate - - def parse_and_emit_records(deployments, batchTime = Time.utc.iso8601) - metricItems = [] - insightsMetricsEventStream = MultiEventStream.new - begin - metricInfo = deployments - metricInfo["items"].each do |deployment| - deploymentName = deployment["metadata"]["name"] - deploymentNameSpace = deployment["metadata"]["namespace"] - deploymentCreatedTime = "" - if !deployment["metadata"]["creationTimestamp"].nil? - deploymentCreatedTime = deployment["metadata"]["creationTimestamp"] - end - deploymentStrategy = "RollingUpdate" #default when not specified as per spec - if !deployment["spec"]["strategy"].nil? && !deployment["spec"]["strategy"]["type"].nil? - deploymentStrategy = deployment["spec"]["strategy"]["type"] - end - deploymentSpecReplicas = 1 #default is 1 as per k8s spec - if !deployment["spec"]["replicas"].nil? 
- deploymentSpecReplicas = deployment["spec"]["replicas"] - end - deploymentStatusReadyReplicas = 0 - if !deployment["status"]["readyReplicas"].nil? - deploymentStatusReadyReplicas = deployment["status"]["readyReplicas"] - end - deploymentStatusUpToDateReplicas = 0 - if !deployment["status"]["updatedReplicas"].nil? - deploymentStatusUpToDateReplicas = deployment["status"]["updatedReplicas"] - end - deploymentStatusAvailableReplicas = 0 - if !deployment["status"]["availableReplicas"].nil? - deploymentStatusAvailableReplicas = deployment["status"]["availableReplicas"] - end - - metricItem = {} - metricItem["CollectionTime"] = batchTime - metricItem["Computer"] = @NodeName - metricItem["Name"] = Constants::INSIGHTSMETRICS_METRIC_NAME_KUBE_STATE_DEPLOYMENT_STATE - metricItem["Value"] = deploymentStatusReadyReplicas - metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN - metricItem["Namespace"] = Constants::INSIGHTSMETRICS_TAGS_KUBESTATE_NAMESPACE - - metricTags = {} - metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID] = @ClusterId - metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = @ClusterName - metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_NAME] = deploymentName - metricTags[Constants::INSIGHTSMETRICS_TAGS_K8SNAMESPACE] = deploymentNameSpace - metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_STRATEGY ] = deploymentStrategy - metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_CREATIONTIME] = deploymentCreatedTime - metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_SPEC_REPLICAS] = deploymentSpecReplicas - metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_STATUS_REPLICAS_UPDATED] = deploymentStatusUpToDateReplicas - metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_DEPLOYMENT_STATUS_REPLICAS_AVAILABLE] = deploymentStatusAvailableReplicas - - - metricItem["Tags"] = metricTags - - metricItems.push(metricItem) - end - - time = Time.now.to_f - metricItems.each do |insightsMetricsRecord| - wrapper = { - "DataType" => "INSIGHTS_METRICS_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [insightsMetricsRecord.each { |k, v| insightsMetricsRecord[k] = v }], - } - insightsMetricsEventStream.add(time, wrapper) if wrapper - end - - router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream - $log.info("successfully emitted #{metricItems.length()} kube_state_deployment metrics") - @deploymentsRunningTotal = @deploymentsRunningTotal + metricItems.length() - if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0) - $log.info("kubestatedeploymentsInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") - end - rescue => error - $log.warn("in_kubestate_deployments::parse_and_emit_records failed: #{error} ") - ApplicationInsightsUtility.sendExceptionTelemetry("in_kubestate_deployments::parse_and_emit_records failed: #{error}") + + time = Time.now.to_f + metricItems.each do |insightsMetricsRecord| + wrapper = { + "DataType" => "INSIGHTS_METRICS_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [insightsMetricsRecord.each { |k, v| insightsMetricsRecord[k] = v }], + } + insightsMetricsEventStream.add(time, wrapper) if wrapper + end + + router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream + $log.info("successfully emitted #{metricItems.length()} kube_state_deployment metrics") + + @deploymentsRunningTotal = @deploymentsRunningTotal + metricItems.length() + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0) + $log.info("kubestatedeploymentsInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") end - + rescue => error + $log.warn("in_kubestate_deployments::parse_and_emit_records failed: #{error} ") + ApplicationInsightsUtility.sendExceptionTelemetry("in_kubestate_deployments::parse_and_emit_records failed: #{error}") end - - def run_periodic - @mutex.lock + end + + def run_periodic + @mutex.lock + done = @finished + @nextTimeToRun = Time.now + @waitTimeout = @run_interval + until done + @nextTimeToRun = @nextTimeToRun + @run_interval + @now = Time.now + if @nextTimeToRun <= @now + @waitTimeout = 1 + @nextTimeToRun = @now + else + @waitTimeout = @nextTimeToRun - @now + end + @condition.wait(@mutex, @waitTimeout) done = @finished - @nextTimeToRun = Time.now - @waitTimeout = @run_interval - until done - @nextTimeToRun = @nextTimeToRun + @run_interval - @now = Time.now - if @nextTimeToRun <= @now - @waitTimeout = 1 - @nextTimeToRun = @now - else - @waitTimeout = @nextTimeToRun - @now - end - @condition.wait(@mutex, @waitTimeout) - done = @finished - @mutex.unlock - if !done - begin - $log.info("in_kubestate_deployments::run_periodic.enumerate.start @ #{Time.now.utc.iso8601}") - enumerate - $log.info("in_kubestate_deployments::run_periodic.enumerate.end @ #{Time.now.utc.iso8601}") - rescue => errorStr - $log.warn "in_kubestate_deployments::run_periodic: enumerate Failed to retrieve kube deployments: #{errorStr}" - ApplicationInsightsUtility.sendExceptionTelemetry("in_kubestate_deployments::run_periodic: enumerate Failed to retrieve kube deployments: #{errorStr}") - end + @mutex.unlock + if !done + begin + $log.info("in_kubestate_deployments::run_periodic.enumerate.start @ #{Time.now.utc.iso8601}") + enumerate + $log.info("in_kubestate_deployments::run_periodic.enumerate.end @ #{Time.now.utc.iso8601}") + rescue => errorStr + $log.warn "in_kubestate_deployments::run_periodic: enumerate Failed to retrieve kube deployments: #{errorStr}" + ApplicationInsightsUtility.sendExceptionTelemetry("in_kubestate_deployments::run_periodic: enumerate Failed to retrieve kube deployments: #{errorStr}") end - @mutex.lock end - @mutex.unlock + @mutex.lock end + @mutex.unlock end -end \ No newline at end of file + end +end diff --git a/source/plugins/ruby/in_kubestate_hpa.rb b/source/plugins/ruby/in_kubestate_hpa.rb index 3ce63a75a..afecf8e3b 100644 --- a/source/plugins/ruby/in_kubestate_hpa.rb +++ 
b/source/plugins/ruby/in_kubestate_hpa.rb @@ -2,231 +2,236 @@ # frozen_string_literal: true module Fluent - class Kube_Kubestate_HPA_Input < Input - Plugin.register_input("kubestatehpa", self) - @@istestvar = ENV["ISTEST"] - - - def initialize - super - require "yajl/json_gem" - require "yajl" - require "time" - - require_relative "KubernetesApiClient" - require_relative "oms_common" - require_relative "omslog" - require_relative "ApplicationInsightsUtility" - require_relative "constants" - - # roughly each HPA is 3k - # 2000 HPAs account to approximately 6-7MB - @HPA_CHUNK_SIZE = 2000 - @HPA_API_GROUP = "autoscaling" - - # telemetry - @hpaCount = 0 - - @NodeName = OMS::Common.get_hostname - @ClusterId = KubernetesApiClient.getClusterId - @ClusterName = KubernetesApiClient.getClusterName - end - - config_param :run_interval, :time, :default => 60 - config_param :tag, :string, :default => Constants::INSIGHTSMETRICS_FLUENT_TAG - - def configure(conf) - super - end - - def start - if @run_interval - @finished = false - @condition = ConditionVariable.new - @mutex = Mutex.new - @thread = Thread.new(&method(:run_periodic)) + class Kube_Kubestate_HPA_Input < Input + Plugin.register_input("kubestatehpa", self) + @@istestvar = ENV["ISTEST"] + + def initialize + super + require "yajl/json_gem" + require "yajl" + require "time" + + require_relative "KubernetesApiClient" + require_relative "oms_common" + require_relative "omslog" + require_relative "ApplicationInsightsUtility" + require_relative "constants" + + # refer tomlparser-agent-config for defaults + # this configurable via configmap + @HPA_CHUNK_SIZE = 0 + + @HPA_API_GROUP = "autoscaling" + + # telemetry + @hpaCount = 0 + + @NodeName = OMS::Common.get_hostname + @ClusterId = KubernetesApiClient.getClusterId + @ClusterName = KubernetesApiClient.getClusterName + end + + config_param :run_interval, :time, :default => 60 + config_param :tag, :string, :default => Constants::INSIGHTSMETRICS_FLUENT_TAG + + def configure(conf) + super + end + + def start + if @run_interval + if !ENV["HPA_CHUNK_SIZE"].nil? && !ENV["HPA_CHUNK_SIZE"].empty? 
&& ENV["HPA_CHUNK_SIZE"].to_i > 0 + @HPA_CHUNK_SIZE = ENV["HPA_CHUNK_SIZE"].to_i + else + # this shouldnt happen just setting default here as safe guard + $log.warn("in_kubestate_hpa::start: setting to default value since got HPA_CHUNK_SIZE nil or empty") + @HPA_CHUNK_SIZE = 2000 end + $log.info("in_kubestate_hpa::start : HPA_CHUNK_SIZE @ #{@HPA_CHUNK_SIZE}") + + @finished = false + @condition = ConditionVariable.new + @mutex = Mutex.new + @thread = Thread.new(&method(:run_periodic)) end - - def shutdown - if @run_interval - @mutex.synchronize { - @finished = true - @condition.signal - } - @thread.join - end + end + + def shutdown + if @run_interval + @mutex.synchronize { + @finished = true + @condition.signal + } + @thread.join end - - def enumerate - begin - hpaList = nil - currentTime = Time.now - batchTime = currentTime.utc.iso8601 - - @hpaCount = 0 - - # Initializing continuation token to nil - continuationToken = nil - $log.info("in_kubestate_hpa::enumerate : Getting HPAs from Kube API @ #{Time.now.utc.iso8601}") - continuationToken, hpaList = KubernetesApiClient.getResourcesAndContinuationToken("horizontalpodautoscalers?limit=#{@HPA_CHUNK_SIZE}", api_group: @HPA_API_GROUP) - $log.info("in_kubestate_hpa::enumerate : Done getting HPAs from Kube API @ #{Time.now.utc.iso8601}") + end + + def enumerate + begin + hpaList = nil + currentTime = Time.now + batchTime = currentTime.utc.iso8601 + + @hpaCount = 0 + + # Initializing continuation token to nil + continuationToken = nil + $log.info("in_kubestate_hpa::enumerate : Getting HPAs from Kube API @ #{Time.now.utc.iso8601}") + continuationToken, hpaList = KubernetesApiClient.getResourcesAndContinuationToken("horizontalpodautoscalers?limit=#{@HPA_CHUNK_SIZE}", api_group: @HPA_API_GROUP) + $log.info("in_kubestate_hpa::enumerate : Done getting HPAs from Kube API @ #{Time.now.utc.iso8601}") + if (!hpaList.nil? && !hpaList.empty? && hpaList.key?("items") && !hpaList["items"].nil? && !hpaList["items"].empty?) + parse_and_emit_records(hpaList, batchTime) + else + $log.warn "in_kubestate_hpa::enumerate:Received empty hpaList" + end + + #If we receive a continuation token, make calls, process and flush data until we have processed all data + while (!continuationToken.nil? && !continuationToken.empty?) + continuationToken, hpaList = KubernetesApiClient.getResourcesAndContinuationToken("horizontalpodautoscalers?limit=#{@HPA_CHUNK_SIZE}&continue=#{continuationToken}", api_group: @HPA_API_GROUP) if (!hpaList.nil? && !hpaList.empty? && hpaList.key?("items") && !hpaList["items"].nil? && !hpaList["items"].empty?) parse_and_emit_records(hpaList, batchTime) else $log.warn "in_kubestate_hpa::enumerate:Received empty hpaList" end - - #If we receive a continuation token, make calls, process and flush data until we have processed all data - while (!continuationToken.nil? && !continuationToken.empty?) - continuationToken, hpaList = KubernetesApiClient.getResourcesAndContinuationToken("horizontalpodautoscalers?limit=#{@HPA_CHUNK_SIZE}&continue=#{continuationToken}", api_group: @HPA_API_GROUP) - if (!hpaList.nil? && !hpaList.empty? && hpaList.key?("items") && !hpaList["items"].nil? && !hpaList["items"].empty?) 
- parse_and_emit_records(hpaList, batchTime) - else - $log.warn "in_kubestate_hpa::enumerate:Received empty hpaList" + end + + # Setting this to nil so that we dont hold memory until GC kicks in + hpaList = nil + + # Flush AppInsights telemetry once all the processing is done, only if the number of events flushed is greater than 0 + if (@hpaCount > 0) + # this will not be a useful telemetry, as hpa counts will not be huge, just log for now + $log.info("in_kubestate_hpa::hpaCount= #{hpaCount}") + #ApplicationInsightsUtility.sendMetricTelemetry("HPACount", @hpaCount, {}) + end + rescue => errorStr + $log.warn "in_kubestate_hpa::enumerate:Failed in enumerate: #{errorStr}" + ApplicationInsightsUtility.sendExceptionTelemetry("in_kubestate_hpa::enumerate:Failed in enumerate: #{errorStr}") + end + end # end enumerate + + def parse_and_emit_records(hpas, batchTime = Time.utc.iso8601) + metricItems = [] + insightsMetricsEventStream = MultiEventStream.new + begin + metricInfo = hpas + metricInfo["items"].each do |hpa| + hpaName = hpa["metadata"]["name"] + hpaNameSpace = hpa["metadata"]["namespace"] + hpaCreatedTime = "" + if !hpa["metadata"]["creationTimestamp"].nil? + hpaCreatedTime = hpa["metadata"]["creationTimestamp"] + end + hpaSpecMinReplicas = 1 #default is 1 as per k8s spec + if !hpa["spec"]["minReplicas"].nil? + hpaSpecMinReplicas = hpa["spec"]["minReplicas"] + end + hpaSpecMaxReplicas = 0 + if !hpa["spec"]["maxReplicas"].nil? + hpaSpecMaxReplicas = hpa["spec"]["maxReplicas"] + end + hpaSpecScaleTargetKind = "" + hpaSpecScaleTargetName = "" + if !hpa["spec"]["scaleTargetRef"].nil? + if !hpa["spec"]["scaleTargetRef"]["kind"].nil? + hpaSpecScaleTargetKind = hpa["spec"]["scaleTargetRef"]["kind"] + end + if !hpa["spec"]["scaleTargetRef"]["name"].nil? + hpaSpecScaleTargetName = hpa["spec"]["scaleTargetRef"]["name"] end end - - # Setting this to nil so that we dont hold memory until GC kicks in - hpaList = nil - - # Flush AppInsights telemetry once all the processing is done, only if the number of events flushed is greater than 0 - if (@hpaCount > 0) - # this will not be a useful telemetry, as hpa counts will not be huge, just log for now - $log.info("in_kubestate_hpa::hpaCount= #{hpaCount}") - #ApplicationInsightsUtility.sendMetricTelemetry("HPACount", @hpaCount, {}) + hpaStatusCurrentReplicas = 0 + if !hpa["status"]["currentReplicas"].nil? + hpaStatusCurrentReplicas = hpa["status"]["currentReplicas"] end - rescue => errorStr - $log.warn "in_kubestate_hpa::enumerate:Failed in enumerate: #{errorStr}" - ApplicationInsightsUtility.sendExceptionTelemetry("in_kubestate_hpa::enumerate:Failed in enumerate: #{errorStr}") + hpaStatusDesiredReplicas = 0 + if !hpa["status"]["desiredReplicas"].nil? + hpaStatusDesiredReplicas = hpa["status"]["desiredReplicas"] + end + + hpaStatuslastScaleTime = "" + if !hpa["status"]["lastScaleTime"].nil? 
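For orientation, the shape of one HPA InsightsMetrics item as assembled below, with literal sample values standing in for the Constants:: lookups; the metric name, tag keys, and node name here are assumptions, since the real values live in constants.rb:

    require "time"
    metricItem = {
      "CollectionTime" => Time.now.utc.iso8601,
      "Computer" => "aks-nodepool1-12345678-0",      # assumed node name
      "Name" => "kube_hpa_status_current_replicas",  # assumed constant value
      "Value" => 3,                                  # hpaStatusCurrentReplicas
      "Tags" => {
        "hpaName" => "web-hpa", "k8sNamespace" => "default",  # assumed tag keys
        "spec_max_replicas" => 10, "status_desired_replicas" => 3,
      },
    }
    puts metricItem
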
+ hpaStatuslastScaleTime = hpa["status"]["lastScaleTime"] + end + + metricItem = {} + metricItem["CollectionTime"] = batchTime + metricItem["Computer"] = @NodeName + metricItem["Name"] = Constants::INSIGHTSMETRICS_METRIC_NAME_KUBE_STATE_HPA_STATE + metricItem["Value"] = hpaStatusCurrentReplicas + metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN + metricItem["Namespace"] = Constants::INSIGHTSMETRICS_TAGS_KUBESTATE_NAMESPACE + + metricTags = {} + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID] = @ClusterId + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = @ClusterName + metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_NAME] = hpaName + metricTags[Constants::INSIGHTSMETRICS_TAGS_K8SNAMESPACE] = hpaNameSpace + metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_CREATIONTIME] = hpaCreatedTime + metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_MIN_REPLICAS] = hpaSpecMinReplicas + metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_MAX_REPLICAS] = hpaSpecMaxReplicas + metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_SCALE_TARGET_KIND] = hpaSpecScaleTargetKind + metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_SCALE_TARGET_NAME] = hpaSpecScaleTargetName + metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_STATUS_DESIRED_REPLICAS] = hpaStatusDesiredReplicas + metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_STATUS_LAST_SCALE_TIME] = hpaStatuslastScaleTime + + metricItem["Tags"] = metricTags + + metricItems.push(metricItem) end - end # end enumerate - - def parse_and_emit_records(hpas, batchTime = Time.utc.iso8601) - metricItems = [] - insightsMetricsEventStream = MultiEventStream.new - begin - metricInfo = hpas - metricInfo["items"].each do |hpa| - hpaName = hpa["metadata"]["name"] - hpaNameSpace = hpa["metadata"]["namespace"] - hpaCreatedTime = "" - if !hpa["metadata"]["creationTimestamp"].nil? - hpaCreatedTime = hpa["metadata"]["creationTimestamp"] - end - hpaSpecMinReplicas = 1 #default is 1 as per k8s spec - if !hpa["spec"]["minReplicas"].nil? - hpaSpecMinReplicas = hpa["spec"]["minReplicas"] - end - hpaSpecMaxReplicas = 0 - if !hpa["spec"]["maxReplicas"].nil? - hpaSpecMaxReplicas = hpa["spec"]["maxReplicas"] - end - hpaSpecScaleTargetKind = "" - hpaSpecScaleTargetName = "" - if !hpa["spec"]["scaleTargetRef"].nil? - if !hpa["spec"]["scaleTargetRef"]["kind"].nil? - hpaSpecScaleTargetKind = hpa["spec"]["scaleTargetRef"]["kind"] - end - if !hpa["spec"]["scaleTargetRef"]["name"].nil? - hpaSpecScaleTargetName = hpa["spec"]["scaleTargetRef"]["name"] - end - - end - hpaStatusCurrentReplicas = 0 - if !hpa["status"]["currentReplicas"].nil? - hpaStatusCurrentReplicas = hpa["status"]["currentReplicas"] - end - hpaStatusDesiredReplicas = 0 - if !hpa["status"]["desiredReplicas"].nil? - hpaStatusDesiredReplicas = hpa["status"]["desiredReplicas"] - end - - hpaStatuslastScaleTime = "" - if !hpa["status"]["lastScaleTime"].nil? 
- hpaStatuslastScaleTime = hpa["status"]["lastScaleTime"] - end - - - metricItem = {} - metricItem["CollectionTime"] = batchTime - metricItem["Computer"] = @NodeName - metricItem["Name"] = Constants::INSIGHTSMETRICS_METRIC_NAME_KUBE_STATE_HPA_STATE - metricItem["Value"] = hpaStatusCurrentReplicas - metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN - metricItem["Namespace"] = Constants::INSIGHTSMETRICS_TAGS_KUBESTATE_NAMESPACE - - metricTags = {} - metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID] = @ClusterId - metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = @ClusterName - metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_NAME] = hpaName - metricTags[Constants::INSIGHTSMETRICS_TAGS_K8SNAMESPACE] = hpaNameSpace - metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_CREATIONTIME] = hpaCreatedTime - metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_MIN_REPLICAS] = hpaSpecMinReplicas - metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_MAX_REPLICAS] = hpaSpecMaxReplicas - metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_SCALE_TARGET_KIND] = hpaSpecScaleTargetKind - metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_SPEC_SCALE_TARGET_NAME] = hpaSpecScaleTargetName - metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_STATUS_DESIRED_REPLICAS] = hpaStatusDesiredReplicas - metricTags[Constants::INSIGHTSMETRICS_TAGS_KUBE_STATE_HPA_STATUS_LAST_SCALE_TIME] = hpaStatuslastScaleTime - - - metricItem["Tags"] = metricTags - - metricItems.push(metricItem) - end - time = Time.now.to_f - metricItems.each do |insightsMetricsRecord| - wrapper = { - "DataType" => "INSIGHTS_METRICS_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [insightsMetricsRecord.each { |k, v| insightsMetricsRecord[k] = v }], - } - insightsMetricsEventStream.add(time, wrapper) if wrapper - end - - router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream - $log.info("successfully emitted #{metricItems.length()} kube_state_hpa metrics") - if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0) - $log.info("kubestatehpaInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") - end - rescue => error - $log.warn("in_kubestate_hpa::parse_and_emit_records failed: #{error} ") - ApplicationInsightsUtility.sendExceptionTelemetry("in_kubestate_hpa::parse_and_emit_records failed: #{error}") + time = Time.now.to_f + metricItems.each do |insightsMetricsRecord| + wrapper = { + "DataType" => "INSIGHTS_METRICS_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [insightsMetricsRecord.each { |k, v| insightsMetricsRecord[k] = v }], + } + insightsMetricsEventStream.add(time, wrapper) if wrapper + end + + router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream + $log.info("successfully emitted #{metricItems.length()} kube_state_hpa metrics") + if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0) + $log.info("kubestatehpaInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") end - + rescue => error + $log.warn("in_kubestate_hpa::parse_and_emit_records failed: #{error} ") + ApplicationInsightsUtility.sendExceptionTelemetry("in_kubestate_hpa::parse_and_emit_records failed: #{error}") end - - def run_periodic - @mutex.lock + end + + def run_periodic + @mutex.lock + done = @finished + @nextTimeToRun = Time.now + @waitTimeout = @run_interval + until done + @nextTimeToRun = @nextTimeToRun + @run_interval + @now = Time.now + if @nextTimeToRun <= @now + @waitTimeout = 1 + @nextTimeToRun = @now + else + @waitTimeout = @nextTimeToRun - @now + end + @condition.wait(@mutex, @waitTimeout) done = @finished - @nextTimeToRun = Time.now - @waitTimeout = @run_interval - until done - @nextTimeToRun = @nextTimeToRun + @run_interval - @now = Time.now - if @nextTimeToRun <= @now - @waitTimeout = 1 - @nextTimeToRun = @now - else - @waitTimeout = @nextTimeToRun - @now - end - @condition.wait(@mutex, @waitTimeout) - done = @finished - @mutex.unlock - if !done - begin - $log.info("in_kubestate_hpa::run_periodic.enumerate.start @ #{Time.now.utc.iso8601}") - enumerate - $log.info("in_kubestate_hpa::run_periodic.enumerate.end @ #{Time.now.utc.iso8601}") - rescue => errorStr - $log.warn "in_kubestate_hpa::run_periodic: enumerate Failed to retrieve kube hpas: #{errorStr}" - ApplicationInsightsUtility.sendExceptionTelemetry("in_kubestate_hpa::run_periodic: enumerate Failed to retrieve kube hpas: #{errorStr}") - end + @mutex.unlock + if !done + begin + $log.info("in_kubestate_hpa::run_periodic.enumerate.start @ #{Time.now.utc.iso8601}") + enumerate + $log.info("in_kubestate_hpa::run_periodic.enumerate.end @ #{Time.now.utc.iso8601}") + rescue => errorStr + $log.warn "in_kubestate_hpa::run_periodic: enumerate Failed to retrieve kube hpas: #{errorStr}" + ApplicationInsightsUtility.sendExceptionTelemetry("in_kubestate_hpa::run_periodic: enumerate Failed to retrieve kube hpas: #{errorStr}") end - @mutex.lock end - @mutex.unlock + @mutex.lock end + @mutex.unlock end -end \ No newline at end of file + end +end From 9cb058c850cbfd8ed88910920cf3055b8066061b Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Fri, 18 Dec 2020 14:24:37 -0800 Subject: [PATCH 050/194] Gangams/enable arc onboarding to ff (#478) * wip * updates * trigger login if the ctx cloud not same as specified cloud * add missed commit --- .../onboarding/managed/disable-monitoring.ps1 | 34 ++++++++++++--- .../onboarding/managed/disable-monitoring.sh | 17 ++++++++ .../onboarding/managed/enable-monitoring.ps1 | 43 ++++++++++++++++--- .../onboarding/managed/enable-monitoring.sh | 38 +++++++++++++--- .../onboarding/managed/upgrade-monitoring.sh | 19 +++++++- 5 files changed, 130 insertions(+), 21 deletions(-) diff --git a/scripts/onboarding/managed/disable-monitoring.ps1 b/scripts/onboarding/managed/disable-monitoring.ps1 index 1c011bfff..bcd135dba 100644 --- a/scripts/onboarding/managed/disable-monitoring.ps1 +++ b/scripts/onboarding/managed/disable-monitoring.ps1 @@ -15,6 +15,8 @@ tenantId of the service principal which will be used for the azure login .PARAMETER kubeContext (optional) kube-context of the k8 cluster to install Azure Monitor for containers HELM chart + .PARAMETER azureCloudName (optional) + Name of the Azure cloud name. 
Supported Azure cloud Name is AzureCloud or AzureUSGovernment Pre-requisites: - Azure Managed cluster Resource Id @@ -34,7 +36,9 @@ param( [Parameter(mandatory = $false)] [string]$tenantId, [Parameter(mandatory = $false)] - [string]$kubeContext + [string]$kubeContext, + [Parameter(mandatory = $false)] + [string]$azureCloudName ) $helmChartReleaseName = "azmon-containers-release-1" @@ -46,6 +50,21 @@ $isAksCluster = $false $isAroV4Cluster = $false $isUsingServicePrincipal = $false +if ([string]::IsNullOrEmpty($azureCloudName) -eq $true) { + Write-Host("Azure cloud name parameter not passed in so using default cloud as AzureCloud") + $azureCloudName = "AzureCloud" +} else { + if(($azureCloudName.ToLower() -eq "azurecloud" ) -eq $true) { + Write-Host("Specified Azure Cloud name is : $azureCloudName") + } elseif (($azureCloudName.ToLower() -eq "azureusgovernment" ) -eq $true) { + Write-Host("Specified Azure Cloud name is : $azureCloudName") + } else { + Write-Host("Specified Azure Cloud name is : $azureCloudName") + Write-Host("Only supported Azure clouds are : AzureCloud and AzureUSGovernment") + exit + } +} + # checks the required Powershell modules exist and if not exists, request the user permission to install $azAccountModule = Get-Module -ListAvailable -Name Az.Accounts $azResourcesModule = Get-Module -ListAvailable -Name Az.Resources @@ -226,14 +245,19 @@ Write-Host("Cluster SubscriptionId : '" + $clusterSubscriptionId + "' ") -Foregr if ($isUsingServicePrincipal) { $spSecret = ConvertTo-SecureString -String $servicePrincipalClientSecret -AsPlainText -Force $spCreds = New-Object -TypeName "System.Management.Automation.PSCredential" -ArgumentList $servicePrincipalClientId,$spSecret - Connect-AzAccount -ServicePrincipal -Credential $spCreds -Tenant $tenantId -Subscription $clusterSubscriptionId + Connect-AzAccount -ServicePrincipal -Credential $spCreds -Tenant $tenantId -Subscription $clusterSubscriptionId -Environment $azureCloudName } try { Write-Host("") Write-Host("Trying to get the current Az login context...") $account = Get-AzContext -ErrorAction Stop - Write-Host("Successfully fetched current AzContext context...") -ForegroundColor Green + $ctxCloud = $account.Environment.Name + if(($azureCloudName.ToLower() -eq $ctxCloud.ToLower() ) -eq $false) { + Write-Host("Specified azure cloud name is not same as current context cloud hence setting account to null to retrigger the login" ) -ForegroundColor Green + $account = $null + } + Write-Host("Successfully fetched current AzContext context and azure cloud name: $azureCloudName" ) -ForegroundColor Green Write-Host("") } catch { @@ -249,10 +273,10 @@ if ($null -eq $account.Account) { if ($isUsingServicePrincipal) { $spSecret = ConvertTo-SecureString -String $servicePrincipalClientSecret -AsPlainText -Force $spCreds = New-Object -TypeName "System.Management.Automation.PSCredential" -ArgumentList $servicePrincipalClientId,$spSecret - Connect-AzAccount -ServicePrincipal -Credential $spCreds -Tenant $tenantId -Subscription $clusterSubscriptionId + Connect-AzAccount -ServicePrincipal -Credential $spCreds -Tenant $tenantId -Subscription $clusterSubscriptionId -Environment $azureCloudName } else { Write-Host("Please login...") - Connect-AzAccount -subscriptionid $clusterSubscriptionId + Connect-AzAccount -subscriptionid $clusterSubscriptionId -Environment $azureCloudName } } catch { diff --git a/scripts/onboarding/managed/disable-monitoring.sh b/scripts/onboarding/managed/disable-monitoring.sh index c11426f30..d43a79f51 100644 --- 
a/scripts/onboarding/managed/disable-monitoring.sh +++ b/scripts/onboarding/managed/disable-monitoring.sh @@ -280,10 +280,27 @@ done } +validate_and_configure_supported_cloud() { + echo "get active azure cloud name configured to azure cli" + azureCloudName=$(az cloud show --query name -o tsv | tr "[:upper:]" "[:lower:]") + echo "active azure cloud name configured to azure cli: ${azureCloudName}" + if [ "$isArcK8sCluster" = true ]; then + if [ "$azureCloudName" != "azurecloud" -a "$azureCloudName" != "azureusgovernment" ]; then + echo "-e only supported clouds are AzureCloud and AzureUSGovernment for Azure Arc enabled Kubernetes cluster type" + exit 1 + fi + else + # For ARO v4, only supported cloud is public so just configure to public to keep the existing behavior + configure_to_public_cloud + fi +} # parse args parse_args $@ +# validate and configure azure cloud +validate_and_configure_supported_cloud + # parse cluster resource id clusterSubscriptionId="$(echo $clusterResourceId | cut -d'/' -f3 | tr "[:upper:]" "[:lower:]")" clusterResourceGroup="$(echo $clusterResourceId | cut -d'/' -f5)" diff --git a/scripts/onboarding/managed/enable-monitoring.ps1 b/scripts/onboarding/managed/enable-monitoring.ps1 index b052f22c5..7b128b112 100644 --- a/scripts/onboarding/managed/enable-monitoring.ps1 +++ b/scripts/onboarding/managed/enable-monitoring.ps1 @@ -22,6 +22,8 @@ .PARAMETER proxyEndpoint (optional) Provide Proxy endpoint if you have K8s cluster behind the proxy and would like to route Azure Monitor for containers outbound traffic via proxy. Format of the proxy endpoint should be http(s://:@: + .PARAMETER azureCloudName (optional) + Name of the Azure cloud name. Supported Azure cloud Name is AzureCloud or AzureUSGovernment Pre-requisites: - Azure Managed cluster Resource Id @@ -46,7 +48,9 @@ param( [Parameter(mandatory = $false)] [string]$workspaceResourceId, [Parameter(mandatory = $false)] - [string]$proxyEndpoint + [string]$proxyEndpoint, + [Parameter(mandatory = $false)] + [string]$azureCloudName ) $solutionTemplateUri = "https://raw.githubusercontent.com/microsoft/Docker-Provider/ci_dev/scripts/onboarding/templates/azuremonitor-containerSolution.json" @@ -63,6 +67,24 @@ $mcr = "mcr.microsoft.com" $mcrChartVersion = "2.7.9" $mcrChartRepoPath = "azuremonitor/containerinsights/preview/azuremonitor-containers" $helmLocalRepoName = "." 
+$omsAgentDomainName="opinsights.azure.com" + +if ([string]::IsNullOrEmpty($azureCloudName) -eq $true) { + Write-Host("Azure cloud name parameter not passed in so using default cloud as AzureCloud") + $azureCloudName = "AzureCloud" +} else { + if(($azureCloudName.ToLower() -eq "azurecloud" ) -eq $true) { + Write-Host("Specified Azure Cloud name is : $azureCloudName") + $omsAgentDomainName="opinsights.azure.com" + } elseif (($azureCloudName.ToLower() -eq "azureusgovernment" ) -eq $true) { + Write-Host("Specified Azure Cloud name is : $azureCloudName") + $omsAgentDomainName="opinsights.azure.us" + } else { + Write-Host("Specified Azure Cloud name is : $azureCloudName") + Write-Host("Only supported azure clouds are : AzureCloud and AzureUSGovernment") + exit + } +} # checks the required Powershell modules exist and if not exists, request the user permission to install $azAccountModule = Get-Module -ListAvailable -Name Az.Accounts @@ -244,14 +266,19 @@ Write-Host("Cluster SubscriptionId : '" + $clusterSubscriptionId + "' ") -Foregr if ($isUsingServicePrincipal) { $spSecret = ConvertTo-SecureString -String $servicePrincipalClientSecret -AsPlainText -Force $spCreds = New-Object -TypeName "System.Management.Automation.PSCredential" -ArgumentList $servicePrincipalClientId, $spSecret - Connect-AzAccount -ServicePrincipal -Credential $spCreds -Tenant $tenantId -Subscription $clusterSubscriptionId + Connect-AzAccount -ServicePrincipal -Credential $spCreds -Tenant $tenantId -Subscription $clusterSubscriptionId -Environment $azureCloudName } try { Write-Host("") Write-Host("Trying to get the current Az login context...") $account = Get-AzContext -ErrorAction Stop - Write-Host("Successfully fetched current AzContext context...") -ForegroundColor Green + $ctxCloud = $account.Environment.Name + if(($azureCloudName.ToLower() -eq $ctxCloud.ToLower() ) -eq $false) { + Write-Host("Specified azure cloud name is not same as current context cloud hence setting account to null to retrigger the login" ) -ForegroundColor Green + $account = $null + } + Write-Host("Successfully fetched current AzContext context and azure cloud name: $azureCloudName" ) -ForegroundColor Green Write-Host("") } catch { @@ -266,11 +293,12 @@ if ($null -eq $account.Account) { if ($isUsingServicePrincipal) { $spSecret = ConvertTo-SecureString -String $servicePrincipalClientSecret -AsPlainText -Force $spCreds = New-Object -TypeName "System.Management.Automation.PSCredential" -ArgumentList $servicePrincipalClientId, $spSecret - Connect-AzAccount -ServicePrincipal -Credential $spCreds -Tenant $tenantId -Subscription $clusterSubscriptionId + + Connect-AzAccount -ServicePrincipal -Credential $spCreds -Tenant $tenantId -Subscription $clusterSubscriptionId -Environment $azureCloudName } else { Write-Host("Please login...") - Connect-AzAccount -subscriptionid $clusterSubscriptionId + Connect-AzAccount -subscriptionid $clusterSubscriptionId -Environment $azureCloudName } } catch { @@ -380,7 +408,8 @@ if ([string]::IsNullOrEmpty($workspaceResourceId)) { "westeurope" = "westeurope" ; "westindia" = "centralindia" ; "westus" = "westus" ; - "westus2" = "westus2" + "westus2" = "westus2"; + "usgovvirginia" = "usgovvirginia" } $workspaceRegionCode = "EUS" @@ -531,7 +560,7 @@ try { Write-Host("helmChartRepoPath is : ${helmChartRepoPath}") - $helmParameters = "omsagent.secret.wsid=$workspaceGUID,omsagent.secret.key=$workspacePrimarySharedKey,omsagent.env.clusterId=$clusterResourceId,omsagent.env.clusterRegion=$clusterRegion" + $helmParameters = 
"omsagent.domain=$omsAgentDomainName,omsagent.secret.wsid=$workspaceGUID,omsagent.secret.key=$workspacePrimarySharedKey,omsagent.env.clusterId=$clusterResourceId,omsagent.env.clusterRegion=$clusterRegion" if ([string]::IsNullOrEmpty($proxyEndpoint) -eq $false) { Write-Host("using proxy endpoint since its provided") $helmParameters = $helmParameters + ",omsagent.proxy=$proxyEndpoint" diff --git a/scripts/onboarding/managed/enable-monitoring.sh b/scripts/onboarding/managed/enable-monitoring.sh index bb6974258..85428aff7 100644 --- a/scripts/onboarding/managed/enable-monitoring.sh +++ b/scripts/onboarding/managed/enable-monitoring.sh @@ -38,8 +38,10 @@ set -e set -o pipefail -# default to public cloud since only supported cloud is azure public clod +# default to public cloud since only supported cloud is azure public cloud defaultAzureCloud="AzureCloud" +# default domain will be for public cloud +omsAgentDomainName="opinsights.azure.com" # released chart version in mcr mcrChartVersion="2.7.9" @@ -307,6 +309,25 @@ parse_args() { } +validate_and_configure_supported_cloud() { + echo "get active azure cloud name configured to azure cli" + azureCloudName=$(az cloud show --query name -o tsv | tr "[:upper:]" "[:lower:]") + echo "active azure cloud name configured to azure cli: ${azureCloudName}" + if [ "$isArcK8sCluster" = true ]; then + if [ "$azureCloudName" != "azurecloud" -a "$azureCloudName" != "azureusgovernment" ]; then + echo "-e only supported clouds are AzureCloud and AzureUSGovernment for Azure Arc enabled Kubernetes cluster type" + exit 1 + fi + if [ "$azureCloudName" = "azureusgovernment" ]; then + echo "setting omsagent domain as opinsights.azure.us since the azure cloud is azureusgovernment " + omsAgentDomainName="opinsights.azure.us" + fi + else + # For ARO v4, only supported cloud is public so just configure to public to keep the existing behavior + configure_to_public_cloud + fi +} + configure_to_public_cloud() { echo "Set AzureCloud as active cloud for az cli" az cloud set -n $defaultAzureCloud @@ -398,8 +419,10 @@ create_default_log_analytics_workspace() { [westindia]=centralindia [westus]=westus [westus2]=westus2 + [usgovvirginia]=usgovvirginia ) + echo "cluster Region:"$clusterRegion if [ -n "${AzureCloudRegionToOmsRegionMap[$clusterRegion]}" ]; then workspaceRegion=${AzureCloudRegionToOmsRegionMap[$clusterRegion]} fi @@ -433,6 +456,7 @@ create_default_log_analytics_workspace() { workspaceResourceId=$(az resource show -g $workspaceResourceGroup -n $workspaceName --resource-type $workspaceResourceProvider --query id) workspaceResourceId=$(echo $workspaceResourceId | tr -d '"') + echo "workspace resource Id: ${workspaceResourceId}" } add_container_insights_solution() { @@ -504,18 +528,18 @@ install_helm_chart() { echo "using proxy endpoint since proxy configuration passed in" if [ -z "$kubeconfigContext" ]; then echo "using current kube-context since --kube-context/-k parameter not passed in" - helm upgrade --install $releaseName --set omsagent.proxy=$proxyEndpoint,omsagent.secret.wsid=$workspaceGuid,omsagent.secret.key=$workspaceKey,omsagent.env.clusterId=$clusterResourceId,omsagent.env.clusterRegion=$clusterRegion $helmChartRepoPath + helm upgrade --install $releaseName --set omsagent.domain=$omsAgentDomainName,omsagent.proxy=$proxyEndpoint,omsagent.secret.wsid=$workspaceGuid,omsagent.secret.key=$workspaceKey,omsagent.env.clusterId=$clusterResourceId,omsagent.env.clusterRegion=$clusterRegion $helmChartRepoPath else echo "using --kube-context:${kubeconfigContext} since passed 
in" - helm upgrade --install $releaseName --set omsagent.proxy=$proxyEndpoint,omsagent.secret.wsid=$workspaceGuid,omsagent.secret.key=$workspaceKey,omsagent.env.clusterId=$clusterResourceId,omsagent.env.clusterRegion=$clusterRegion $helmChartRepoPath --kube-context ${kubeconfigContext} + helm upgrade --install $releaseName --set omsagent.domain=$omsAgentDomainName,omsagent.proxy=$proxyEndpoint,omsagent.secret.wsid=$workspaceGuid,omsagent.secret.key=$workspaceKey,omsagent.env.clusterId=$clusterResourceId,omsagent.env.clusterRegion=$clusterRegion $helmChartRepoPath --kube-context ${kubeconfigContext} fi else if [ -z "$kubeconfigContext" ]; then echo "using current kube-context since --kube-context/-k parameter not passed in" - helm upgrade --install $releaseName --set omsagent.secret.wsid=$workspaceGuid,omsagent.secret.key=$workspaceKey,omsagent.env.clusterId=$clusterResourceId,omsagent.env.clusterRegion=$clusterRegion $helmChartRepoPath + helm upgrade --install $releaseName --set omsagent.domain=$omsAgentDomainName,omsagent.secret.wsid=$workspaceGuid,omsagent.secret.key=$workspaceKey,omsagent.env.clusterId=$clusterResourceId,omsagent.env.clusterRegion=$clusterRegion $helmChartRepoPath else echo "using --kube-context:${kubeconfigContext} since passed in" - helm upgrade --install $releaseName --set omsagent.secret.wsid=$workspaceGuid,omsagent.secret.key=$workspaceKey,omsagent.env.clusterId=$clusterResourceId,omsagent.env.clusterRegion=$clusterRegion $helmChartRepoPath --kube-context ${kubeconfigContext} + helm upgrade --install $releaseName --set omsagent.domain=$omsAgentDomainName,omsagent.secret.wsid=$workspaceGuid,omsagent.secret.key=$workspaceKey,omsagent.env.clusterId=$clusterResourceId,omsagent.env.clusterRegion=$clusterRegion $helmChartRepoPath --kube-context ${kubeconfigContext} fi fi @@ -560,8 +584,8 @@ enable_aks_monitoring_addon() { # parse and validate args parse_args $@ -# configure azure cli for public cloud -configure_to_public_cloud +# validate and configure azure cli for cloud +validate_and_configure_supported_cloud # parse cluster resource id clusterSubscriptionId="$(echo $clusterResourceId | cut -d'/' -f3 | tr "[:upper:]" "[:lower:]")" diff --git a/scripts/onboarding/managed/upgrade-monitoring.sh b/scripts/onboarding/managed/upgrade-monitoring.sh index 11ecf6819..847bf84ea 100644 --- a/scripts/onboarding/managed/upgrade-monitoring.sh +++ b/scripts/onboarding/managed/upgrade-monitoring.sh @@ -281,11 +281,26 @@ set_azure_subscription() { echo "successfully configured subscription id: ${subscriptionId} as current subscription for the azure cli" } +validate_and_configure_supported_cloud() { + echo "get active azure cloud name configured to azure cli" + azureCloudName=$(az cloud show --query name -o tsv | tr "[:upper:]" "[:lower:]") + echo "active azure cloud name configured to azure cli: ${azureCloudName}" + if [ "$isArcK8sCluster" = true ]; then + if [ "$azureCloudName" != "azurecloud" -a "$azureCloudName" != "azureusgovernment" ]; then + echo "-e only supported clouds are AzureCloud and AzureUSGovernment for Azure Arc enabled Kubernetes cluster type" + exit 1 + fi + else + # For ARO v4, only supported cloud is public so just configure to public to keep the existing behavior + configure_to_public_cloud + fi +} + # parse and validate args parse_args $@ -# configure azure cli for public cloud -configure_to_public_cloud +# configure azure cli for cloud +validate_and_configure_supported_cloud # parse cluster resource id clusterSubscriptionId="$(echo $clusterResourceId | cut -d'/' 
-f3 | tr "[:upper:]" "[:lower:]")" From ef9d726c7053fba0254fc897aff124e5a5a2be34 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Mon, 4 Jan 2021 10:43:44 -0800 Subject: [PATCH 051/194] Convert PV type dictionary to json for telemetry so it shows up in logs (#480) --- source/plugins/ruby/in_kube_pvinventory.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/plugins/ruby/in_kube_pvinventory.rb b/source/plugins/ruby/in_kube_pvinventory.rb index b0e09c85b..861b3a8e1 100644 --- a/source/plugins/ruby/in_kube_pvinventory.rb +++ b/source/plugins/ruby/in_kube_pvinventory.rb @@ -90,7 +90,7 @@ def enumerate # Flush AppInsights telemetry once all the processing is done if telemetryFlush == true telemetryProperties = {} - telemetryProperties["CountsOfPVTypes"] = @pvTypeToCountHash + telemetryProperties["CountsOfPVTypes"] = @pvTypeToCountHash.to_json ApplicationInsightsUtility.sendCustomEvent(Constants::PV_INVENTORY_HEART_BEAT_EVENT, telemetryProperties) @@pvTelemetryTimeTracker = DateTime.now.to_time.to_i end From 97bdb94ad95234202ec2eca172cf419b5cee82d5 Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Wed, 6 Jan 2021 09:59:49 -0800 Subject: [PATCH 052/194] fix 2 windows tasks - 1) Dont log to termination log 2) enable ADX route for containerlogs in windows (for O365) (#482) --- build/common/installer/scripts/tomlparser.rb | 2 +- .../installer/scripts/livenessprobe.cmd | 24 +++++++------------ 2 files changed, 9 insertions(+), 17 deletions(-) diff --git a/build/common/installer/scripts/tomlparser.rb b/build/common/installer/scripts/tomlparser.rb index 7235ee0c3..1d33da124 100644 --- a/build/common/installer/scripts/tomlparser.rb +++ b/build/common/installer/scripts/tomlparser.rb @@ -244,7 +244,7 @@ def get_command_windows(env_variable_name, env_variable_value) file.write(commands) commands = get_command_windows('AZMON_CLUSTER_COLLECT_ALL_KUBE_EVENTS', @collectAllKubeEvents) file.write(commands) - commands = get_command_windows('AZMON_CONTAINER_LOGS_ROUTE', @containerLogsRoute) + commands = get_command_windows('AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE', @containerLogsRoute) file.write(commands) # Close file after writing all environment variables diff --git a/build/windows/installer/scripts/livenessprobe.cmd b/build/windows/installer/scripts/livenessprobe.cmd index 06d577f31..19d0b69d7 100644 --- a/build/windows/installer/scripts/livenessprobe.cmd +++ b/build/windows/installer/scripts/livenessprobe.cmd @@ -1,40 +1,32 @@ -echo "Checking if fluent-bit is running" +REM "Checking if fluent-bit is running" tasklist /fi "imagename eq fluent-bit.exe" /fo "table" | findstr fluent-bit IF ERRORLEVEL 1 ( - echo "Fluent-Bit is not running" > /dev/termination-log + echo "Fluent-Bit is not running" exit /b 1 -) ELSE ( - echo "Fluent-Bit is running" ) -echo "Checking if config map has been updated since agent start" +REM "Checking if config map has been updated since agent start" IF EXIST C:\etc\omsagentwindows\filesystemwatcher.txt ( - echo "Config Map Updated since agent started" > /dev/termination-log + echo "Config Map Updated since agent started" exit /b 1 -) ELSE ( - echo "Config Map not Updated since agent start" ) -echo "Checking if certificate needs to be renewed (aka agent restart required)" +REM "Checking if certificate needs to be renewed (aka agent restart required)" IF EXIST C:\etc\omsagentwindows\renewcertificate.txt ( - echo "Certificate needs to be renewed" > /dev/termination-log + echo "Certificate needs to be renewed" exit /b 1 -) ELSE ( - echo "Certificate does NOT need to be 
renewd" ) -echo "Checking if fluentd service is running" +REM "Checking if fluentd service is running" sc query fluentdwinaks | findstr /i STATE | findstr RUNNING IF ERRORLEVEL 1 ( - echo "Fluentd Service is NOT Running" > /dev/termination-log + echo "Fluentd Service is NOT Running" exit /b 1 -) ELSE ( - echo "Fluentd Service is Running" ) exit /b 0 From 94237beba5671904945a676d156c609118c0b2d7 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Wed, 6 Jan 2021 13:58:22 -0800 Subject: [PATCH 053/194] fix ci envvar collection in large pods (#483) --- .../ruby/kubernetes_container_inventory.rb | 43 +++++++++++++------ 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/source/plugins/ruby/kubernetes_container_inventory.rb b/source/plugins/ruby/kubernetes_container_inventory.rb index 4fe728579..ba6a9af42 100644 --- a/source/plugins/ruby/kubernetes_container_inventory.rb +++ b/source/plugins/ruby/kubernetes_container_inventory.rb @@ -193,25 +193,41 @@ def obtainContainerEnvironmentVars(containerId) $log.info("KubernetesContainerInventory::obtainContainerEnvironmentVars @ #{Time.now.utc.iso8601}") envValueString = "" begin - unless @@containerCGroupCache.has_key?(containerId) + isCGroupPidFetchRequired = false + if !@@containerCGroupCache.has_key?(containerId) + isCGroupPidFetchRequired = true + else + cGroupPid = @@containerCGroupCache[containerId] + if cGroupPid.nil? || cGroupPid.empty? + isCGroupPidFetchRequired = true + @@containerCGroupCache.delete(containerId) + elsif !File.exist?("/hostfs/proc/#{cGroupPid}/environ") + isCGroupPidFetchRequired = true + @@containerCGroupCache.delete(containerId) + end + end + + if isCGroupPidFetchRequired $log.info("KubernetesContainerInventory::obtainContainerEnvironmentVars fetching cGroup parent pid @ #{Time.now.utc.iso8601} for containerId: #{containerId}") Dir["/hostfs/proc/*/cgroup"].each do |filename| begin - if File.file?(filename) && File.foreach(filename).grep(/#{containerId}/).any? + if File.file?(filename) && File.exist?(filename) && File.foreach(filename).grep(/#{containerId}/).any? # file full path is /hostfs/proc//cgroup - cGroupPid = filename.split("/")[3] - if @@containerCGroupCache.has_key?(containerId) - tempCGroupPid = @@containerCGroupCache[containerId] - if tempCGroupPid > cGroupPid + cGroupPid = filename.split("/")[3] + if is_number?(cGroupPid) + if @@containerCGroupCache.has_key?(containerId) + tempCGroupPid = @@containerCGroupCache[containerId] + if tempCGroupPid.to_i > cGroupPid.to_i + @@containerCGroupCache[containerId] = cGroupPid + end + else @@containerCGroupCache[containerId] = cGroupPid - end - else - @@containerCGroupCache[containerId] = cGroupPid + end end end - rescue SystemCallError # ignore Error::ENOENT,Errno::ESRCH which is expected if any of the container gone while we read - end - end + rescue SystemCallError # ignore Error::ENOENT,Errno::ESRCH which is expected if any of the container gone while we read + end + end end cGroupPid = @@containerCGroupCache[containerId] if !cGroupPid.nil? && !cGroupPid.empty? @@ -341,5 +357,8 @@ def deleteCGroupCacheEntryForDeletedContainer(containerId) ApplicationInsightsUtility.sendExceptionTelemetry(error) end end + def is_number?(value) + true if Integer(value) rescue false + end end end From aacd496eeba6350ec0d028334813df7edc806a5e Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Thu, 7 Jan 2021 13:39:17 -0800 Subject: [PATCH 054/194] grwehner/jan agent tasks (#481) - Windows agent fix to use log filtering settings in config map. 
- Error handling for kubelet_utils get_node_capacity in case /metrics/cadvisor endpoint fails. - Remove env variable for workspace key for windows agent --- build/common/installer/scripts/tomlparser.rb | 2 +- .../installer/certificategenerator/Program.cs | 8 +++----- build/windows/installer/conf/fluent.conf | 12 ++++++++++-- kubernetes/windows/main.ps1 | 12 ++---------- source/plugins/ruby/filter_cadvisor2mdm.rb | 12 ++++++++++-- 5 files changed, 26 insertions(+), 20 deletions(-) diff --git a/build/common/installer/scripts/tomlparser.rb b/build/common/installer/scripts/tomlparser.rb index 1d33da124..fe26f639e 100644 --- a/build/common/installer/scripts/tomlparser.rb +++ b/build/common/installer/scripts/tomlparser.rb @@ -228,7 +228,7 @@ def get_command_windows(env_variable_name, env_variable_value) file.write(commands) commands = get_command_windows('AZMON_LOG_TAIL_PATH', @logTailPath) file.write(commands) - commands = get_command_windows('AZMON_LOG_EXCLUSION_REGEX_PATTERN', @stdoutExcludeNamespaces) + commands = get_command_windows('AZMON_LOG_EXCLUSION_REGEX_PATTERN', @logExclusionRegexPattern) file.write(commands) commands = get_command_windows('AZMON_STDOUT_EXCLUDED_NAMESPACES', @stdoutExcludeNamespaces) file.write(commands) diff --git a/build/windows/installer/certificategenerator/Program.cs b/build/windows/installer/certificategenerator/Program.cs index 43063c4be..e24d0e303 100644 --- a/build/windows/installer/certificategenerator/Program.cs +++ b/build/windows/installer/certificategenerator/Program.cs @@ -414,14 +414,12 @@ static void Main(string[] args) try { - if (!String.IsNullOrEmpty(Environment.GetEnvironmentVariable("WSKEY"))) - { - logAnalyticsWorkspaceSharedKey = Environment.GetEnvironmentVariable("WSKEY"); - } + // WSKEY isn't stored as an environment variable + logAnalyticsWorkspaceSharedKey = File.ReadAllText("C:/etc/omsagent-secret/KEY").Trim(); } catch (Exception ex) { - Console.WriteLine("Failed to read env variables (WSKEY)" + ex.Message); + Console.WriteLine("Failed to read secret (WSKEY)" + ex.Message); } try diff --git a/build/windows/installer/conf/fluent.conf b/build/windows/installer/conf/fluent.conf index c96300b1e..d5eb475ca 100644 --- a/build/windows/installer/conf/fluent.conf +++ b/build/windows/installer/conf/fluent.conf @@ -6,7 +6,8 @@ @type tail - path /var/log/containers/*.log + path "#{ENV['AZMON_LOG_TAIL_PATH']}" + exclude_path "#{ENV['AZMON_CLUSTER_LOG_TAIL_EXCLUDE_PATH']}" pos_file /var/opt/microsoft/fluent/fluentd-containers.log.pos tag oms.container.log.la @log_level trace @@ -28,6 +29,14 @@ @include fluent-docker-parser.conf + + @type grep + + key stream + pattern "#{ENV['AZMON_LOG_EXCLUSION_REGEX_PATTERN']}" + + + @type record_transformer # fluent-plugin-record-modifier more light-weight but needs to be installed (dependency worth it?)
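The Program.cs and main.ps1 changes in this patch share one idea: the workspace shared key is read from the mounted secret on demand instead of being exported into the process environment, where every child process could read it. A rough bash analogue of the pattern (the mount path matches the one used in the patch; the helper itself is illustrative):

    # Sketch: read the workspace key from the mounted secret only when needed,
    # rather than keeping it in an environment variable.
    get_workspace_key() {
      local key_file="/etc/omsagent-secret/KEY"
      if [ -f "$key_file" ]; then
        tr -d '[:space:]' < "$key_file"
      else
        echo "workspace key secret not mounted at $key_file" >&2
        return 1
      fi
    }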
@@ -37,7 +46,6 @@ - @type forward send_timeout 60s diff --git a/kubernetes/windows/main.ps1 b/kubernetes/windows/main.ps1 index d32e5068a..a297e3801 100644 --- a/kubernetes/windows/main.ps1 +++ b/kubernetes/windows/main.ps1 @@ -64,19 +64,11 @@ function Set-EnvironmentVariables { $wsID = Get-Content /etc/omsagent-secret/WSID } - # Set DOMAIN + # Set WSID [System.Environment]::SetEnvironmentVariable("WSID", $wsID, "Process") [System.Environment]::SetEnvironmentVariable("WSID", $wsID, "Machine") - $wsKey = "" - if (Test-Path /etc/omsagent-secret/KEY) { - # TODO: Change to omsagent-secret before merging - $wsKey = Get-Content /etc/omsagent-secret/KEY - } - - # Set KEY - [System.Environment]::SetEnvironmentVariable("WSKEY", $wsKey, "Process") - [System.Environment]::SetEnvironmentVariable("WSKEY", $wsKey, "Machine") + # Don't store WSKEY as environment variable $proxy = "" if (Test-Path /etc/omsagent-secret/PROXY) { diff --git a/source/plugins/ruby/filter_cadvisor2mdm.rb b/source/plugins/ruby/filter_cadvisor2mdm.rb index 2423ad024..8d7e729c8 100644 --- a/source/plugins/ruby/filter_cadvisor2mdm.rb +++ b/source/plugins/ruby/filter_cadvisor2mdm.rb @@ -308,8 +308,16 @@ def ensure_cpu_memory_capacity_set end elsif controller_type.downcase == "daemonset" capacity_from_kubelet = KubeletUtils.get_node_capacity - @cpu_capacity = capacity_from_kubelet[0] - @memory_capacity = capacity_from_kubelet[1] + + # Error handling in case /metrics/cadvsior endpoint fails + if !capacity_from_kubelet.nil? && capacity_from_kubelet.length > 1 + @cpu_capacity = capacity_from_kubelet[0] + @memory_capacity = capacity_from_kubelet[1] + else + # cpu_capacity and memory_capacity keep initialized value of 0.0 + @log.error "Error getting capacity_from_kubelet: cpu_capacity and memory_capacity" + end + end end From 148d73974a003aba7f77f93389c59aede4679b49 Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Thu, 7 Jan 2021 18:38:06 -0800 Subject: [PATCH 055/194] updating fbit version and cpu limit (#485) --- kubernetes/linux/setup.sh | 2 +- kubernetes/omsagent.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kubernetes/linux/setup.sh b/kubernetes/linux/setup.sh index fb41d4782..88e9da4dd 100644 --- a/kubernetes/linux/setup.sh +++ b/kubernetes/linux/setup.sh @@ -71,7 +71,7 @@ chmod 777 /opt/telegraf wget -qO - https://packages.fluentbit.io/fluentbit.key | sudo apt-key add - sudo echo "deb https://packages.fluentbit.io/ubuntu/xenial xenial main" >> /etc/apt/sources.list sudo apt-get update -sudo apt-get install td-agent-bit=1.4.2 -y +sudo apt-get install td-agent-bit=1.6.9 -y rm -rf $TMPDIR/omsbundle rm -f $TMPDIR/omsagent*.sh diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 013e2a6c0..563955968 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -368,7 +368,7 @@ spec: imagePullPolicy: IfNotPresent resources: limits: - cpu: 250m + cpu: 500m memory: 600Mi requests: cpu: 75m From bd33dd9f23cfc5c569e83d9389b2d0064757f5be Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Fri, 8 Jan 2021 13:47:25 -0800 Subject: [PATCH 056/194] reverting to older version (#487) --- kubernetes/linux/setup.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kubernetes/linux/setup.sh b/kubernetes/linux/setup.sh index 88e9da4dd..352be06d7 100644 --- a/kubernetes/linux/setup.sh +++ b/kubernetes/linux/setup.sh @@ -71,7 +71,7 @@ chmod 777 /opt/telegraf wget -qO - https://packages.fluentbit.io/fluentbit.key | sudo apt-key add - sudo echo "deb 
https://packages.fluentbit.io/ubuntu/xenial xenial main" >> /etc/apt/sources.list sudo apt-get update -sudo apt-get install td-agent-bit=1.6.9 -y +sudo apt-get install td-agent-bit=1.6.8 -y rm -rf $TMPDIR/omsbundle rm -f $TMPDIR/omsagent*.sh From d5164d235dd2512824f679ddbe30ebafdf8f1a14 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 11 Jan 2021 11:48:50 -0800 Subject: [PATCH 057/194] Gangams/add fbsettings configurable via configmap (#486) * wip * fbit config settings * add config warn message * handle one config provided but not other * fixed pr feedback * fix copy paste error * rename config parameter names * fix typo * fix fbit crash in helm path * fix nil check --- .../scripts/td-agent-bit-conf-customizer.rb | 11 +++-- .../scripts/tomlparser-agent-config.rb | 48 +++++++++++++++++++ kubernetes/linux/main.sh | 1 + 3 files changed, 57 insertions(+), 3 deletions(-) diff --git a/build/common/installer/scripts/td-agent-bit-conf-customizer.rb b/build/common/installer/scripts/td-agent-bit-conf-customizer.rb index fae3acb36..35b71e550 100644 --- a/build/common/installer/scripts/td-agent-bit-conf-customizer.rb +++ b/build/common/installer/scripts/td-agent-bit-conf-customizer.rb @@ -18,12 +18,17 @@ def substituteFluentBitPlaceHolders bufferChunkSize = ENV["FBIT_TAIL_BUFFER_CHUNK_SIZE"] bufferMaxSize = ENV["FBIT_TAIL_BUFFER_MAX_SIZE"] - serviceInterval = (!interval.nil? && is_number?(interval)) ? interval : @default_service_interval + serviceInterval = (!interval.nil? && is_number?(interval) && interval.to_i > 0 ) ? interval : @default_service_interval serviceIntervalSetting = "Flush " + serviceInterval - tailBufferChunkSize = (!bufferChunkSize.nil? && is_number?(bufferChunkSize)) ? bufferChunkSize : nil + tailBufferChunkSize = (!bufferChunkSize.nil? && is_number?(bufferChunkSize) && bufferChunkSize.to_i > 0) ? bufferChunkSize : nil - tailBufferMaxSize = (!bufferMaxSize.nil? && is_number?(bufferMaxSize)) ? bufferMaxSize : nil + tailBufferMaxSize = (!bufferMaxSize.nil? && is_number?(bufferMaxSize) && bufferMaxSize.to_i > 0) ? bufferMaxSize : nil + + if ((!tailBufferChunkSize.nil? && tailBufferMaxSize.nil?) || (!tailBufferChunkSize.nil? && !tailBufferMaxSize.nil? && tailBufferChunkSize.to_i > tailBufferMaxSize.to_i)) + puts "config:warn buffer max size must be greater or equal to chunk size" + tailBufferMaxSize = tailBufferChunkSize + end text = File.read(@td_agent_bit_conf_path) new_contents = text.gsub("${SERVICE_FLUSH_INTERVAL}", serviceIntervalSetting) diff --git a/build/linux/installer/scripts/tomlparser-agent-config.rb b/build/linux/installer/scripts/tomlparser-agent-config.rb index 87c5194ed..e587909e5 100644 --- a/build/linux/installer/scripts/tomlparser-agent-config.rb +++ b/build/linux/installer/scripts/tomlparser-agent-config.rb @@ -55,6 +55,12 @@ @podsEmitStreamBatchSizeMin = 50 @nodesEmitStreamBatchSizeMin = 50 +# configmap settings related fbit config +@fbitFlushIntervalSecs = 0 +@fbitTailBufferChunkSizeMBs = 0 +@fbitTailBufferMaxSizeMBs = 0 + + def is_number?(value) true if Integer(value) rescue false end @@ -131,6 +137,38 @@ def populateSettingValuesFromConfigMap(parsedConfig) puts "Using config map value: NODES_EMIT_STREAM_BATCH_SIZE = #{@nodesEmitStreamBatchSize}" end end + # fbit config settings + fbit_config = parsedConfig[:agent_settings][:fbit_config] + if !fbit_config.nil? + fbitFlushIntervalSecs = fbit_config[:log_flush_interval_secs] + if !fbitFlushIntervalSecs.nil? 
&& is_number?(fbitFlushIntervalSecs) && fbitFlushIntervalSecs.to_i > 0 + @fbitFlushIntervalSecs = fbitFlushIntervalSecs.to_i + puts "Using config map value: log_flush_interval_secs = #{@fbitFlushIntervalSecs}" + end + + fbitTailBufferChunkSizeMBs = fbit_config[:tail_buf_chunksize_megabytes] + if !fbitTailBufferChunkSizeMBs.nil? && is_number?(fbitTailBufferChunkSizeMBs) && fbitTailBufferChunkSizeMBs.to_i > 0 + @fbitTailBufferChunkSizeMBs = fbitTailBufferChunkSizeMBs.to_i + puts "Using config map value: tail_buf_chunksize_megabytes = #{@fbitTailBufferChunkSizeMBs}" + end + + fbitTailBufferMaxSizeMBs = fbit_config[:tail_buf_maxsize_megabytes] + if !fbitTailBufferMaxSizeMBs.nil? && is_number?(fbitTailBufferMaxSizeMBs) && fbitTailBufferMaxSizeMBs.to_i > 0 + if fbitTailBufferMaxSizeMBs.to_i >= @fbitTailBufferChunkSizeMBs + @fbitTailBufferMaxSizeMBs = fbitTailBufferMaxSizeMBs.to_i + puts "Using config map value: tail_buf_maxsize_megabytes = #{@fbitTailBufferMaxSizeMBs}" + else + # tail_buf_maxsize_megabytes has to be greater or equal to tail_buf_chunksize_megabytes + @fbitTailBufferMaxSizeMBs = @fbitTailBufferChunkSizeMBs + puts "config::warn: tail_buf_maxsize_megabytes must be greater or equal to value of tail_buf_chunksize_megabytes. Using tail_buf_maxsize_megabytes = #{@fbitTailBufferMaxSizeMBs} since provided config value not valid" + end + end + # in scenario - tail_buf_chunksize_megabytes provided but not tail_buf_maxsize_megabytes to prevent fbit crash + if @fbitTailBufferChunkSizeMBs > 0 && @fbitTailBufferMaxSizeMBs == 0 + @fbitTailBufferMaxSizeMBs = @fbitTailBufferChunkSizeMBs + puts "config::warn: since tail_buf_maxsize_megabytes not provided hence using tail_buf_maxsize_megabytes=#{@fbitTailBufferMaxSizeMBs} which is same as the value of tail_buf_chunksize_megabytes" + end + end end rescue => errorStr puts "config::error:Exception while reading config settings for agent configuration setting - #{errorStr}, using defaults" @@ -164,6 +202,16 @@ def populateSettingValuesFromConfigMap(parsedConfig) file.write("export HPA_CHUNK_SIZE=#{@hpaChunkSize}\n") file.write("export PODS_EMIT_STREAM_BATCH_SIZE=#{@podsEmitStreamBatchSize}\n") file.write("export NODES_EMIT_STREAM_BATCH_SIZE=#{@nodesEmitStreamBatchSize}\n") + # fbit settings + if @fbitFlushIntervalSecs > 0 + file.write("export FBIT_SERVICE_FLUSH_INTERVAL=#{@fbitFlushIntervalSecs}\n") + end + if @fbitTailBufferChunkSizeMBs > 0 + file.write("export FBIT_TAIL_BUFFER_CHUNK_SIZE=#{@fbitTailBufferChunkSizeMBs}\n") + end + if @fbitTailBufferMaxSizeMBs > 0 + file.write("export FBIT_TAIL_BUFFER_MAX_SIZE=#{@fbitTailBufferMaxSizeMBs}\n") + end # Close file after writing all environment variables file.close else diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index ed16d3e32..b4df538d4 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -172,6 +172,7 @@ source config_env_var #Parse the configmap to set the right environment variables for agent config. 
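One invariant in the parser above is worth spelling out: fluent-bit's tail buffer max size must be at least the chunk size, otherwise fluent-bit can crash, so the parser clamps the max up to the chunk size whenever the pair is inconsistent or the max is missing. The same guard expressed in bash (the environment variable names follow the patch; the rest is a sketch):

    # Sketch: keep tail buffer max >= chunk, mirroring the parser's fallback.
    chunk_mb="${FBIT_TAIL_BUFFER_CHUNK_SIZE:-0}"
    max_mb="${FBIT_TAIL_BUFFER_MAX_SIZE:-0}"
    if [ "$chunk_mb" -gt 0 ] && [ "$max_mb" -lt "$chunk_mb" ]; then
      echo "config::warn: tail buffer max must be >= chunk size; using ${chunk_mb}MB"
      max_mb="$chunk_mb"
    fi
    export FBIT_TAIL_BUFFER_MAX_SIZE="$max_mb"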
+#Note > tomlparser-agent-config.rb has to be parsed first before td-agent-bit-conf-customizer.rb for fbit agent settings /opt/microsoft/omsagent/ruby/bin/ruby tomlparser-agent-config.rb cat agent_config_env_var | while read line; do From 908d9b0cdcd46452582338ca23f7bfbf85411e37 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 11 Jan 2021 12:47:38 -0800 Subject: [PATCH 058/194] Gangams/jan agent release tasks (#484) * wip * explicit amd64 affinity for hybrid workloads * fix space issue * wip * revert vscode setting file --- .../templates/omsagent-daemonset-windows.yaml | 4 ++++ .../templates/omsagent-daemonset.yaml | 4 ++++ charts/azuremonitor-containers/values.yaml | 18 +++++++++++++++++- kubernetes/omsagent.yaml | 8 ++++++++ 4 files changed, 33 insertions(+), 1 deletion(-) diff --git a/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml b/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml index 6a309c121..81003c704 100644 --- a/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml @@ -27,6 +27,10 @@ spec: checksum/secret: {{ include (print $.Template.BasePath "/omsagent-secret.yaml") . | sha256sum }} checksum/config: {{ toYaml .Values.omsagent.resources | sha256sum }} spec: + dnsConfig: + options: + - name: ndots + value: "3" {{- if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion }} nodeSelector: kubernetes.io/os: windows diff --git a/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml b/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml index d57c4d82b..3d29ede42 100644 --- a/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml @@ -28,6 +28,10 @@ spec: checksum/config: {{ toYaml .Values.omsagent.resources | sha256sum }} checksum/logsettings: {{ toYaml .Values.omsagent.logsettings | sha256sum }} spec: + dnsConfig: + options: + - name: ndots + value: "3" {{- if .Values.omsagent.rbac }} serviceAccountName: omsagent {{- end }} diff --git a/charts/azuremonitor-containers/values.yaml b/charts/azuremonitor-containers/values.yaml index 907e315d1..b3d029146 100644 --- a/charts/azuremonitor-containers/values.yaml +++ b/charts/azuremonitor-containers/values.yaml @@ -58,7 +58,7 @@ omsagent: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: - labelSelector: - matchExpressions: + matchExpressions: - key: kubernetes.io/os operator: In values: @@ -67,6 +67,10 @@ omsagent: operator: NotIn values: - virtual-kubelet + - key: kubernetes.io/arch + operator: In + values: + - amd64 nodeSelectorTerms: - labelSelector: matchExpressions: @@ -78,6 +82,10 @@ omsagent: operator: NotIn values: - virtual-kubelet + - key: beta.kubernetes.io/arch + operator: In + values: + - amd64 deployment: affinity: nodeAffinity: @@ -106,6 +114,10 @@ omsagent: operator: NotIn values: - master + - key: kubernetes.io/arch + operator: In + values: + - amd64 nodeSelectorTerms: - labelSelector: matchExpressions: @@ -121,6 +133,10 @@ omsagent: operator: NotIn values: - master + - key: beta.kubernetes.io/arch + operator: In + values: + - amd64 ## Configure resource requests and limits ## ref: http://kubernetes.io/docs/user-guide/compute-resources/ ## diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 563955968..df80cabc4 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -362,6 +362,10 @@ spec: 
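The dnsConfig stanza added in these templates is a DNS-cost fix: with Kubernetes' default ndots:5, nearly every name the agent resolves is first tried against all of the pod's search domains before being tried as-is, while ndots:3 lets the mostly fully-qualified monitoring endpoints resolve directly. One way to confirm the option took effect, assuming a recent kubectl and the kube-system omsagent daemonset from this yaml:

    # Sketch: check the resolver options inside a running agent pod.
    kubectl exec -n kube-system ds/omsagent -- cat /etc/resolv.conf | grep options
    # expected output includes: options ndots:3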
schema-versions: "v1" spec: serviceAccountName: omsagent + dnsConfig: + options: + - name: ndots + value: "3" containers: - name: omsagent image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod11092020" @@ -675,6 +679,10 @@ spec: schema-versions: "v1" spec: serviceAccountName: omsagent + dnsConfig: + options: + - name: ndots + value: "3" containers: - name: omsagent-win image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod11092020" From 8ede53653f79a7401352739f6d4f09e572b12235 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 11 Jan 2021 13:18:14 -0800 Subject: [PATCH 059/194] remove per container logs in ci (#488) --- .../plugins/ruby/kubernetes_container_inventory.rb | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/source/plugins/ruby/kubernetes_container_inventory.rb b/source/plugins/ruby/kubernetes_container_inventory.rb index ba6a9af42..69beca493 100644 --- a/source/plugins/ruby/kubernetes_container_inventory.rb +++ b/source/plugins/ruby/kubernetes_container_inventory.rb @@ -189,8 +189,7 @@ def getContainersInfoMap(podItem, isWindows) return containersInfoMap end - def obtainContainerEnvironmentVars(containerId) - $log.info("KubernetesContainerInventory::obtainContainerEnvironmentVars @ #{Time.now.utc.iso8601}") + def obtainContainerEnvironmentVars(containerId) envValueString = "" begin isCGroupPidFetchRequired = false @@ -207,8 +206,7 @@ def obtainContainerEnvironmentVars(containerId) end end - if isCGroupPidFetchRequired - $log.info("KubernetesContainerInventory::obtainContainerEnvironmentVars fetching cGroup parent pid @ #{Time.now.utc.iso8601} for containerId: #{containerId}") + if isCGroupPidFetchRequired Dir["/hostfs/proc/*/cgroup"].each do |filename| begin if File.file?(filename) && File.exist?(filename) && File.foreach(filename).grep(/#{containerId}/).any? @@ -231,8 +229,7 @@ def obtainContainerEnvironmentVars(containerId) end cGroupPid = @@containerCGroupCache[containerId] if !cGroupPid.nil? && !cGroupPid.empty? - environFilePath = "/hostfs/proc/#{cGroupPid}/environ" - $log.info("KubernetesContainerInventory::obtainContainerEnvironmentVars cGroupPid: #{cGroupPid} environFilePath: #{environFilePath} for containerId: #{containerId}") + environFilePath = "/hostfs/proc/#{cGroupPid}/environ" if File.exist?(environFilePath) # Skip environment variable processing if it contains the flag AZMON_COLLECT_ENV=FALSE # Check to see if the environment variable collection is disabled for this container. @@ -245,8 +242,7 @@ def obtainContainerEnvironmentVars(containerId) if !envVars.nil? && !envVars.empty? envVars = envVars.split("\0") envValueString = envVars.to_json - envValueStringLength = envValueString.length - $log.info("KubernetesContainerInventory::environment vars filename @ #{environFilePath} envVars size @ #{envValueStringLength}") + envValueStringLength = envValueString.length if envValueStringLength >= 200000 lastIndex = envValueString.rindex("\",") if !lastIndex.nil? 
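The rindex logic above trims the serialized environment-variable list when it reaches roughly 200 KB, cutting at the end of a complete JSON element instead of mid-value. Roughly the same trick in bash (the 200000 cap and the '",' marker come from the Ruby code; the helper is illustrative):

    # Sketch: truncate a serialized JSON array at the last complete element.
    truncate_env_json() {
      local s="$1" cap=200000
      if [ "${#s}" -ge "$cap" ]; then
        local head="${s:0:cap}"
        s="${head%\",*}\"]"   # drop the partial tail entry and close the array
      fi
      printf '%s\n' "$s"
    }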
From 37e5218e4a6a6f6c02591093356e0dee7f79af7f Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 12 Jan 2021 10:34:31 -0800 Subject: [PATCH 060/194] updates for ciprod01112021 release (#489) --- ReleaseNotes.md | 17 +++++++++++++++++ build/version | 6 +++--- charts/azuremonitor-containers/Chart.yaml | 2 +- charts/azuremonitor-containers/values.yaml | 6 +++--- kubernetes/linux/Dockerfile | 2 +- kubernetes/omsagent.yaml | 12 ++++++------ kubernetes/windows/Dockerfile | 2 +- .../onboarding/managed/enable-monitoring.ps1 | 2 +- scripts/onboarding/managed/enable-monitoring.sh | 2 +- .../onboarding/managed/upgrade-monitoring.sh | 2 +- 10 files changed, 35 insertions(+), 18 deletions(-) diff --git a/ReleaseNotes.md b/ReleaseNotes.md index ddfd01314..b1eb316a1 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -10,6 +10,23 @@ additional questions or comments. ## Release History Note : The agent version(s) below has dates (ciprod), which indicate the agent build dates (not release dates) +### 01/11/2021 - +##### Version microsoft/oms:ciprod01112021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod01112021 (linux) +##### Version microsoft/oms:win-ciprod01112021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod01112021 (windows) +##### Code change log +- Fixes for Linux Agent Replicaset Pod OOMing issue +- Update fluentbit (1.14.2 to 1.6.8) for the Linux Daemonset +- Make Fluentbit settings: log_flush_interval_secs, tail_buf_chunksize_megabytes and tail_buf_maxsize_megabytes configurable via configmap +- Support for PV inventory collection +- Removal of Custom metric region check for Public cloud regions and update to use cloud environment variable to determine the custom metric support +- For daemonset pods, add the dnsconfig to use ndots: 3 from ndots:5 to optimize the number of DNS API calls made +- Fix for inconsistency in the collection container environment variables for the pods which has high number of containers +- Fix for disabling of std{out;err} log_collection_settings via configmap issue in windows daemonset +- Update to use workspace key from mount file rather than environment variable for windows daemonset agent +- Remove per container info logs in the container inventory +- Enable ADX route for windows container logs +- Remove logging to termination log in windows agent liveness probe + ### 11/09/2020 - ##### Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod11092020 (linux) diff --git a/build/version b/build/version index a8b78ecac..711a96921 100644 --- a/build/version +++ b/build/version @@ -2,11 +2,11 @@ # Build Version Information -CONTAINER_BUILDVERSION_MAJOR=11 +CONTAINER_BUILDVERSION_MAJOR=12 CONTAINER_BUILDVERSION_MINOR=0 CONTAINER_BUILDVERSION_PATCH=0 -CONTAINER_BUILDVERSION_BUILDNR=1 -CONTAINER_BUILDVERSION_DATE=20201109 +CONTAINER_BUILDVERSION_BUILDNR=0 +CONTAINER_BUILDVERSION_DATE=20210111 CONTAINER_BUILDVERSION_STATUS=Developer_Build #-------------------------------- End of File ----------------------------------- diff --git a/charts/azuremonitor-containers/Chart.yaml b/charts/azuremonitor-containers/Chart.yaml index 987841f77..a809a4e69 100644 --- a/charts/azuremonitor-containers/Chart.yaml +++ b/charts/azuremonitor-containers/Chart.yaml @@ -2,7 +2,7 @@ apiVersion: v1 appVersion: 7.0.0-1 description: Helm chart for deploying Azure Monitor container monitoring agent in Kubernetes name: azuremonitor-containers -version: 2.7.9 +version: 2.8.0 kubeVersion: "^1.10.0-0" keywords: - monitoring diff 
--git a/charts/azuremonitor-containers/values.yaml b/charts/azuremonitor-containers/values.yaml index b3d029146..debd66b0b 100644 --- a/charts/azuremonitor-containers/values.yaml +++ b/charts/azuremonitor-containers/values.yaml @@ -12,10 +12,10 @@ Azure: omsagent: image: repo: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod" - tag: "ciprod11092020" - tagWindows: "win-ciprod11092020" + tag: "ciprod01112021" + tagWindows: "win-ciprod01112021" pullPolicy: IfNotPresent - dockerProviderVersion: "11.0.0-1" + dockerProviderVersion: "12.0.0-0" agentVersion: "1.10.0.1" ## To get your workspace id and key do the following ## You can create a Azure Loganalytics workspace from portal.azure.com and get its ID & PRIMARY KEY from 'Advanced Settings' tab in the Ux. diff --git a/kubernetes/linux/Dockerfile b/kubernetes/linux/Dockerfile index 34ab133da..2e1118922 100644 --- a/kubernetes/linux/Dockerfile +++ b/kubernetes/linux/Dockerfile @@ -2,7 +2,7 @@ FROM ubuntu:18.04 MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="Azure Monitor for containers" -ARG IMAGE_TAG=ciprod11092020 +ARG IMAGE_TAG=ciprod01112021 ENV AGENT_VERSION ${IMAGE_TAG} ENV tmpdir /opt ENV APPLICATIONINSIGHTS_AUTH NzAwZGM5OGYtYTdhZC00NThkLWI5NWMtMjA3ZjM3NmM3YmRi diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index df80cabc4..67bd9cdde 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -358,7 +358,7 @@ spec: tier: node annotations: agentVersion: "1.10.0.1" - dockerProviderVersion: "11.0.0-1" + dockerProviderVersion: "12.0.0-0" schema-versions: "v1" spec: serviceAccountName: omsagent @@ -368,7 +368,7 @@ spec: value: "3" containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod11092020" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod01112021" imagePullPolicy: IfNotPresent resources: limits: @@ -521,13 +521,13 @@ spec: rsName: "omsagent-rs" annotations: agentVersion: "1.10.0.1" - dockerProviderVersion: "11.0.0-1" + dockerProviderVersion: "12.0.0-0" schema-versions: "v1" spec: serviceAccountName: omsagent containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod11092020" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod01112021" imagePullPolicy: IfNotPresent resources: limits: @@ -675,7 +675,7 @@ spec: tier: node-win annotations: agentVersion: "1.10.0.1" - dockerProviderVersion: "11.0.0-1" + dockerProviderVersion: "12.0.0-0" schema-versions: "v1" spec: serviceAccountName: omsagent @@ -685,7 +685,7 @@ spec: value: "3" containers: - name: omsagent-win - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod11092020" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod01112021" imagePullPolicy: IfNotPresent resources: limits: diff --git a/kubernetes/windows/Dockerfile b/kubernetes/windows/Dockerfile index 10ea235b2..f852bd236 100644 --- a/kubernetes/windows/Dockerfile +++ b/kubernetes/windows/Dockerfile @@ -3,7 +3,7 @@ MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="Azure Monitor for containers" -ARG IMAGE_TAG=win-ciprod11092020 +ARG IMAGE_TAG=win-ciprod01112021 # Do not split this into multiple RUN! 
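A release bump like this one touches the same image tag in several files at once, so drift is easy to miss. A small consistency check (the file list and tag come from this patch; the script itself is illustrative):

    # Sketch: fail if any shipped file still references a different ciprod tag.
    expected="ciprod01112021"
    files="charts/azuremonitor-containers/values.yaml kubernetes/omsagent.yaml \
      kubernetes/linux/Dockerfile kubernetes/windows/Dockerfile"
    stale=$(grep -hoE "ciprod[0-9]{8}" $files | grep -v "$expected" | sort -u)
    if [ -n "$stale" ]; then
      echo "stale image tags found: $stale" >&2
      exit 1
    fi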
# Docker creates a layer for every RUN-Statement diff --git a/scripts/onboarding/managed/enable-monitoring.ps1 b/scripts/onboarding/managed/enable-monitoring.ps1 index 7b128b112..45ddb44b0 100644 --- a/scripts/onboarding/managed/enable-monitoring.ps1 +++ b/scripts/onboarding/managed/enable-monitoring.ps1 @@ -64,7 +64,7 @@ $isUsingServicePrincipal = $false # released chart version in mcr $mcr = "mcr.microsoft.com" -$mcrChartVersion = "2.7.9" +$mcrChartVersion = "2.8.0" $mcrChartRepoPath = "azuremonitor/containerinsights/preview/azuremonitor-containers" $helmLocalRepoName = "." $omsAgentDomainName="opinsights.azure.com" diff --git a/scripts/onboarding/managed/enable-monitoring.sh b/scripts/onboarding/managed/enable-monitoring.sh index 85428aff7..2dc0a465f 100644 --- a/scripts/onboarding/managed/enable-monitoring.sh +++ b/scripts/onboarding/managed/enable-monitoring.sh @@ -44,7 +44,7 @@ defaultAzureCloud="AzureCloud" omsAgentDomainName="opinsights.azure.com" # released chart version in mcr -mcrChartVersion="2.7.9" +mcrChartVersion="2.8.0" mcr="mcr.microsoft.com" mcrChartRepoPath="azuremonitor/containerinsights/preview/azuremonitor-containers" helmLocalRepoName="." diff --git a/scripts/onboarding/managed/upgrade-monitoring.sh b/scripts/onboarding/managed/upgrade-monitoring.sh index 847bf84ea..8826b6df6 100644 --- a/scripts/onboarding/managed/upgrade-monitoring.sh +++ b/scripts/onboarding/managed/upgrade-monitoring.sh @@ -20,7 +20,7 @@ set -e set -o pipefail # released chart version for Azure Arc enabled Kubernetes public preview -mcrChartVersion="2.7.9" +mcrChartVersion="2.8.0" mcr="mcr.microsoft.com" mcrChartRepoPath="azuremonitor/containerinsights/preview/azuremonitor-containers" From 3c97af6ac0613045df20f59b126f5aec94855e1f Mon Sep 17 00:00:00 2001 From: deagraw Date: Thu, 14 Jan 2021 10:47:48 -0800 Subject: [PATCH 061/194] new yaml files (#491) --- .../clusteruser/cluster-user-role-binding.yaml | 12 ++++++++++++ .../onboarding/clusteruser/cluster-user-role.yaml | 14 ++++++++++++++ 2 files changed, 26 insertions(+) create mode 100644 scripts/onboarding/clusteruser/cluster-user-role-binding.yaml create mode 100644 scripts/onboarding/clusteruser/cluster-user-role.yaml diff --git a/scripts/onboarding/clusteruser/cluster-user-role-binding.yaml b/scripts/onboarding/clusteruser/cluster-user-role-binding.yaml new file mode 100644 index 000000000..fce2fc582 --- /dev/null +++ b/scripts/onboarding/clusteruser/cluster-user-role-binding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: containerHealth-read-logs-global +roleRef: + kind: ClusterRole + name: containerHealth-log-reader + apiGroup: rbac.authorization.k8s.io +subjects: + - kind: User + name: clusterUser + apiGroup: rbac.authorization.k8s.io diff --git a/scripts/onboarding/clusteruser/cluster-user-role.yaml b/scripts/onboarding/clusteruser/cluster-user-role.yaml new file mode 100644 index 000000000..b3519fdd3 --- /dev/null +++ b/scripts/onboarding/clusteruser/cluster-user-role.yaml @@ -0,0 +1,14 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: containerHealth-log-reader +rules: + - apiGroups: ["", "metrics.k8s.io", "extensions", "apps"] + resources: + - "pods/log" + - "events" + - "nodes" + - "pods" + - "deployments" + - "replicasets" + verbs: ["get", "list"] From 90e1a5be8928305cd2378c1922924efac8cafc80 Mon Sep 17 00:00:00 2001 From: David Michelman Date: Thu, 21 Jan 2021 18:48:14 -0800 Subject: [PATCH 062/194] Use cloud-specific instrumentation keys 
(#494) If APPLICATIONINSIGHTS_AUTH_URL is set/non-empty then the agent will now grab a custom IKey from a URL stored in APPLICATIONINSIGHTS_AUTH_URL --- .../build-and-publish-docker-image.sh | 0 kubernetes/linux/main.sh | 38 ++++++++++++-- kubernetes/windows/main.ps1 | 50 ++++++++++++++++--- 3 files changed, 75 insertions(+), 13 deletions(-) mode change 100644 => 100755 kubernetes/linux/dockerbuild/build-and-publish-docker-image.sh diff --git a/kubernetes/linux/dockerbuild/build-and-publish-docker-image.sh b/kubernetes/linux/dockerbuild/build-and-publish-docker-image.sh old mode 100644 new mode 100755 diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index b4df538d4..c4067f25e 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -161,6 +161,39 @@ fi export CLOUD_ENVIRONMENT=$CLOUD_ENVIRONMENT echo "export CLOUD_ENVIRONMENT=$CLOUD_ENVIRONMENT" >> ~/.bashrc +# Check if the instrumentation key needs to be fetched from a storage account (as in airgapped clouds) +if [ ${#APPLICATIONINSIGHTS_AUTH_URL} -ge 1 ]; then # (check if APPLICATIONINSIGHTS_AUTH_URL has length >=1) + for BACKOFF in {1..4}; do + KEY=$(curl -sS $APPLICATIONINSIGHTS_AUTH_URL ) + # there's no easy way to get the HTTP status code from curl, so just check if the result is well formatted + if [[ $KEY =~ ^[A-Za-z0-9=]+$ ]]; then + break + else + sleep $((2**$BACKOFF / 4)) # (exponential backoff) + fi + done + + # validate that the retrieved data is an instrumentation key + if [[ $KEY =~ ^[A-Za-z0-9=]+$ ]]; then + export APPLICATIONINSIGHTS_AUTH=$(echo $KEY) + echo "export APPLICATIONINSIGHTS_AUTH=$APPLICATIONINSIGHTS_AUTH" >> ~/.bashrc + echo "Using cloud-specific instrumentation key" + else + # no ikey can be retrieved. Disable telemetry and continue + export DISABLE_TELEMETRY=true + echo "export DISABLE_TELEMETRY=true" >> ~/.bashrc + echo "Could not get cloud-specific instrumentation key (network error?). Disabling telemetry" + fi +fi + + +aikey=$(echo $APPLICATIONINSIGHTS_AUTH | base64 --decode) +export TELEMETRY_APPLICATIONINSIGHTS_KEY=$aikey +echo "export TELEMETRY_APPLICATIONINSIGHTS_KEY=$aikey" >> ~/.bashrc + +source ~/.bashrc + + #Parse the configmap to set the right environment variables. 
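The retry loop above is plain exponential backoff: sleep 2^n/4 seconds between attempts, give up after four tries, and fall back to disabling telemetry. Pulled out as a reusable helper it looks roughly like this (the validation regex is the patch's own; the helper name and structure are illustrative):

    # Sketch: fetch a base64-looking value from a URL with exponential backoff.
    fetch_with_backoff() {
      local url="$1" value=""
      for attempt in 1 2 3 4; do
        value=$(curl -sS "$url") || true
        [[ $value =~ ^[A-Za-z0-9=]+$ ]] && { echo "$value"; return 0; }
        sleep $((2 ** attempt / 4))
      done
      return 1
    }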
/opt/microsoft/omsagent/ruby/bin/ruby tomlparser.rb @@ -581,11 +614,6 @@ echo "export HOST_ETC=/hostfs/etc" >> ~/.bashrc export HOST_VAR=/hostfs/var echo "export HOST_VAR=/hostfs/var" >> ~/.bashrc -aikey=$(echo $APPLICATIONINSIGHTS_AUTH | base64 --decode) -export TELEMETRY_APPLICATIONINSIGHTS_KEY=$aikey -echo "export TELEMETRY_APPLICATIONINSIGHTS_KEY=$aikey" >> ~/.bashrc - -source ~/.bashrc #start telegraf /opt/telegraf --config $telegrafConfFile & diff --git a/kubernetes/windows/main.ps1 b/kubernetes/windows/main.ps1 index a297e3801..722392157 100644 --- a/kubernetes/windows/main.ps1 +++ b/kubernetes/windows/main.ps1 @@ -119,10 +119,48 @@ function Set-EnvironmentVariables { $env:AZMON_AGENT_CFG_SCHEMA_VERSION } - # Set environment variable for TELEMETRY_APPLICATIONINSIGHTS_KEY - $aiKey = [System.Text.Encoding]::UTF8.GetString([System.Convert]::FromBase64String($env:APPLICATIONINSIGHTS_AUTH)) - [System.Environment]::SetEnvironmentVariable("TELEMETRY_APPLICATIONINSIGHTS_KEY", $aiKey, "Process") - [System.Environment]::SetEnvironmentVariable("TELEMETRY_APPLICATIONINSIGHTS_KEY", $aiKey, "Machine") + # Check if the instrumentation key needs to be fetched from a storage account (as in airgapped clouds) + $aiKeyURl = [System.Environment]::GetEnvironmentVariable('APPLICATIONINSIGHTS_AUTH_URL') + if ($aiKeyURl) { + $aiKeyFetched = "" + # retry up to 5 times + for( $i = 1; $i -le 4; $i++) { + try { + $response = Invoke-WebRequest -uri $aiKeyURl -UseBasicParsing -TimeoutSec 5 -ErrorAction:Stop + + if ($response.StatusCode -ne 200) { + Write-Host "Expecting reponse code 200, was: $($response.StatusCode), retrying" + Start-Sleep -Seconds ([MATH]::Pow(2, $i) / 4) + } + else { + $aiKeyFetched = $response.Content + break + } + } + catch { + Write-Host "Exception encountered fetching instrumentation key:" + Write-Host $_.Exception + } + } + + # Check if the fetched IKey was properly encoded. if not then turn off telemetry + if ($aiKeyFetched -match '^[A-Za-z0-9=]+$') { + Write-Host "Using cloud-specific instrumentation key" + [System.Environment]::SetEnvironmentVariable("APPLICATIONINSIGHTS_AUTH", $aiKeyFetched, "Process") + [System.Environment]::SetEnvironmentVariable("APPLICATIONINSIGHTS_AUTH", $aiKeyFetched, "Machine") + } + else { + # Couldn't fetch the Ikey, turn telemetry off + Write-Host "Could not get cloud-specific instrumentation key (network error?). 
Disabling telemetry" + [System.Environment]::SetEnvironmentVariable("DISABLE_TELEMETRY", "True", "Process") + [System.Environment]::SetEnvironmentVariable("DISABLE_TELEMETRY", "True", "Machine") + } + } + + $aiKeyDecoded = [System.Text.Encoding]::UTF8.GetString([System.Convert]::FromBase64String($env:APPLICATIONINSIGHTS_AUTH)) + [System.Environment]::SetEnvironmentVariable("TELEMETRY_APPLICATIONINSIGHTS_KEY", $aiKeyDecoded, "Process") + [System.Environment]::SetEnvironmentVariable("TELEMETRY_APPLICATIONINSIGHTS_KEY", $aiKeyDecoded, "Machine") + # run config parser ruby /opt/omsagentwindows/scripts/ruby/tomlparser.rb @@ -324,7 +362,3 @@ Get-WmiObject Win32_process | Where-Object { $_.Name -match 'powershell' } | For #check if fluentd service is running Get-Service fluentdwinaks - - - - From 98b6d779d29d4bbc56657e0403ef03e4498028e3 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Thu, 21 Jan 2021 23:27:07 -0800 Subject: [PATCH 063/194] upgrade apt to latest version (#492) * upgrade apt to latest version * fix pr feedback --- kubernetes/linux/setup.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kubernetes/linux/setup.sh b/kubernetes/linux/setup.sh index 352be06d7..fe6c0565a 100644 --- a/kubernetes/linux/setup.sh +++ b/kubernetes/linux/setup.sh @@ -2,8 +2,8 @@ TMPDIR="/opt" cd $TMPDIR #Download utf-8 encoding capability on the omsagent container. - -apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y locales +#upgrade apt to latest version +apt-get update && apt-get install -y apt && DEBIAN_FRONTEND=noninteractive apt-get install -y locales sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen && \ dpkg-reconfigure --frontend=noninteractive locales && \ From ddcd3eec1037471abecc9b13b0807e520d7fbeff Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Wed, 27 Jan 2021 13:20:27 -0800 Subject: [PATCH 064/194] Gangams/add support for extension msi for arc k8s cluster (#495) * wip * add env var for the arc k8s extension name * chart update * extension msi updates * fix bug * revert chart and image to prod version * minor text changes * image tag to prod * wip * wip * wip * wip * final updates * fix whitespaces * simplify crd yaml --- .../templates/omsagent-arc-k8s-crd.yaml | 17 +++++++++++++++++ .../templates/omsagent-daemonset.yaml | 6 +++++- .../templates/omsagent-deployment.yaml | 6 +++++- .../templates/omsagent-rbac.yaml | 4 ++++ charts/azuremonitor-containers/values.yaml | 7 +++++-- source/plugins/ruby/arc_k8s_cluster_identity.rb | 11 ++++++++--- 6 files changed, 44 insertions(+), 7 deletions(-) diff --git a/charts/azuremonitor-containers/templates/omsagent-arc-k8s-crd.yaml b/charts/azuremonitor-containers/templates/omsagent-arc-k8s-crd.yaml index ebdd5ea3f..b7482b8b5 100644 --- a/charts/azuremonitor-containers/templates/omsagent-arc-k8s-crd.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-arc-k8s-crd.yaml @@ -1,4 +1,18 @@ {{- if or ( contains "microsoft.kubernetes/connectedclusters" (.Values.Azure.Cluster.ResourceId | lower) ) ( contains "microsoft.kubernetes/connectedclusters" (.Values.omsagent.env.clusterId | lower)) }} +#extension model +{{- if not (empty .Values.Azure.Extension.Name) }} +apiVersion: clusterconfig.azure.com/v1beta1 +kind: AzureExtensionIdentity +metadata: + name: {{ .Values.Azure.Extension.Name }} + namespace: azure-arc +spec: + serviceAccounts: + - name: omsagent + namespace: kube-system + tokenNamespace: azure-arc +--- +{{- end }} apiVersion: clusterconfig.azure.com/v1beta1 kind: 
AzureClusterIdentityRequest metadata: @@ -6,4 +20,7 @@ metadata: namespace: azure-arc spec: audience: https://monitoring.azure.com/ + {{- if not (empty .Values.Azure.Extension.Name) }} + resourceId: {{ .Values.Azure.Extension.Name }} + {{- end }} {{- end }} diff --git a/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml b/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml index 3d29ede42..595edd7bb 100644 --- a/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml @@ -70,8 +70,12 @@ spec: valueFrom: fieldRef: fieldPath: status.hostIP + {{- if not (empty .Values.Azure.Extension.Name) }} + - name: ARC_K8S_EXTENSION_NAME + value: {{ .Values.Azure.Extension.Name | quote }} + {{- end }} - name: USER_ASSIGNED_IDENTITY_CLIENT_ID - value: "" + value: "" {{- if .Values.omsagent.logsettings.logflushintervalsecs }} - name: FBIT_SERVICE_FLUSH_INTERVAL value: {{ .Values.omsagent.logsettings.logflushintervalsecs | quote }} diff --git a/charts/azuremonitor-containers/templates/omsagent-deployment.yaml b/charts/azuremonitor-containers/templates/omsagent-deployment.yaml index 8609d25c9..ecd0b705b 100644 --- a/charts/azuremonitor-containers/templates/omsagent-deployment.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-deployment.yaml @@ -67,8 +67,12 @@ spec: valueFrom: fieldRef: fieldPath: status.hostIP + {{- if not (empty .Values.Azure.Extension.Name) }} + - name: ARC_K8S_EXTENSION_NAME + value: {{ .Values.Azure.Extension.Name | quote }} + {{- end }} - name: USER_ASSIGNED_IDENTITY_CLIENT_ID - value: "" + value: "" securityContext: privileged: true ports: diff --git a/charts/azuremonitor-containers/templates/omsagent-rbac.yaml b/charts/azuremonitor-containers/templates/omsagent-rbac.yaml index bd4e9baf3..5db5c2dab 100644 --- a/charts/azuremonitor-containers/templates/omsagent-rbac.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-rbac.yaml @@ -33,10 +33,14 @@ rules: verbs: ["get", "create", "patch"] - nonResourceURLs: ["/metrics"] verbs: ["get"] +#arc k8s extension model grants access as part of the extension msi +#remove this explicit permission once the extension available in public preview +{{- if (empty .Values.Azure.Extension.Name) }} - apiGroups: [""] resources: ["secrets"] resourceNames: ["container-insights-clusteridentityrequest-token"] verbs: ["get"] +{{- end }} --- kind: ClusterRoleBinding apiVersion: rbac.authorization.k8s.io/v1beta1 diff --git a/charts/azuremonitor-containers/values.yaml b/charts/azuremonitor-containers/values.yaml index debd66b0b..341b9fb65 100644 --- a/charts/azuremonitor-containers/values.yaml +++ b/charts/azuremonitor-containers/values.yaml @@ -4,11 +4,14 @@ ## Microsoft OMS Agent image for kubernetes cluster monitoring ## ref: https://github.com/microsoft/Docker-Provider/tree/ci_prod -## Values of ResourceId and Region under Azure->Cluster being populated by Azure Arc K8s RP during the installation of the extension +## Values of under Azure are being populated by Azure Arc K8s RP during the installation of the extension Azure: Cluster: Region: - ResourceId: + ResourceId: + Extension: + Name: "" + ResourceId: "" omsagent: image: repo: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod" diff --git a/source/plugins/ruby/arc_k8s_cluster_identity.rb b/source/plugins/ruby/arc_k8s_cluster_identity.rb index ef55c3257..7824f3d4e 100644 --- a/source/plugins/ruby/arc_k8s_cluster_identity.rb +++ b/source/plugins/ruby/arc_k8s_cluster_identity.rb @@ 
-18,7 +18,7 @@ class ArcK8sClusterIdentity @@crd_resource_uri_template = "%{kube_api_server_url}/apis/%{cluster_config_crd_api_version}/namespaces/%{cluster_identity_resource_namespace}/azureclusteridentityrequests/%{cluster_identity_resource_name}" @@secret_resource_uri_template = "%{kube_api_server_url}/api/v1/namespaces/%{cluster_identity_token_secret_namespace}/secrets/%{token_secret_name}" @@azure_monitor_custom_metrics_audience = "https://monitoring.azure.com/" - @@cluster_identity_request_kind = "AzureClusterIdentityRequest" + @@cluster_identity_request_kind = "AzureClusterIdentityRequest" def initialize @LogPath = "/var/opt/microsoft/docker-cimprov/log/arc_k8s_cluster_identity.log" @@ -33,7 +33,9 @@ def initialize @log.warn "got api server url nil from KubernetesApiClient.getKubeAPIServerUrl @ #{Time.now.utc.iso8601}" end @http_client = get_http_client - @service_account_token = get_service_account_token + @service_account_token = get_service_account_token + @extensionName = ENV["ARC_K8S_EXTENSION_NAME"] + @log.info "extension name:#{@extensionName} @ #{Time.now.utc.iso8601}" @log.info "initialize complete @ #{Time.now.utc.iso8601}" end @@ -148,7 +150,7 @@ def renew_near_expiry_token() update_response = @http_client.request(update_request) @log.info "Got response of #{update_response.code} for PATCH #{crd_request_uri} @ #{Time.now.utc.iso8601}" if update_response.code.to_i == 404 - @log.info "since crd resource doesnt exist since creating crd resource : #{@@cluster_identity_resource_name} @ #{Time.now.utc.iso8601}" + @log.info "since crd resource doesnt exist hence creating crd resource : #{@@cluster_identity_resource_name} @ #{Time.now.utc.iso8601}" create_request = Net::HTTP::Post.new(crd_request_uri) create_request["Content-Type"] = "application/json" create_request["Authorization"] = "Bearer #{@service_account_token}" @@ -211,6 +213,9 @@ def get_crd_request_body body["metadata"]["namespace"] = @@cluster_identity_resource_namespace body["spec"] = {} body["spec"]["audience"] = @@azure_monitor_custom_metrics_audience + if !@extensionName.nil? && !@extensionName.empty? 
+ body["spec"]["resourceId"] = @extensionName + end return body end end From 0cd99e41b251254ce23e32c86ab28f06ea2c34d3 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Wed, 27 Jan 2021 13:35:11 -0800 Subject: [PATCH 065/194] Gangams/arm template arc k8s extension (#496) * arm templates for arc k8s extension * update to use official extension type name * update * add identity property * add proxyendpointurl parameter * add default values --- .../existingClusterOnboarding.json | 135 ++++++++++++++++++ .../existingClusterParam.json | 24 ++++ 2 files changed, 159 insertions(+) create mode 100644 scripts/onboarding/templates/arc-k8s-extension/existingClusterOnboarding.json create mode 100644 scripts/onboarding/templates/arc-k8s-extension/existingClusterParam.json diff --git a/scripts/onboarding/templates/arc-k8s-extension/existingClusterOnboarding.json b/scripts/onboarding/templates/arc-k8s-extension/existingClusterOnboarding.json new file mode 100644 index 000000000..8ebef232a --- /dev/null +++ b/scripts/onboarding/templates/arc-k8s-extension/existingClusterOnboarding.json @@ -0,0 +1,135 @@ +{ + "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "parameters": { + "clusterResourceId": { + "type": "string", + "metadata": { + "description": "Resource Id of the Azure Arc Connected Cluster" + } + }, + "clusterRegion": { + "type": "string", + "metadata": { + "description": "Location of the Azure Arc Connected Cluster Resource e.g. \"eastus\"" + } + }, + "proxyEndpointUrl": { + "type": "string", + "defaultValue": "", + "metadata": { + "description": "If the cluster behind forward proxy, then specify Proxy Endpoint URL in this format: http(s)://:@:" + } + }, + "workspaceResourceId": { + "type": "string", + "metadata": { + "description": "Azure Monitor Log Analytics Resource ID" + } + }, + "workspaceRegion": { + "type": "string", + "metadata": { + "description": "Azure Monitor Log Analytics Workspace region e.g. \"eastus\"" + } + }, + "workspaceDomain": { + "type": "string", + "allowedValues": [ + "opinsights.azure.com", + "opinsights.azure.cn", + "opinsights.azure.us", + "opinsights.azure.eaglex.ic.gov", + "opinsights.azure.microsoft.scloud" + ], + "defaultValue": "opinsights.azure.com", + "metadata": { + "description": "Azure Monitor Log Analytics Workspace Domain e.g. 
opinsights.azure.com" + } + } + }, + "resources": [ + { + "type": "Microsoft.Resources/deployments", + "name": "[Concat('ContainerInsights', '-', uniqueString(parameters('workspaceResourceId')))]", + "apiVersion": "2017-05-10", + "subscriptionId": "[split(parameters('workspaceResourceId'),'/')[2]]", + "resourceGroup": "[split(parameters('workspaceResourceId'),'/')[4]]", + "properties": { + "mode": "Incremental", + "template": { + "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "parameters": {}, + "variables": {}, + "resources": [ + { + "apiVersion": "2015-11-01-preview", + "type": "Microsoft.OperationsManagement/solutions", + "location": "[parameters('workspaceRegion')]", + "name": "[Concat('ContainerInsights', '(', split(parameters('workspaceResourceId'),'/')[8], ')')]", + "properties": { + "workspaceResourceId": "[parameters('workspaceResourceId')]" + }, + "plan": { + "name": "[Concat('ContainerInsights', '(', split(parameters('workspaceResourceId'),'/')[8], ')')]", + "product": "[Concat('OMSGallery/', 'ContainerInsights')]", + "promotionCode": "", + "publisher": "Microsoft" + } + } + ] + }, + "parameters": {} + } + }, + { + "type": "Microsoft.Resources/deployments", + "name": "[Concat('arc-k8s-ci-extension', '-', uniqueString(parameters('clusterResourceId')))]", + "apiVersion": "2019-05-01", + "subscriptionId": "[split(parameters('clusterResourceId'),'/')[2]]", + "resourceGroup": "[split(parameters('clusterResourceId'),'/')[4]]", + "dependsOn": [ + "[Concat('ContainerInsights', '-', uniqueString(parameters('workspaceResourceId')))]" + ], + "properties": { + "mode": "Incremental", + "template": { + "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "parameters": {}, + "variables": {}, + "resources": [ + { + "type": "Microsoft.KubernetesConfiguration/extensions", + "apiVersion": "2020-07-01-preview", + "name": "azuremonitor-containers", + "location": "[parameters('clusterRegion')]", + "identity": {"type": "systemassigned"}, + "properties": { + "extensionType": "Microsoft.AzureMonitor.Containers", + "configurationSettings": { + "logAnalyticsWorkspaceResourceID": "[parameters('workspaceResourceId')]", + "omsagent.domain": "[parameters('workspaceDomain')]" + }, + "configurationProtectedSettings": { + "omsagent.secret.wsid": "[reference(parameters('workspaceResourceId'), '2015-03-20').customerId]", + "omsagent.secret.key": "[listKeys(parameters('workspaceResourceId'), '2015-03-20').primarySharedKey]" , + "omsagent.proxy": "[if(equals(parameters('proxyEndpointUrl'), ''), '', parameters('proxyEndpointUrl'))]" + }, + "autoUpgradeMinorVersion": true, + "releaseTrain": "Stable", + "scope": { + "Cluster": { + "releaseNamespace": "azuremonitor-containers" + } + } + }, + "scope": "[concat('Microsoft.Kubernetes/connectedClusters/', split(parameters('clusterResourceId'),'/')[8])]" + } + ] + } + } + } + ] +} diff --git a/scripts/onboarding/templates/arc-k8s-extension/existingClusterParam.json b/scripts/onboarding/templates/arc-k8s-extension/existingClusterParam.json new file mode 100644 index 000000000..b74b5ac95 --- /dev/null +++ b/scripts/onboarding/templates/arc-k8s-extension/existingClusterParam.json @@ -0,0 +1,24 @@ +{ + "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentParameters.json#", + "contentVersion": "1.0.0.0", + "parameters": { + "clusterResourceId": { + "value": 
"/subscriptions//resourceGroups//providers/Microsoft.Kubernetes/connectedClusters/" + }, + "clusterRegion": { + "value": "" + }, + "proxyEndpointUrl": { + "value": "" + }, + "workspaceResourceId": { + "value": "/subscriptions//resourcegroups//providers/microsoft.operationalinsights/workspaces/" + }, + "workspaceRegion": { + "value": "" + }, + "workspaceDomain": { + "value": "" + } + } +} From 13521c5d316eb9e1c147c74b661f67b5873b2d5b Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 1 Feb 2021 10:27:40 -0800 Subject: [PATCH 066/194] Gangams/aks monitoring via policy (#497) * enable monitoring through policy * wip * handle tags * wip * add alias * wip * working * updates * working * with deployment name * doc updates * doc updates * fix typo in the docs --- .../azure-policy.json | 113 ++++++++++++++++++ .../azurepolicy.parameters.json | 9 ++ .../azurepolicy.rules.json | 101 ++++++++++++++++ .../enable-monitoring-using-policy.md | 64 ++++++++++ 4 files changed, 287 insertions(+) create mode 100644 scripts/onboarding/aks/onboarding-using-azure-policy/azure-policy.json create mode 100644 scripts/onboarding/aks/onboarding-using-azure-policy/azurepolicy.parameters.json create mode 100644 scripts/onboarding/aks/onboarding-using-azure-policy/azurepolicy.rules.json create mode 100644 scripts/onboarding/enable-monitoring-using-policy.md diff --git a/scripts/onboarding/aks/onboarding-using-azure-policy/azure-policy.json b/scripts/onboarding/aks/onboarding-using-azure-policy/azure-policy.json new file mode 100644 index 000000000..c68bfed17 --- /dev/null +++ b/scripts/onboarding/aks/onboarding-using-azure-policy/azure-policy.json @@ -0,0 +1,113 @@ +{ + "mode": "Indexed", + "policyRule": { + "if": { + "field": "type", + "equals": "Microsoft.ContainerService/managedClusters" + }, + "then": { + "effect": "deployIfNotExists", + "details": { + "type": "Microsoft.ContainerService/managedClusters", + "name": "[field('name')]", + "roleDefinitionIds": [ + "/providers/Microsoft.Authorization/roleDefinitions/ed7f3fbd-7b88-4dd4-9017-9adb7ce333f8", + "/providers/Microsoft.Authorization/roleDefinitions/92aaf0da-9dab-42b6-94a3-d43ce8d16293" + ], + "existenceCondition": { + "field": "Microsoft.ContainerService/managedClusters/addonProfiles.omsagent.enabled", + "equals": "true" + }, + "deployment": { + "properties": { + "mode": "incremental", + "template": { + "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "parameters": { + "clusterName": { + "type": "string" + }, + "clusterResourceGroupName": { + "type": "string" + }, + "clusterLocation": { + "type": "string" + }, + "clusterTags": { + "type": "object" + }, + "workspaceResourceId": { + "type": "string" + } + }, + "resources": [ + { + "type": "Microsoft.Resources/deployments", + "name": "[Concat('aks-monitoring-policy', '-', uniqueString(parameters('clusterName')))]", + "apiVersion": "2019-05-01", + "properties": { + "mode": "Incremental", + "template": { + "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "parameters": {}, + "variables": {}, + "resources": [ + { + "name": "[parameters('clusterName')]", + "type": "Microsoft.ContainerService/managedClusters", + "location": "[parameters('clusterLocation')]", + "tags": "[parameters('clusterTags')]", + "apiVersion": "2018-03-31", + "properties": { + "mode": "Incremental", + "id": "[resourceId(parameters('clusterResourceGroupName'), 
'Microsoft.ContainerService/managedClusters', parameters('clusterName'))]", + "addonProfiles": { + "omsagent": { + "enabled": true, + "config": { + "logAnalyticsWorkspaceResourceID": "[parameters('workspaceResourceId')]" + } + } + } + } + } + ] + } + } + } + ] + }, + "parameters": { + "clusterName": { + "value": "[field('name')]" + }, + "clusterResourceGroupName": { + "value": "[resourceGroup().name]" + }, + "clusterLocation": { + "value": "[field('location')]" + }, + "clusterTags": { + "value": "[field('tags')]" + }, + "workspaceResourceId": { + "value": "[parameters('workspaceResourceId')]" + } + } + } + } + } + } + }, + "parameters": { + "workspaceResourceId": { + "type": "String", + "metadata": { + "displayName": "Resource Id of the existing Azure Log Analytics Workspace", + "description": "Azure Monitor Log Analytics Resource ID" + } + } + } +} diff --git a/scripts/onboarding/aks/onboarding-using-azure-policy/azurepolicy.parameters.json b/scripts/onboarding/aks/onboarding-using-azure-policy/azurepolicy.parameters.json new file mode 100644 index 000000000..6281cdade --- /dev/null +++ b/scripts/onboarding/aks/onboarding-using-azure-policy/azurepolicy.parameters.json @@ -0,0 +1,9 @@ +{ + "workspaceResourceId": { + "type": "string", + "metadata": { + "displayName": "Resource Id of the existing Azure Log Analytics Workspace", + "description": "Azure Monitor Log Analytics Resource ID" + } + } +} diff --git a/scripts/onboarding/aks/onboarding-using-azure-policy/azurepolicy.rules.json b/scripts/onboarding/aks/onboarding-using-azure-policy/azurepolicy.rules.json new file mode 100644 index 000000000..a113441ce --- /dev/null +++ b/scripts/onboarding/aks/onboarding-using-azure-policy/azurepolicy.rules.json @@ -0,0 +1,101 @@ +{ + "if": { + "field": "type", + "equals": "Microsoft.ContainerService/managedClusters" + }, + "then": { + "effect": "deployIfNotExists", + "details": { + "type": "Microsoft.ContainerService/managedClusters", + "name": "[field('name')]", + "roleDefinitionIds": [ + "/providers/Microsoft.Authorization/roleDefinitions/ed7f3fbd-7b88-4dd4-9017-9adb7ce333f8", + "/providers/Microsoft.Authorization/roleDefinitions/92aaf0da-9dab-42b6-94a3-d43ce8d16293" + ], + "existenceCondition": { + "field": "Microsoft.ContainerService/managedClusters/addonProfiles.omsagent.enabled", + "equals": "true" + }, + "deployment": { + "properties": { + "mode": "incremental", + "template": { + "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "parameters": { + "clusterName": { + "type": "string" + }, + "clusterResourceGroupName": { + "type": "string" + }, + "clusterLocation": { + "type": "string" + }, + "clusterTags": { + "type": "object" + }, + "workspaceResourceId": { + "type": "string" + } + }, + "resources": [ + { + "type": "Microsoft.Resources/deployments", + "name": "[Concat('aks-monitoring-policy', '-', uniqueString(parameters('clusterName')))]", + "apiVersion": "2019-05-01", + "properties": { + "mode": "Incremental", + "template": { + "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "parameters": {}, + "variables": {}, + "resources": [ + { + "name": "[parameters('clusterName')]", + "type": "Microsoft.ContainerService/managedClusters", + "location": "[parameters('clusterLocation')]", + "tags": "[parameters('clusterTags')]", + "apiVersion": "2018-03-31", + "properties": { + "mode": "Incremental", + "id": 
"[resourceId(parameters('clusterResourceGroupName'), 'Microsoft.ContainerService/managedClusters', parameters('clusterName'))]", + "addonProfiles": { + "omsagent": { + "enabled": true, + "config": { + "logAnalyticsWorkspaceResourceID": "[parameters('workspaceResourceId')]" + } + } + } + } + } + ] + } + } + } + ] + }, + "parameters": { + "clusterName": { + "value": "[field('name')]" + }, + "clusterResourceGroupName": { + "value": "[resourceGroup().name]" + }, + "clusterLocation": { + "value": "[field('location')]" + }, + "clusterTags": { + "value": "[field('tags')]" + }, + "workspaceResourceId": { + "value": "[parameters('workspaceResourceId')]" + } + } + } + } + } + } +} diff --git a/scripts/onboarding/enable-monitoring-using-policy.md b/scripts/onboarding/enable-monitoring-using-policy.md new file mode 100644 index 000000000..e1e395ecc --- /dev/null +++ b/scripts/onboarding/enable-monitoring-using-policy.md @@ -0,0 +1,64 @@ +# How to enable AKS Monitoring Addon via Azure Policy +This doc describes how to enable AKS Monitoring Addon using Azure Custom Policy.Monitoring Addon Custom Policy can be assigned +either at subscription or resource group scope. If Azure Log Analytics workspace and AKS cluster are in different subscriptions then Managed Identity used by Policy assignnment has to have required role permissions on both the subscriptions or least on the resource of the Azure Log Aalytics workspace. Similarly, If the policy scoped to Resource Group, then Managed Identity should have required role permissions on the Log Analytics workspace if the workspace not in the selected Resource Group scope. + +Monitoring Addon require following roles on the Managed Identity used by Azure Policy + - [azure-kubernetes-service-contributor-role](https://docs.microsoft.com/en-us/azure/role-based-access-control/built-in-roles#azure-kubernetes-service-contributor-role) + - [log-analytics-contributor](https://docs.microsoft.com/en-us/azure/role-based-access-control/built-in-roles#log-analytics-contributor) + +## Create and Assign Policy definition using Azure Portal + +### Create Policy Definition + +1. Download the Azure Custom Policy definition to enable AKS Monitoring Addon +``` sh + curl -o azurepolicy.json -L https://aka.ms/aks-enable-monitoring-custom-policy +``` +2. Navigate to https://portal.azure.com/#blade/Microsoft_Azure_Policy/PolicyMenuBlade/Definitions and create policy definition with the following details in the Policy definition create dialogue box + + - Pick any Azure Subscription where you want to store Policy Definition + - Name - '(Preview)AKS-Monitoring-Addon' + - Description - 'Azure Custom Policy to enable Monitoring Addon onto Azure Kubernetes Cluster(s) in specified scope' + - Category - Choose "use existing" and pick 'Kubernetes' from drop down + - Remove the existing sample rules and copy the contents of azurepolicy.json downloaded in step #1 above + +### Assign Policy Definition to Specified Scope + +> Note: Managed Identity will be created automatically and assigned specified roles in the Policy definition. + +3. Navigate to https://portal.azure.com/#blade/Microsoft_Azure_Policy/PolicyMenuBlade/Definitions and select the Policy Definition 'AKS Monitoring Addon' +4. Click an Assignment and select Scope, Exclusions (if any) +5. Provide the Resource Id of the Azure Log Analytics Workspace. The Resource Id should be in this format `/subscriptions//resourceGroups//providers/Microsoft.OperationalInsights/workspaces/` +6. 
Create Remediation task in case if you want apply to policy to existing AKS clusters in selected scope +7. Click and Review & Create Option to create Policy Assignment + +## Create and Assign Policy definition using Azure CLI + +### Create Policy Definition + +1. Download the Azure Custom Policy definition rules and parameters files + ``` sh + curl -o azurepolicy.rules.json -L https://aka.ms/aks-enable-monitoring-custom-policy-rules + curl -o azurepolicy.parameters.json -L https://aka.ms/aks-enable-monitoring-custom-policy-parameters + ``` +2. Create policy definition using below command + + ``` sh + az cloud set -n # set the Azure cloud + az login # login to cloud environment + az account set -s + az policy definition create --name "(Preview)AKS-Monitoring-Addon" --display-name "(Preview)AKS-Monitoring-Addon" --mode Indexed --metadata version=1.0.0 category=Kubernetes --rules azurepolicy.rules.json --params azurepolicy.parameters.json + ``` +### Assign Policy Definition to Specified Scope + +3. Create policy assignment + +``` sh +az policy assignment create --name aks-monitoring-addon --policy "(Preview)AKS-Monitoring-Addon" --assign-identity --identity-scope /subscriptions/ --role Contributor --scope /subscriptions/ --location --role Contributor --scope /subscriptions/ -p "{ \"workspaceResourceId\": { \"value\": \"/subscriptions//resourcegroups//providers/microsoft.operationalinsights/workspaces/\" } }" +``` + +## References +- https://docs.microsoft.com/en-us/azure/governance/policy/ +- https://docs.microsoft.com/en-us/azure/governance/policy/how-to/remediate-resources#how-remediation-security-works +- https://docs.microsoft.com/en-us/cli/azure/install-azure-cli +- https://docs.microsoft.com/en-us/azure/azure-monitor/insights/container-insights-overview \ No newline at end of file From e4f36c7aef7bce1a0c2270f52f98bf07bf4bfe1c Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 1 Feb 2021 10:27:55 -0800 Subject: [PATCH 067/194] revert to use operatingSystem from osImage for node os telemety (#498) --- source/plugins/ruby/in_kube_nodes.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index e7c5060a5..0a4727077 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -447,7 +447,7 @@ def getNodeTelemetryProps(item) properties["Computer"] = item["metadata"]["name"] nodeInfo = item["status"]["nodeInfo"] properties["KubeletVersion"] = nodeInfo["kubeletVersion"] - properties["OperatingSystem"] = nodeInfo["osImage"] + properties["OperatingSystem"] = nodeInfo["operatingSystem"] properties["KernelVersion"] = nodeInfo["kernelVersion"] properties["OSImage"] = nodeInfo["osImage"] containerRuntimeVersion = nodeInfo["containerRuntimeVersion"] From ec15ac122cc465cfbed5745773c5a0827dcbeed7 Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Thu, 4 Feb 2021 14:34:46 -0800 Subject: [PATCH 068/194] Container log v2 schema changes (#499) * make pod name in mdsd definition as str for consistency. msgp has no type checking, as it has type metadata in it the message itself. 
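For reference, the routing and schema selection this patch wires up reduces to the following minimal Go sketch (not the patch code itself; the helper name is illustrative, while the env variable names appear in the tomlparser and oms.go changes below):

```go
package main

import (
	"fmt"
	"os"
	"strings"
)

// resolveContainerLogDestination sketches the precedence encoded by this patch:
// the ADX route wins over everything, the oneagent (v2) route comes next, and
// anything else falls back to ODS. The ContainerLogV2 schema is honored only
// when the route is NOT ADX.
func resolveContainerLogDestination() (route string, schemaV2 bool) {
	route = strings.TrimSpace(strings.ToLower(os.Getenv("AZMON_CONTAINER_LOGS_ROUTE")))
	schema := strings.TrimSpace(strings.ToLower(os.Getenv("AZMON_CONTAINER_LOG_SCHEMA_VERSION")))
	if schema == "v2" && route != "adx" {
		schemaV2 = true // flush DataItemLAv2 records to the ContainerLogV2 table
	}
	return route, schemaV2
}

func main() {
	route, v2 := resolveContainerLogDestination()
	fmt.Printf("route=%q containerLogSchemaV2=%v\n", route, v2)
}
```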
--- build/common/installer/scripts/tomlparser.rb | 14 + kubernetes/linux/mdsd.xml | 67 ++-- source/plugins/go/src/oms.go | 291 ++++++++++++------ source/plugins/go/src/utils.go | 2 +- .../plugins/ruby/CAdvisorMetricsAPIClient.rb | 7 +- 5 files changed, 264 insertions(+), 117 deletions(-) diff --git a/build/common/installer/scripts/tomlparser.rb b/build/common/installer/scripts/tomlparser.rb index fe26f639e..a0f3c2f0a 100644 --- a/build/common/installer/scripts/tomlparser.rb +++ b/build/common/installer/scripts/tomlparser.rb @@ -23,6 +23,7 @@ @logExclusionRegexPattern = "(^((?!stdout|stderr).)*$)" @excludePath = "*.csv2" #some invalid path @enrichContainerLogs = false +@containerLogSchemaVersion = "" @collectAllKubeEvents = false @containerLogsRoute = "" @@ -138,6 +139,16 @@ def populateSettingValuesFromConfigMap(parsedConfig) ConfigParseErrorLogger.logError("Exception while reading config map settings for cluster level container log enrichment - #{errorStr}, using defaults, please check config map for errors") end + #Get container log schema version setting + begin + if !parsedConfig[:log_collection_settings][:schema].nil? && !parsedConfig[:log_collection_settings][:schema][:containerlog_schema_version].nil? + @containerLogSchemaVersion = parsedConfig[:log_collection_settings][:schema][:containerlog_schema_version] + puts "config::Using config map setting for container log schema version" + end + rescue => errorStr + ConfigParseErrorLogger.logError("Exception while reading config map settings for container log schema version - #{errorStr}, using defaults, please check config map for errors") + end + #Get kube events enrichment setting begin if !parsedConfig[:log_collection_settings][:collect_all_kube_events].nil? && !parsedConfig[:log_collection_settings][:collect_all_kube_events][:enabled].nil? @@ -200,6 +211,7 @@ def populateSettingValuesFromConfigMap(parsedConfig) file.write("export AZMON_CLUSTER_CONTAINER_LOG_ENRICH=#{@enrichContainerLogs}\n") file.write("export AZMON_CLUSTER_COLLECT_ALL_KUBE_EVENTS=#{@collectAllKubeEvents}\n") file.write("export AZMON_CONTAINER_LOGS_ROUTE=#{@containerLogsRoute}\n") + file.write("export AZMON_CONTAINER_LOG_SCHEMA_VERSION=#{@containerLogSchemaVersion}\n") # Close file after writing all environment variables file.close puts "Both stdout & stderr log collection are turned off for namespaces: '#{@excludePath}' " @@ -246,6 +258,8 @@ def get_command_windows(env_variable_name, env_variable_value) file.write(commands) commands = get_command_windows('AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE', @containerLogsRoute) file.write(commands) + commands = get_command_windows('AZMON_CONTAINER_LOG_SCHEMA_VERSION', @containerLogSchemaVersion) + file.write(commands) # Close file after writing all environment variables file.close diff --git a/kubernetes/linux/mdsd.xml b/kubernetes/linux/mdsd.xml index 76d2104fc..49d329791 100644 --- a/kubernetes/linux/mdsd.xml +++ b/kubernetes/linux/mdsd.xml @@ -48,20 +48,31 @@ --> - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + - + + @@ -97,15 +108,22 @@ priority events to be delivered sooner than the next five-minute interval. 
--> - - - - + + + + + + + + + - @@ -118,7 +136,16 @@ - ]]> + ]]> + + + + + + + + + ]]> diff --git a/source/plugins/go/src/oms.go b/source/plugins/go/src/oms.go index 5a678781c..0bd983297 100644 --- a/source/plugins/go/src/oms.go +++ b/source/plugins/go/src/oms.go @@ -32,13 +32,16 @@ import ( // DataType for Container Log const ContainerLogDataType = "CONTAINER_LOG_BLOB" +//DataType for Container Log v2 +const ContainerLogV2DataType = "CONTAINERINSIGHTS_CONTAINERLOGV2" + // DataType for Insights metric const InsightsMetricsDataType = "INSIGHTS_METRICS_BLOB" // DataType for KubeMonAgentEvent const KubeMonAgentEventDataType = "KUBE_MON_AGENT_EVENTS_BLOB" -//env varibale which has ResourceId for LA +//env variable which has ResourceId for LA const ResourceIdEnv = "AKS_RESOURCE_ID" //env variable which has ResourceName for NON-AKS @@ -78,20 +81,26 @@ const DaemonSetContainerLogPluginConfFilePath = "/etc/opt/microsoft/docker-cimpr const ReplicaSetContainerLogPluginConfFilePath = "/etc/opt/microsoft/docker-cimprov/out_oms.conf" const WindowsContainerLogPluginConfFilePath = "/etc/omsagentwindows/out_oms.conf" -// IPName for Container Log -const IPName = "Containers" +// IPName +const IPName = "ContainerInsights" + + const defaultContainerInventoryRefreshInterval = 60 const kubeMonAgentConfigEventFlushInterval = 60 //Eventsource name in mdsd -const MdsdSourceName = "ContainerLogSource" +const MdsdContainerLogSourceName = "ContainerLogSource" +const MdsdContainerLogV2SourceName = "ContainerLogV2Source" -//container logs route - v2 (v2=flush to oneagent, adx= flush to adx ingestion, anything else flush to ODS[default]) +//container logs route (v2=flush to oneagent, adx= flush to adx ingestion, anything else flush to ODS[default]) const ContainerLogsV2Route = "v2" const ContainerLogsADXRoute = "adx" +//container logs schema (v2=ContainerLogsV2 table in LA, anything else ContainerLogs table in LA. 
This is applicable only if Container logs route is NOT ADX) +const ContainerLogV2SchemaVersion = "v2" + var ( // PluginConfiguration the plugins configuration PluginConfiguration map[string]string @@ -125,6 +134,8 @@ var ( ContainerLogsRouteV2 bool // container log route for routing thru ADX ContainerLogsRouteADX bool + // container log schema (applicable only for non-ADX route) + ContainerLogSchemaV2 bool //ADX Cluster URI AdxClusterUri string // ADX clientID @@ -180,8 +191,8 @@ var ( userAgent = "" ) -// DataItem represents the object corresponding to the json that is sent by fluentbit tail plugin -type DataItem struct { +// DataItemLAv1 == ContainerLog table in LA +type DataItemLAv1 struct { LogEntry string `json:"LogEntry"` LogEntrySource string `json:"LogEntrySource"` LogEntryTimeStamp string `json:"LogEntryTimeStamp"` @@ -193,10 +204,25 @@ type DataItem struct { Computer string `json:"Computer"` } +// DataItemLAv2 == ContainerLogV2 table in LA +// Please keep the names same as destination column names, to avoid transforming one to another in the pipeline +type DataItemLAv2 struct { + TimeGenerated string `json:"TimeGenerated"` + Computer string `json:"Computer"` + ContainerId string `json:"ContainerId"` + ContainerName string `json:"ContainerName"` + PodName string `json:"PodName"` + PodNamespace string `json:"PodNamespace"` + LogMessage string `json:"LogMessage"` + LogSource string `json:"LogSource"` + //PodLabels string `json:"PodLabels"` +} + +// DataItemADX == ContainerLogV2 table in ADX type DataItemADX struct { TimeGenerated string `json:"TimeGenerated"` Computer string `json:"Computer"` - ContainerID string `json:"ContainerID"` + ContainerId string `json:"ContainerId"` ContainerName string `json:"ContainerName"` PodName string `json:"PodName"` PodNamespace string `json:"PodNamespace"` @@ -227,10 +253,17 @@ type InsightsMetricsBlob struct { } // ContainerLogBlob represents the object corresponding to the payload that is sent to the ODS end point -type ContainerLogBlob struct { +type ContainerLogBlobLAv1 struct { DataType string `json:"DataType"` IPName string `json:"IPName"` - DataItems []DataItem `json:"DataItems"` + DataItems []DataItemLAv1 `json:"DataItems"` +} + +// ContainerLogBlob represents the object corresponding to the payload that is sent to the ODS end point +type ContainerLogBlobLAv2 struct { + DataType string `json:"DataType"` + IPName string `json:"IPName"` + DataItems []DataItemLAv2 `json:"DataItems"` } // MsgPackEntry represents the object corresponding to a single messagepack event in the messagepack stream @@ -792,7 +825,8 @@ func UpdateNumTelegrafMetricsSentTelemetry(numMetricsSent int, numSendErrors int // PostDataHelper sends data to the ODS endpoint or oneagent or ADX func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { start := time.Now() - var dataItems []DataItem + var dataItemsLAv1 []DataItemLAv1 + var dataItemsLAv2 []DataItemLAv2 var dataItemsADX []DataItemADX var msgPackEntries []MsgPackEntry @@ -830,26 +864,42 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { } stringMap = make(map[string]string) + //below id & name are used by latency telemetry in both v1 & v2 LA schemas + id := "" + name := "" logEntry := ToString(record["log"]) logEntryTimeStamp := ToString(record["time"]) - stringMap["LogEntry"] = logEntry - stringMap["LogEntrySource"] = logEntrySource - stringMap["LogEntryTimeStamp"] = logEntryTimeStamp - stringMap["SourceSystem"] = "Containers" - stringMap["Id"] = containerID - - if val, ok := 
imageIDMap[containerID]; ok { - stringMap["Image"] = val - } + //ADX Schema & LAv2 schema are almost the same (except resourceId) + if (ContainerLogSchemaV2 == true || ContainerLogsRouteADX == true) { + stringMap["Computer"] = Computer + stringMap["ContainerId"] = containerID + stringMap["ContainerName"] = containerName + stringMap["PodName"] = k8sPodName + stringMap["PodNamespace"] = k8sNamespace + stringMap["LogMessage"] = logEntry + stringMap["LogSource"] = logEntrySource + stringMap["TimeGenerated"] = logEntryTimeStamp + } else { + stringMap["LogEntry"] = logEntry + stringMap["LogEntrySource"] = logEntrySource + stringMap["LogEntryTimeStamp"] = logEntryTimeStamp + stringMap["SourceSystem"] = "Containers" + stringMap["Id"] = containerID + + if val, ok := imageIDMap[containerID]; ok { + stringMap["Image"] = val + } - if val, ok := nameIDMap[containerID]; ok { - stringMap["Name"] = val - } + if val, ok := nameIDMap[containerID]; ok { + stringMap["Name"] = val + } - stringMap["TimeOfCommand"] = start.Format(time.RFC3339) - stringMap["Computer"] = Computer - var dataItem DataItem + stringMap["TimeOfCommand"] = start.Format(time.RFC3339) + stringMap["Computer"] = Computer + } + var dataItemLAv1 DataItemLAv1 + var dataItemLAv2 DataItemLAv2 var dataItemADX DataItemADX var msgPackEntry MsgPackEntry @@ -866,50 +916,68 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { } else if ContainerLogsRouteADX == true { if ResourceCentric == true { stringMap["AzureResourceId"] = ResourceID + } else { + stringMap["AzureResourceId"] = "" } - stringMap["PodName"] = k8sPodName - stringMap["PodNamespace"] = k8sNamespace - stringMap["ContainerName"] = containerName dataItemADX = DataItemADX{ - TimeGenerated: stringMap["LogEntryTimeStamp"], + TimeGenerated: stringMap["TimeGenerated"], Computer: stringMap["Computer"], - ContainerID: stringMap["Id"], + ContainerId: stringMap["ContainerId"], ContainerName: stringMap["ContainerName"], PodName: stringMap["PodName"], PodNamespace: stringMap["PodNamespace"], - LogMessage: stringMap["LogEntry"], - LogSource: stringMap["LogEntrySource"], + LogMessage: stringMap["LogMessage"], + LogSource: stringMap["LogSource"], AzureResourceId: stringMap["AzureResourceId"], } //ADX dataItemsADX = append(dataItemsADX, dataItemADX) } else { - dataItem = DataItem{ - ID: stringMap["Id"], - LogEntry: stringMap["LogEntry"], - LogEntrySource: stringMap["LogEntrySource"], - LogEntryTimeStamp: stringMap["LogEntryTimeStamp"], - LogEntryTimeOfCommand: stringMap["TimeOfCommand"], - SourceSystem: stringMap["SourceSystem"], - Computer: stringMap["Computer"], - Image: stringMap["Image"], - Name: stringMap["Name"], + if (ContainerLogSchemaV2 == true) { + dataItemLAv2 = DataItemLAv2{ + TimeGenerated: stringMap["TimeGenerated"], + Computer: stringMap["Computer"], + ContainerId: stringMap["ContainerId"], + ContainerName: stringMap["ContainerName"], + PodName: stringMap["PodName"], + PodNamespace: stringMap["PodNamespace"], + LogMessage: stringMap["LogMessage"], + LogSource: stringMap["LogSource"], + } + //ODS-v2 schema + dataItemsLAv2 = append(dataItemsLAv2, dataItemLAv2) + name = stringMap["ContainerName"] + id = stringMap["ContainerId"] + } else { + dataItemLAv1 = DataItemLAv1{ + ID: stringMap["Id"], + LogEntry: stringMap["LogEntry"], + LogEntrySource: stringMap["LogEntrySource"], + LogEntryTimeStamp: stringMap["LogEntryTimeStamp"], + LogEntryTimeOfCommand: stringMap["TimeOfCommand"], + SourceSystem: stringMap["SourceSystem"], + Computer: stringMap["Computer"], + Image: 
stringMap["Image"], + Name: stringMap["Name"], + } + //ODS-v1 schema + dataItemsLAv1 = append(dataItemsLAv1, dataItemLAv1) + name = stringMap["Name"] + id = stringMap["Id"] } - //ODS - dataItems = append(dataItems, dataItem) } - if stringMap["LogEntryTimeStamp"] != "" { - loggedTime, e := time.Parse(time.RFC3339, stringMap["LogEntryTimeStamp"]) + if logEntryTimeStamp != "" { + loggedTime, e := time.Parse(time.RFC3339, logEntryTimeStamp) if e != nil { - message := fmt.Sprintf("Error while converting LogEntryTimeStamp for telemetry purposes: %s", e.Error()) + message := fmt.Sprintf("Error while converting logEntryTimeStamp for telemetry purposes: %s", e.Error()) Log(message) SendException(message) } else { ltncy := float64(start.Sub(loggedTime) / time.Millisecond) if ltncy >= maxLatency { maxLatency = ltncy - maxLatencyContainer = dataItem.Name + "=" + dataItem.ID + maxLatencyContainer = name + "=" + id } } } @@ -919,8 +987,12 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { if len(msgPackEntries) > 0 && ContainerLogsRouteV2 == true { //flush to mdsd + mdsdSourceName := MdsdContainerLogSourceName + if (ContainerLogSchemaV2 == true) { + mdsdSourceName = MdsdContainerLogV2SourceName + } fluentForward := MsgPackForward{ - Tag: MdsdSourceName, + Tag: mdsdSourceName, Entries: msgPackEntries, } @@ -967,7 +1039,7 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { elapsed = time.Since(start) if er != nil { - Log("Error::mdsd::Failed to write to mdsd %d records after %s. Will retry ... error : %s", len(dataItems), elapsed, er.Error()) + Log("Error::mdsd::Failed to write to mdsd %d records after %s. Will retry ... error : %s", len(msgPackEntries), elapsed, er.Error()) if MdsdMsgpUnixSocketClient != nil { MdsdMsgpUnixSocketClient.Close() MdsdMsgpUnixSocketClient = nil @@ -1013,14 +1085,14 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { } } - // Setup a maximum time for completion to be 15 Seconds. + // Setup a maximum time for completion to be 30 Seconds. ctx, cancel := context.WithTimeout(ParentContext, 30*time.Second) defer cancel() //ADXFlushMutex.Lock() //defer ADXFlushMutex.Unlock() //MultiJSON support is not there yet - if ingestionErr := ADXIngestor.FromReader(ctx, r, ingest.IngestionMappingRef("ContainerLogv2Mapping", ingest.JSON), ingest.FileFormat(ingest.JSON)); ingestionErr != nil { + if ingestionErr := ADXIngestor.FromReader(ctx, r, ingest.IngestionMappingRef("ContainerLogV2Mapping", ingest.JSON), ingest.FileFormat(ingest.JSON)); ingestionErr != nil { Log("Error when streaming to ADX Ingestion: %s", ingestionErr.Error()) //ADXIngestor = nil //not required as per ADX team. 
Will keep it to indicate that we tried this approach @@ -1035,58 +1107,75 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { numContainerLogRecords = len(dataItemsADX) Log("Success::ADX::Successfully wrote %d container log records to ADX in %s", numContainerLogRecords, elapsed) - } else { - //flush to ODS - if len(dataItems) > 0 { - logEntry := ContainerLogBlob{ - DataType: ContainerLogDataType, + } else { //ODS + var logEntry interface{} + recordType := "" + loglinesCount := 0 + //schema v2 + if (len(dataItemsLAv2) > 0 && ContainerLogSchemaV2 == true) { + logEntry = ContainerLogBlobLAv2{ + DataType: ContainerLogV2DataType, IPName: IPName, - DataItems: dataItems} - - marshalled, err := json.Marshal(logEntry) - if err != nil { - message := fmt.Sprintf("Error while Marshalling log Entry: %s", err.Error()) - Log(message) - SendException(message) - return output.FLB_OK + DataItems: dataItemsLAv2} + loglinesCount = len(dataItemsLAv2) + recordType = "ContainerLogV2" + } else { + //schema v1 + if len(dataItemsLAv1) > 0 { + logEntry = ContainerLogBlobLAv1{ + DataType: ContainerLogDataType, + IPName: IPName, + DataItems: dataItemsLAv1} + loglinesCount = len(dataItemsLAv1) + recordType = "ContainerLog" } + } - req, _ := http.NewRequest("POST", OMSEndpoint, bytes.NewBuffer(marshalled)) - req.Header.Set("Content-Type", "application/json") - req.Header.Set("User-Agent", userAgent) - reqId := uuid.New().String() - req.Header.Set("X-Request-ID", reqId) - //expensive to do string len for every request, so use a flag - if ResourceCentric == true { - req.Header.Set("x-ms-AzureResourceId", ResourceID) - } + marshalled, err := json.Marshal(logEntry) + //Log("LogEntry::e %s", marshalled) + if err != nil { + message := fmt.Sprintf("Error while Marshalling log Entry: %s", err.Error()) + Log(message) + SendException(message) + return output.FLB_OK + } - resp, err := HTTPClient.Do(req) - elapsed = time.Since(start) + req, _ := http.NewRequest("POST", OMSEndpoint, bytes.NewBuffer(marshalled)) + req.Header.Set("Content-Type", "application/json") + req.Header.Set("User-Agent", userAgent) + reqId := uuid.New().String() + req.Header.Set("X-Request-ID", reqId) + //expensive to do string len for every request, so use a flag + if ResourceCentric == true { + req.Header.Set("x-ms-AzureResourceId", ResourceID) + } + + resp, err := HTTPClient.Do(req) + elapsed = time.Since(start) - if err != nil { - message := fmt.Sprintf("Error when sending request %s \n", err.Error()) - Log(message) - // Commenting this out for now. TODO - Add better telemetry for ods errors using aggregation - //SendException(message) - Log("Failed to flush %d records after %s", len(dataItems), elapsed) + if err != nil { + message := fmt.Sprintf("Error when sending request %s \n", err.Error()) + Log(message) + // Commenting this out for now. 
TODO - Add better telemetry for ods errors using aggregation + //SendException(message) + + Log("Failed to flush %d records after %s", loglinesCount, elapsed) - return output.FLB_RETRY - } + return output.FLB_RETRY + } - if resp == nil || resp.StatusCode != 200 { - if resp != nil { - Log("RequestId %s Status %s Status Code %d", reqId, resp.Status, resp.StatusCode) - } - return output.FLB_RETRY + if resp == nil || resp.StatusCode != 200 { + if resp != nil { + Log("RequestId %s Status %s Status Code %d", reqId, resp.Status, resp.StatusCode) } + return output.FLB_RETRY + } - defer resp.Body.Close() - numContainerLogRecords = len(dataItems) - Log("PostDataHelper::Info::Successfully flushed %d container log records to ODS in %s", numContainerLogRecords, elapsed) + defer resp.Body.Close() + numContainerLogRecords = loglinesCount + Log("PostDataHelper::Info::Successfully flushed %d %s records to ODS in %s", numContainerLogRecords, recordType, elapsed) } - } ContainerLogTelemetryMutex.Lock() defer ContainerLogTelemetryMutex.Unlock() @@ -1374,10 +1463,22 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { CreateADXClient() } + ContainerLogSchemaVersion := strings.TrimSpace(strings.ToLower(os.Getenv("AZMON_CONTAINER_LOG_SCHEMA_VERSION"))) + Log("AZMON_CONTAINER_LOG_SCHEMA_VERSION:%s", ContainerLogSchemaVersion) + + ContainerLogSchemaV2 = false //default is v1 schema + + if strings.Compare(ContainerLogSchemaVersion, ContainerLogV2SchemaVersion) == 0 && ContainerLogsRouteADX != true { + ContainerLogSchemaV2 = true + Log("Container logs schema=%s", ContainerLogV2SchemaVersion) + fmt.Fprintf(os.Stdout, "Container logs schema=%s... \n", ContainerLogV2SchemaVersion) + } + if strings.Compare(strings.ToLower(os.Getenv("CONTROLLER_TYPE")), "daemonset") == 0 { populateExcludedStdoutNamespaces() populateExcludedStderrNamespaces() - if enrichContainerLogs == true && ContainerLogsRouteADX != true { + //enrichment not applicable for ADX and v2 schema + if enrichContainerLogs == true && ContainerLogsRouteADX != true && ContainerLogSchemaV2 != true { Log("ContainerLogEnrichment=true; starting goroutine to update containerimagenamemaps \n") go updateContainerImageNameMaps() } else { diff --git a/source/plugins/go/src/utils.go b/source/plugins/go/src/utils.go index 91791ae1a..61d047e52 100644 --- a/source/plugins/go/src/utils.go +++ b/source/plugins/go/src/utils.go @@ -145,7 +145,7 @@ func CreateADXClient() { //log.Fatalf("Unable to create ADX connection %s", err.Error()) } else { Log("Successfully created ADX Client. 
Creating Ingestor...") - ingestor, ingestorErr := ingest.New(client, "containerinsights", "ContainerLogv2") + ingestor, ingestorErr := ingest.New(client, "containerinsights", "ContainerLogV2") if ingestorErr != nil { Log("Error::mdsd::Unable to create ADX ingestor %s", ingestorErr.Error()) } else { diff --git a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb index 67bd61667..102cb05f2 100644 --- a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb +++ b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb @@ -25,6 +25,7 @@ class CAdvisorMetricsAPIClient @clusterLogTailPath = ENV["AZMON_LOG_TAIL_PATH"] @clusterAgentSchemaVersion = ENV["AZMON_AGENT_CFG_SCHEMA_VERSION"] @clusterContainerLogEnrich = ENV["AZMON_CLUSTER_CONTAINER_LOG_ENRICH"] + @clusterContainerLogSchemaVersion = ENV["AZMON_CONTAINER_LOG_SCHEMA_VERSION"] @dsPromInterval = ENV["TELEMETRY_DS_PROM_INTERVAL"] @dsPromFieldPassCount = ENV["TELEMETRY_DS_PROM_FIELDPASS_LENGTH"] @@ -247,7 +248,7 @@ def getContainerCpuMetricItems(metricJSON, hostName, cpuMetricNameToCollect, met telemetryProps["dsPromFDC"] = @dsPromFieldDropCount telemetryProps["dsPromUrl"] = @dsPromUrlCount end - #telemetry about containerlogs Routing for daemonset + #telemetry about containerlog Routing for daemonset if File.exist?(Constants::AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE_V2_FILENAME) telemetryProps["containerLogsRoute"] = "v2" elsif (!@containerLogsRoute.nil? && !@containerLogsRoute.empty?) @@ -263,6 +264,10 @@ def getContainerCpuMetricItems(metricJSON, hostName, cpuMetricNameToCollect, met elsif (!@npmIntegrationBasic.nil? && !@npmIntegrationBasic.empty?) telemetryProps["int-npm-b"] = "1" end + #telemetry for Container log schema version clusterContainerLogSchemaVersion + if (!@clusterContainerLogSchemaVersion.nil? && !@clusterContainerLogSchemaVersion.empty?) + telemetryProps["containerLogVer"] = @clusterContainerLogSchemaVersion + end ApplicationInsightsUtility.sendMetricTelemetry(metricNametoReturn, metricValue, telemetryProps) end end From 6031be8b71b11ec9352bc01c61195092fb589c9b Mon Sep 17 00:00:00 2001 From: Michael Sinz <36865706+Michael-Sinz@users.noreply.github.com> Date: Tue, 9 Feb 2021 09:46:29 -0800 Subject: [PATCH 069/194] Add priority class to the daemonsets (#500) * Add priority class to the daemonsets Add a priority class for omsagent and have the daemonsets use this to be sure to schedule the pods. Daemonset pods are constrained in scheduling to run on specific nodes. This is done by the daemonset controller. When a node shows up it will create a pod with a strong affinity to that node. When a node goes away, it will delete the pod with the node affinity to that node. Kubernetes pod scheduling does not know it is a daemonset but it does know it is tied to a specific node. With default scheduling, it is possible for the pods to be "frozen out" of a node because the node already is full. This can happen because "normal" pods may already exist and are looking for a node to get scheduled on when a node is added to the cluster. The daemonset controller will only first create the pod for the node at around the same time. The kubernetes scheduler is running async from all of this and thus there can be a race as to who gets scheduled on the node. The pod priority class (and thus the pod priority) is a way to indicate that the pod has a higher scheduling priority than a default pod. By default, all pods are at priority 0. Higher numbers are higher priority. 
Setting the priority to something greater than zero will allow the omsagent daemonsets to win a race against "normal" pods for scheduled resources on a node - and will also allow for graceful eviction in the case the node is too full. Without this, omsagent can be left out of node in clusters that are very busy, especially in dynamic scaling situations. I did not test the windows pod as we have no windows clusters. * CR feedback --- charts/azuremonitor-containers/README.md | 2 ++ .../templates/omsagent-daemonset-windows.yaml | 5 +-- .../templates/omsagent-daemonset.yaml | 13 +++---- .../templates/omsagent-priorityclass.yaml | 22 ++++++++++++ charts/azuremonitor-containers/values.yaml | 36 +++++++++++++------ 5 files changed, 59 insertions(+), 19 deletions(-) create mode 100644 charts/azuremonitor-containers/templates/omsagent-priorityclass.yaml diff --git a/charts/azuremonitor-containers/README.md b/charts/azuremonitor-containers/README.md index 469fac94a..a3f17b509 100644 --- a/charts/azuremonitor-containers/README.md +++ b/charts/azuremonitor-containers/README.md @@ -93,6 +93,7 @@ The following table lists the configurable parameters of the MSOMS chart and the | `omsagent.env.clusterName` | Name of your cluster | Does not have a default value, needs to be provided | | `omsagent.rbac` | rbac enabled/disabled | true (i.e.enabled) | | `omsagent.proxy` | Proxy endpoint | Doesnt have default value. Refer to [configure proxy](#Configuring-Proxy-Endpoint) | +| `omsagent.priority` | DaemonSet Pod Priority | This is the [priority](https://kubernetes.io/docs/concepts/configuration/pod-priority-preemption/) to use for the daemonsets such that they get scheduled onto the node ahead of "normal" pods - must be an integer, defaults to 10 | > Note: For Azure Manage K8s clusters such as Azure Arc K8s and ARO v4, `omsagent.env.clusterId` with fully qualified azure resource id of the cluster should be used instead of `omsagent.env.clusterName` @@ -100,6 +101,7 @@ The following table lists the configurable parameters of the MSOMS chart and the - Parameter `omsagent.env.doNotCollectKubeSystemLogs` has been removed starting chart version 1.0.0. Refer to 'Agent data collection settings' section below to configure it using configmap. - onboarding of multiple clusters with the same cluster name to same log analytics workspace not supported. If need this configuration, use the cluster FQDN name rather than cluster dns prefix to avoid collision with clusterName +- The `omsagent.priority` parameter sets the priority of the omsagent daemonset priority class. This pod priority class is used for daemonsets to allow them to have priority over pods that can be scheduled elsewhere. Without a priority class, it is possible for a node to fill up with "normal" pods before the daemonset pods get to be created for the node or get scheduled. Note that pods are not "daemonset" pods - they are just pods created by the daemonset controller but they have a specific affinity set during creation to the specific node each pod was created to run on. You want this value to be greater than 0 (default is 10) and generally greater than pods that have the flexibility to run on different nodes such that they do not block the node specific pods. 
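For illustration, the PriorityClass object the chart renders could equivalently be created through the Kubernetes API. A minimal client-go sketch, assuming in-cluster credentials; the name, value, and description mirror the chart defaults:

```go
package main

import (
	"context"

	schedulingv1 "k8s.io/api/scheduling/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/rest"
)

func main() {
	cfg, err := rest.InClusterConfig() // assumes this runs inside the cluster
	if err != nil {
		panic(err)
	}
	client, err := kubernetes.NewForConfig(cfg)
	if err != nil {
		panic(err)
	}
	pc := &schedulingv1.PriorityClass{
		ObjectMeta:    metav1.ObjectMeta{Name: "omsagent"},
		Value:         10,    // > 0 so daemonset pods win scheduling races against default-priority pods
		GlobalDefault: false, // only pods that set priorityClassName: omsagent opt in
		Description:   "Daemonset priority class for omsagent",
	}
	if _, err := client.SchedulingV1().PriorityClasses().Create(context.TODO(), pc, metav1.CreateOptions{}); err != nil {
		panic(err)
	}
}
```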
## Agent data collection settings diff --git a/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml b/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml index 81003c704..82d210f3d 100644 --- a/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml @@ -27,10 +27,11 @@ spec: checksum/secret: {{ include (print $.Template.BasePath "/omsagent-secret.yaml") . | sha256sum }} checksum/config: {{ toYaml .Values.omsagent.resources | sha256sum }} spec: - dnsConfig: + priorityClassName: omsagent + dnsConfig: options: - name: ndots - value: "3" + value: "3" {{- if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion }} nodeSelector: kubernetes.io/os: windows diff --git a/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml b/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml index 595edd7bb..0272c6263 100644 --- a/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml @@ -28,10 +28,11 @@ spec: checksum/config: {{ toYaml .Values.omsagent.resources | sha256sum }} checksum/logsettings: {{ toYaml .Values.omsagent.logsettings | sha256sum }} spec: - dnsConfig: + priorityClassName: omsagent + dnsConfig: options: - name: ndots - value: "3" + value: "3" {{- if .Values.omsagent.rbac }} serviceAccountName: omsagent {{- end }} @@ -70,12 +71,12 @@ spec: valueFrom: fieldRef: fieldPath: status.hostIP - {{- if not (empty .Values.Azure.Extension.Name) }} + {{- if not (empty .Values.Azure.Extension.Name) }} - name: ARC_K8S_EXTENSION_NAME - value: {{ .Values.Azure.Extension.Name | quote }} - {{- end }} + value: {{ .Values.Azure.Extension.Name | quote }} + {{- end }} - name: USER_ASSIGNED_IDENTITY_CLIENT_ID - value: "" + value: "" {{- if .Values.omsagent.logsettings.logflushintervalsecs }} - name: FBIT_SERVICE_FLUSH_INTERVAL value: {{ .Values.omsagent.logsettings.logflushintervalsecs | quote }} diff --git a/charts/azuremonitor-containers/templates/omsagent-priorityclass.yaml b/charts/azuremonitor-containers/templates/omsagent-priorityclass.yaml new file mode 100644 index 000000000..4d9980ab3 --- /dev/null +++ b/charts/azuremonitor-containers/templates/omsagent-priorityclass.yaml @@ -0,0 +1,22 @@ +{{- if and (ne .Values.omsagent.secret.key "") (ne .Values.omsagent.secret.wsid "") (or (ne .Values.omsagent.env.clusterName "") (ne .Values.omsagent.env.clusterId "") (ne .Values.Azure.Cluster.ResourceId "") )}} +# This pod priority class is used for daemonsets to allow them to have priority +# over pods that can be scheduled elsewhere. Without a priority class, it is +# possible for a node to fill up with pods before the daemonset pods get to be +# created for the node or get scheduled. 
Note that pods are not "daemonset"
+# pods - they are just pods created by the daemonset controller but they have
+# a specific affinity set during creation to the specific node each pod was
+# created to run on (daemonset controller takes care of that)
+apiVersion: scheduling.k8s.io/v1
+kind: PriorityClass
+metadata:
+  name: omsagent
+  # Priority classes don't have labels :-)
+  annotations:
+    chart: {{ .Chart.Name }}-{{ .Chart.Version | replace "+" "_" }}
+    release: {{ .Release.Name }}
+    heritage: {{ .Release.Service }}
+    component: oms-agent
+value: {{ .Values.omsagent.priority }}
+globalDefault: false
+description: "This is the daemonset priority class for omsagent"
+{{- end }}

diff --git a/charts/azuremonitor-containers/values.yaml b/charts/azuremonitor-containers/values.yaml
index 341b9fb65..5601a5738 100644
--- a/charts/azuremonitor-containers/values.yaml
+++ b/charts/azuremonitor-containers/values.yaml
@@ -8,9 +8,9 @@
 Azure:
   Cluster:
     Region:
-    ResourceId:
+    ResourceId:
   Extension:
-    Name: ""
+    Name: ""
     ResourceId: ""
 omsagent:
   image:
@@ -20,6 +20,20 @@ omsagent:
     pullPolicy: IfNotPresent
   dockerProviderVersion: "12.0.0-0"
   agentVersion: "1.10.0.1"
+
+  # The priority used by the omsagent priority class for the daemonset pods
+  # Note that this is not execution priority - it is scheduling priority, as
+  # in getting scheduled to the node. This needs to be greater than 0 such
+  # that the daemonset pods, which cannot schedule onto different nodes as
+  # they are defined to run on specific nodes, are not accidentally frozen
+  # out of a node due to other pods showing up earlier in scheduling.
+  # (DaemonSet pods by definition only are created once the node exists for
+  # them to be created for and thus it is possible to have "normal" pods
+  # already in line to run on the node before the DaemonSet controller got a
+  # chance to build a pod for the node and give it to the scheduler)
+  # Should be some number greater than default (0)
+  priority: 10
+
 ## To get your workspace id and key do the following
 ## You can create a Azure Loganalytics workspace from portal.azure.com and get its ID & PRIMARY KEY from 'Advanced Settings' tab in the Ux.
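Helm charts render through Go's template engine, so the path from the `priority` value above into the PriorityClass manifest can be sketched directly in Go (the `empty` helper stands in for Helm's sprig function of the same name, and the value layout is a simplified stand-in for the chart's values):

```go
package main

import (
	"os"
	"text/template"
)

func main() {
	// "empty" mimics the sprig helper for the string case the chart uses.
	funcs := template.FuncMap{"empty": func(s string) bool { return s == "" }}
	manifest := `value: {{ .Values.omsagent.priority }}
{{- if not (empty .Values.Azure.Extension.Name) }}
extensionName: {{ .Values.Azure.Extension.Name }}
{{- end }}
`
	tmpl := template.Must(template.New("pc").Funcs(funcs).Parse(manifest))
	values := map[string]interface{}{
		"Values": map[string]interface{}{
			"omsagent": map[string]interface{}{"priority": 10},
			"Azure": map[string]interface{}{
				"Extension": map[string]interface{}{"Name": ""},
			},
		},
	}
	// Prints "value: 10"; extensionName is omitted because Name is empty.
	if err := tmpl.Execute(os.Stdout, values); err != nil {
		panic(err)
	}
}
```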
@@ -61,7 +75,7 @@ omsagent: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: - labelSelector: - matchExpressions: + matchExpressions: - key: kubernetes.io/os operator: In values: @@ -70,10 +84,10 @@ omsagent: operator: NotIn values: - virtual-kubelet - - key: kubernetes.io/arch + - key: kubernetes.io/arch operator: In values: - - amd64 + - amd64 nodeSelectorTerms: - labelSelector: matchExpressions: @@ -85,10 +99,10 @@ omsagent: operator: NotIn values: - virtual-kubelet - - key: beta.kubernetes.io/arch + - key: beta.kubernetes.io/arch operator: In values: - - amd64 + - amd64 deployment: affinity: nodeAffinity: @@ -117,10 +131,10 @@ omsagent: operator: NotIn values: - master - - key: kubernetes.io/arch + - key: kubernetes.io/arch operator: In values: - - amd64 + - amd64 nodeSelectorTerms: - labelSelector: matchExpressions: @@ -136,10 +150,10 @@ omsagent: operator: NotIn values: - master - - key: beta.kubernetes.io/arch + - key: beta.kubernetes.io/arch operator: In values: - - amd64 + - amd64 ## Configure resource requests and limits ## ref: http://kubernetes.io/docs/user-guide/compute-resources/ ## From 4212e1a6ee1225f2c1280bf5b58070877cf55890 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Thu, 11 Feb 2021 09:15:23 -0800 Subject: [PATCH 070/194] fix node metric issue (#502) --- source/plugins/ruby/kubelet_utils.rb | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/source/plugins/ruby/kubelet_utils.rb b/source/plugins/ruby/kubelet_utils.rb index 599640d8f..bd2bd75b7 100644 --- a/source/plugins/ruby/kubelet_utils.rb +++ b/source/plugins/ruby/kubelet_utils.rb @@ -20,10 +20,12 @@ def get_node_capacity response = CAdvisorMetricsAPIClient.getAllMetricsCAdvisor(winNode: nil) if !response.nil? && !response.body.nil? 
- all_metrics = response.body.split("\n") - cpu_capacity = all_metrics.select{|m| m.start_with?('machine_cpu_cores') && m.split.first.strip == 'machine_cpu_cores' }.first.split.last.to_f * 1000 + all_metrics = response.body.split("\n") + #cadvisor machine metrics can exist with (>=1.19) or without dimensions (<1.19) + #so just checking startswith of metric name would be good enough to pick the metric value from exposition format + cpu_capacity = all_metrics.select{|m| m.start_with?('machine_cpu_cores') }.first.split.last.to_f * 1000 @log.info "CPU Capacity #{cpu_capacity}" - memory_capacity_e = all_metrics.select{|m| m.start_with?('machine_memory_bytes') && m.split.first.strip == 'machine_memory_bytes' }.first.split.last + memory_capacity_e = all_metrics.select{|m| m.start_with?('machine_memory_bytes') }.first.split.last memory_capacity = BigDecimal(memory_capacity_e).to_f @log.info "Memory Capacity #{memory_capacity}" return [cpu_capacity, memory_capacity] From 24644ce31b9a4ab003c3ebcfc4165a9d0899eaca Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Thu, 18 Feb 2021 12:53:24 -0800 Subject: [PATCH 071/194] Bug fixes for Feb release (#504) * bug fix for mdm metrics with no limits * fix exception bug --- .../plugins/ruby/CAdvisorMetricsAPIClient.rb | 131 +++++++++--------- source/plugins/ruby/kubelet_utils.rb | 12 +- 2 files changed, 73 insertions(+), 70 deletions(-) diff --git a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb index 102cb05f2..8cb6f603e 100644 --- a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb +++ b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb @@ -66,6 +66,7 @@ class CAdvisorMetricsAPIClient #cadvisor ports @@CADVISOR_SECURE_PORT = "10250" @@CADVISOR_NON_SECURE_PORT = "10255" + def initialize end @@ -86,40 +87,40 @@ def getPodsFromCAdvisor(winNode: nil) end def getBaseCAdvisorUri(winNode) - cAdvisorSecurePort = isCAdvisorOnSecurePort() + cAdvisorSecurePort = isCAdvisorOnSecurePort() + + if !!cAdvisorSecurePort == true + defaultHost = "https://localhost:#{@@CADVISOR_SECURE_PORT}" + else + defaultHost = "http://localhost:#{@@CADVISOR_NON_SECURE_PORT}" + end + + if !winNode.nil? + nodeIP = winNode["InternalIP"] + else + nodeIP = ENV["NODE_IP"] + end + if !nodeIP.nil? + @Log.info("Using #{nodeIP} for CAdvisor Host") if !!cAdvisorSecurePort == true - defaultHost = "https://localhost:#{@@CADVISOR_SECURE_PORT}" + return "https://#{nodeIP}:#{@@CADVISOR_SECURE_PORT}" else - defaultHost = "http://localhost:#{@@CADVISOR_NON_SECURE_PORT}" + return "http://#{nodeIP}:#{@@CADVISOR_NON_SECURE_PORT}" end - + else + @Log.warn ("NODE_IP environment variable not set. Using default as : #{defaultHost}") if !winNode.nil? - nodeIP = winNode["InternalIP"] - else - nodeIP = ENV["NODE_IP"] - end - - if !nodeIP.nil? - @Log.info("Using #{nodeIP} for CAdvisor Host") - if !!cAdvisorSecurePort == true - return "https://#{nodeIP}:#{@@CADVISOR_SECURE_PORT}" - else - return "http://#{nodeIP}:#{@@CADVISOR_NON_SECURE_PORT}" - end + return nil else - @Log.warn ("NODE_IP environment variable not set. Using default as : #{defaultHost}") - if !winNode.nil? 
- return nil - else - return defaultHost - end + return defaultHost end + end end def getCAdvisorUri(winNode, relativeUri) - baseUri = getBaseCAdvisorUri(winNode) - return baseUri + relativeUri + baseUri = getBaseCAdvisorUri(winNode) + return baseUri + relativeUri end def getMetrics(winNode: nil, metricTime: Time.now.utc.iso8601) @@ -254,20 +255,20 @@ def getContainerCpuMetricItems(metricJSON, hostName, cpuMetricNameToCollect, met elsif (!@containerLogsRoute.nil? && !@containerLogsRoute.empty?) telemetryProps["containerLogsRoute"] = @containerLogsRoute end - #telemetry about health model - if (!@hmEnabled.nil? && !@hmEnabled.empty?) + #telemetry about health model + if (!@hmEnabled.nil? && !@hmEnabled.empty?) telemetryProps["hmEnabled"] = @hmEnabled - end - #telemetry for npm integration - if (!@npmIntegrationAdvanced.nil? && !@npmIntegrationAdvanced.empty?) - telemetryProps["int-npm-a"] = "1" - elsif (!@npmIntegrationBasic.nil? && !@npmIntegrationBasic.empty?) - telemetryProps["int-npm-b"] = "1" - end - #telemetry for Container log schema version clusterContainerLogSchemaVersion - if (!@clusterContainerLogSchemaVersion.nil? && !@clusterContainerLogSchemaVersion.empty?) + end + #telemetry for npm integration + if (!@npmIntegrationAdvanced.nil? && !@npmIntegrationAdvanced.empty?) + telemetryProps["int-npm-a"] = "1" + elsif (!@npmIntegrationBasic.nil? && !@npmIntegrationBasic.empty?) + telemetryProps["int-npm-b"] = "1" + end + #telemetry for Container log schema version clusterContainerLogSchemaVersion + if (!@clusterContainerLogSchemaVersion.nil? && !@clusterContainerLogSchemaVersion.empty?) telemetryProps["containerLogVer"] = @clusterContainerLogSchemaVersion - end + end ApplicationInsightsUtility.sendMetricTelemetry(metricNametoReturn, metricValue, telemetryProps) end end @@ -308,8 +309,8 @@ def getInsightsMetrics(winNode: nil, metricTime: Time.now.utc.iso8601) end if !metricInfo.nil? 
metricDataItems.concat(getContainerGpuMetricsAsInsightsMetrics(metricInfo, hostName, "memoryTotal", "containerGpumemoryTotalBytes", metricTime)) - metricDataItems.concat(getContainerGpuMetricsAsInsightsMetrics(metricInfo, hostName, "memoryUsed","containerGpumemoryUsedBytes", metricTime)) - metricDataItems.concat(getContainerGpuMetricsAsInsightsMetrics(metricInfo, hostName, "dutyCycle","containerGpuDutyCycle", metricTime)) + metricDataItems.concat(getContainerGpuMetricsAsInsightsMetrics(metricInfo, hostName, "memoryUsed", "containerGpumemoryUsedBytes", metricTime)) + metricDataItems.concat(getContainerGpuMetricsAsInsightsMetrics(metricInfo, hostName, "dutyCycle", "containerGpuDutyCycle", metricTime)) metricDataItems.concat(getPersistentVolumeMetrics(metricInfo, hostName, "usedBytes", Constants::PV_USED_BYTES, metricTime)) else @@ -332,7 +333,6 @@ def getPersistentVolumeMetrics(metricJSON, hostName, metricNameToCollect, metric begin metricInfo = metricJSON metricInfo["pods"].each do |pod| - podNamespace = pod["podRef"]["namespace"] excludeNamespace = false if (podNamespace.downcase == "kube-system") && @pvKubeSystemCollectionMetricsEnabled == "false" @@ -356,11 +356,11 @@ def getPersistentVolumeMetrics(metricJSON, hostName, metricNameToCollect, metric metricItem["Computer"] = hostName metricItem["Name"] = metricNameToReturn metricItem["Value"] = volume[metricNameToCollect] - metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN + metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN metricItem["Namespace"] = Constants::INSIGTHTSMETRICS_TAGS_PV_NAMESPACE - + metricTags = {} - metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID ] = clusterId + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID] = clusterId metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = clusterName metricTags[Constants::INSIGHTSMETRICS_TAGS_POD_UID] = podUid metricTags[Constants::INSIGHTSMETRICS_TAGS_POD_NAME] = podName @@ -370,7 +370,7 @@ def getPersistentVolumeMetrics(metricJSON, hostName, metricNameToCollect, metric metricTags[Constants::INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES] = volume["capacityBytes"] metricItem["Tags"] = metricTags - + metricItems.push(metricItem) end end @@ -395,7 +395,6 @@ def getPersistentVolumeMetrics(metricJSON, hostName, metricNameToCollect, metric return metricItems end - def getContainerGpuMetricsAsInsightsMetrics(metricJSON, hostName, metricNameToCollect, metricNametoReturn, metricPollTime) metricItems = [] clusterId = KubernetesApiClient.getClusterId @@ -415,18 +414,17 @@ def getContainerGpuMetricsAsInsightsMetrics(metricJSON, hostName, metricNameToCo if (!accelerator[metricNameToCollect].nil?) 
#empty check is invalid for non-strings containerName = container["name"] metricValue = accelerator[metricNameToCollect] - metricItem = {} metricItem["CollectionTime"] = metricPollTime metricItem["Computer"] = hostName metricItem["Name"] = metricNametoReturn metricItem["Value"] = metricValue - metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN + metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN metricItem["Namespace"] = Constants::INSIGHTSMETRICS_TAGS_GPU_NAMESPACE - + metricTags = {} - metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID ] = clusterId + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID] = clusterId metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = clusterName metricTags[Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] = podUid + "/" + containerName #metricTags[Constants::INSIGHTSMETRICS_TAGS_K8SNAMESPACE] = podNameSpace @@ -442,9 +440,9 @@ def getContainerGpuMetricsAsInsightsMetrics(metricJSON, hostName, metricNameToCo if (!accelerator["id"].nil? && !accelerator["id"].empty?) metricTags[Constants::INSIGHTSMETRICS_TAGS_GPU_ID] = accelerator["id"] end - + metricItem["Tags"] = metricTags - + metricItems.push(metricItem) end end @@ -921,13 +919,13 @@ def getResponse(winNode, relativeUri) uri = URI.parse(cAdvisorUri) if isCAdvisorOnSecurePort() Net::HTTP.start(uri.host, uri.port, - :use_ssl => true, :open_timeout => 20, :read_timeout => 40, - :ca_file => "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt", - :verify_mode => OpenSSL::SSL::VERIFY_NONE) do |http| - cAdvisorApiRequest = Net::HTTP::Get.new(uri.request_uri) - cAdvisorApiRequest["Authorization"] = "Bearer #{bearerToken}" - response = http.request(cAdvisorApiRequest) - @Log.info "Got response code #{response.code} from #{uri.request_uri}" + :use_ssl => true, :open_timeout => 20, :read_timeout => 40, + :ca_file => "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt", + :verify_mode => OpenSSL::SSL::VERIFY_NONE) do |http| + cAdvisorApiRequest = Net::HTTP::Get.new(uri.request_uri) + cAdvisorApiRequest["Authorization"] = "Bearer #{bearerToken}" + response = http.request(cAdvisorApiRequest) + @Log.info "Got response code #{response.code} from #{uri.request_uri}" end else Net::HTTP.start(uri.host, uri.port, :use_ssl => false, :open_timeout => 20, :read_timeout => 40) do |http| @@ -940,19 +938,24 @@ def getResponse(winNode, relativeUri) rescue => error @Log.warn("CAdvisor api request for #{cAdvisorUri} failed: #{error}") telemetryProps = {} - telemetryProps["Computer"] = winNode["Hostname"] + if !winNode.nil? + hostName = winNode["Hostname"] + else + hostName = (OMS::Common.get_hostname) + end + telemetryProps["Computer"] = hostName ApplicationInsightsUtility.sendExceptionTelemetry(error, telemetryProps) end return response end def isCAdvisorOnSecurePort - cAdvisorSecurePort = false - # Check to see whether omsagent needs to use 10255(insecure) port or 10250(secure) port - if !@cAdvisorMetricsSecurePort.nil? && @cAdvisorMetricsSecurePort == "true" - cAdvisorSecurePort = true - end - return cAdvisorSecurePort + cAdvisorSecurePort = false + # Check to see whether omsagent needs to use 10255(insecure) port or 10250(secure) port + if !@cAdvisorMetricsSecurePort.nil? 
&& @cAdvisorMetricsSecurePort == "true" + cAdvisorSecurePort = true + end + return cAdvisorSecurePort end end end diff --git a/source/plugins/ruby/kubelet_utils.rb b/source/plugins/ruby/kubelet_utils.rb index bd2bd75b7..e2c731b79 100644 --- a/source/plugins/ruby/kubelet_utils.rb +++ b/source/plugins/ruby/kubelet_utils.rb @@ -20,12 +20,12 @@ def get_node_capacity response = CAdvisorMetricsAPIClient.getAllMetricsCAdvisor(winNode: nil) if !response.nil? && !response.body.nil? - all_metrics = response.body.split("\n") - #cadvisor machine metrics can exist with (>=1.19) or without dimensions (<1.19) + all_metrics = response.body.split("\n") + #cadvisor machine metrics can exist with (>=1.19) or without dimensions (<1.19) #so just checking startswith of metric name would be good enough to pick the metric value from exposition format - cpu_capacity = all_metrics.select{|m| m.start_with?('machine_cpu_cores') }.first.split.last.to_f * 1000 + cpu_capacity = all_metrics.select { |m| m.start_with?("machine_cpu_cores") }.first.split.last.to_f * 1000 @log.info "CPU Capacity #{cpu_capacity}" - memory_capacity_e = all_metrics.select{|m| m.start_with?('machine_memory_bytes') }.first.split.last + memory_capacity_e = all_metrics.select { |m| m.start_with?("machine_memory_bytes") }.first.split.last memory_capacity = BigDecimal(memory_capacity_e).to_f @log.info "Memory Capacity #{memory_capacity}" return [cpu_capacity, memory_capacity] @@ -89,9 +89,9 @@ def get_all_container_limits @log.info "cpuLimit: #{cpuLimit}" @log.info "memoryLimit: #{memoryLimit}" # Get cpu limit in nanocores - containerCpuLimitHash[key] = !cpuLimit.nil? ? KubernetesApiClient.getMetricNumericValue("cpu", cpuLimit) : 0 + containerCpuLimitHash[key] = !cpuLimit.nil? ? KubernetesApiClient.getMetricNumericValue("cpu", cpuLimit) : nil # Get memory limit in bytes - containerMemoryLimitHash[key] = !memoryLimit.nil? ? KubernetesApiClient.getMetricNumericValue("memory", memoryLimit) : 0 + containerMemoryLimitHash[key] = !memoryLimit.nil? ? KubernetesApiClient.getMetricNumericValue("memory", memoryLimit) : nil end end end From e56104c35f2df6a1a8f0a2fa72dc7921b47fb508 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 22 Feb 2021 18:00:03 -0800 Subject: [PATCH 072/194] Gangams/feb 2021 agent bug fix (#505) * fix npe in getKubeServiceRecords * use image fields from spec * fix typo * cover all cases * handle scenario only digest specified --- source/plugins/ruby/KubernetesApiClient.rb | 2 +- .../ruby/kubernetes_container_inventory.rb | 71 ++++++++++++------- 2 files changed, 48 insertions(+), 25 deletions(-) diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index aca2142a0..c5a363741 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -791,7 +791,7 @@ def getKubeAPIServerUrl def getKubeServicesInventoryRecords(serviceList, batchTime = Time.utc.iso8601) kubeServiceRecords = [] begin - if (!serviceList.nil? && !serviceList.empty?) + if (!serviceList.nil? && !serviceList.empty? && serviceList.key?("items") && !serviceList["items"].nil? && !serviceList["items"].empty? 
) servicesCount = serviceList["items"].length @Log.info("KubernetesApiClient::getKubeServicesInventoryRecords : number of services in serviceList #{servicesCount} @ #{Time.now.utc.iso8601}") serviceList["items"].each do |item| diff --git a/source/plugins/ruby/kubernetes_container_inventory.rb b/source/plugins/ruby/kubernetes_container_inventory.rb index 69beca493..82e36c8cc 100644 --- a/source/plugins/ruby/kubernetes_container_inventory.rb +++ b/source/plugins/ruby/kubernetes_container_inventory.rb @@ -50,30 +50,7 @@ def getContainerInventoryRecords(podItem, batchTime, clusterCollectEnvironmentVa if !atLocation.nil? containerInventoryRecord["ImageId"] = imageIdValue[(atLocation + 1)..-1] end - end - # image is of the format - repository/image:imagetag - imageValue = containerStatus["image"] - if !imageValue.nil? && !imageValue.empty? - # Find delimiters in the string of format repository/image:imagetag - slashLocation = imageValue.index("/") - colonLocation = imageValue.index(":") - if !colonLocation.nil? - if slashLocation.nil? - # image:imagetag - containerInventoryRecord["Image"] = imageValue[0..(colonLocation - 1)] - else - # repository/image:imagetag - containerInventoryRecord["Repository"] = imageValue[0..(slashLocation - 1)] - containerInventoryRecord["Image"] = imageValue[(slashLocation + 1)..(colonLocation - 1)] - end - containerInventoryRecord["ImageTag"] = imageValue[(colonLocation + 1)..-1] - end - elsif !imageIdValue.nil? && !imageIdValue.empty? - # Getting repo information from imageIdValue when no tag in ImageId - if !atLocation.nil? - containerInventoryRecord["Repository"] = imageIdValue[0..(atLocation - 1)] - end - end + end containerInventoryRecord["ExitCode"] = 0 isContainerTerminated = false isContainerWaiting = false @@ -107,6 +84,51 @@ def getContainerInventoryRecords(podItem, batchTime, clusterCollectEnvironmentVa end containerInfoMap = containersInfoMap[containerName] + # image can be in any one of below format in spec + # repository/image[:imagetag | @digest], repository/image:imagetag@digest, repo/image, image:imagetag, image@digest, image + imageValue = containerInfoMap["image"] + if !imageValue.nil? && !imageValue.empty? + # Find delimiters in image format + atLocation = imageValue.index("@") + isDigestSpecified = false + if !atLocation.nil? + # repository/image@digest or repository/image:imagetag@digest, image@digest + imageValue = imageValue[0..(atLocation - 1)] + # Use Digest from the spec's image in case when the status doesnt get populated i.e. container in pending or image pull back etc. + if containerInventoryRecord["ImageId"].nil? || containerInventoryRecord["ImageId"].empty? + containerInventoryRecord["ImageId"] = imageValue[(atLocation + 1)..-1] + end + isDigestSpecified = true + end + slashLocation = imageValue.index("/") + colonLocation = imageValue.index(":") + if !colonLocation.nil? + if slashLocation.nil? + # image:imagetag + containerInventoryRecord["Image"] = imageValue[0..(colonLocation - 1)] + else + # repository/image:imagetag + containerInventoryRecord["Repository"] = imageValue[0..(slashLocation - 1)] + containerInventoryRecord["Image"] = imageValue[(slashLocation + 1)..(colonLocation - 1)] + end + containerInventoryRecord["ImageTag"] = imageValue[(colonLocation + 1)..-1] + else + if slashLocation.nil? 
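            # Annotation (illustrative image names, assumed): this no-tag branch
            # handles, for example,
            #   "nginx"           -> Image "nginx"
            #   "docker.io/nginx" -> Repository "docker.io", Image "nginx"
            # and, when no digest was present either, ImageTag is defaulted to
            # "latest" just below, matching the kubelet/docker convention.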
+ # image + containerInventoryRecord["Image"] = imageValue + else + # repo/image + containerInventoryRecord["Repository"] = imageValue[0..(slashLocation - 1)] + containerInventoryRecord["Image"] = imageValue[(slashLocation + 1)..-1] + end + # if no tag specified, k8s assumes latest as imagetag and this is same behavior from docker API and from status. + # Ref - https://kubernetes.io/docs/concepts/containers/images/#image-names + if isDigestSpecified == false + containerInventoryRecord["ImageTag"] = "latest" + end + end + end + podName = containerInfoMap["PodName"] namespace = containerInfoMap["Namespace"] # containername in the format what docker sees @@ -165,6 +187,7 @@ def getContainersInfoMap(podItem, isWindows) podContainers.each do |container| containerInfoMap = {} containerName = container["name"] + containerInfoMap["image"] = container["image"] containerInfoMap["ElementName"] = containerName containerInfoMap["Computer"] = nodeName containerInfoMap["PodName"] = podName From e00b2aabf9609f76b9ce13c3397cc290e0318dd9 Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Tue, 23 Feb 2021 13:08:51 -0800 Subject: [PATCH 073/194] changes for release -ciprod02232021 (#506) --- ReleaseNotes.md | 17 +++++++++++++++-- build/version | 4 ++-- charts/azuremonitor-containers/Chart.yaml | 2 +- charts/azuremonitor-containers/values.yaml | 6 +++--- kubernetes/linux/Dockerfile | 2 +- kubernetes/omsagent.yaml | 12 ++++++------ kubernetes/windows/Dockerfile | 2 +- .../onboarding/managed/enable-monitoring.ps1 | 2 +- scripts/onboarding/managed/enable-monitoring.sh | 2 +- .../onboarding/managed/upgrade-monitoring.sh | 2 +- 10 files changed, 32 insertions(+), 19 deletions(-) diff --git a/ReleaseNotes.md b/ReleaseNotes.md index b1eb316a1..80d6f188d 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -10,6 +10,20 @@ additional questions or comments. ## Release History Note : The agent version(s) below has dates (ciprod), which indicate the agent build dates (not release dates) +### 02/23/2021 - +##### Version microsoft/oms:ciprod02232021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod02232021 (linux) +##### Version microsoft/oms:win-ciprod02232021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod02232021 (windows) +##### Code change log +- ContainerLogV2 schema support for LogAnalytics & ADX (not usable externally yet) +- Fix nodemetrics (cpuusageprecentage & memoryusagepercentage) metrics not flowing. This is fixed upstream for k8s versions >= 1.19.7 and >=1.20.2. 
+- Fix cpu & memory usage exceeded threshold container metrics not flowing when requests and/or limits were not set +- Mute some unused exceptions from going to telemetry +- Collect containerimage (repository, image & imagetag) from spec (instead of runtime) +- Add support for extension MSI for k8s arc +- Use cloud specific instrumentation keys for telemetry +- Picked up newer version for apt +- Add priority class to daemonset (in our chart only) + ### 01/11/2021 - ##### Version microsoft/oms:ciprod01112021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod01112021 (linux) ##### Version microsoft/oms:win-ciprod01112021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod01112021 (windows) @@ -27,7 +41,6 @@ Note : The agent version(s) below has dates (ciprod), which indicate t - Enable ADX route for windows container logs - Remove logging to termination log in windows agent liveness probe - ### 11/09/2020 - ##### Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod11092020 (linux) ##### Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod11092020 (windows) @@ -36,7 +49,7 @@ Note : The agent version(s) below has dates (ciprod), which indicate t ### 10/27/2020 - ##### Version microsoft/oms:ciprod10272020 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10272020 (linux) -##### Version microsoft/oms:win-ciprod10272020 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod10052020 (windows) +##### Version microsoft/oms:win-ciprod10272020 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod10272020 (windows) ##### Code change log - Activate oneagent in few AKS regions (koreacentral,norwayeast) - Disable syslog diff --git a/build/version b/build/version index 711a96921..2da3efa39 100644 --- a/build/version +++ b/build/version @@ -2,11 +2,11 @@ # Build Version Information -CONTAINER_BUILDVERSION_MAJOR=12 +CONTAINER_BUILDVERSION_MAJOR=13 CONTAINER_BUILDVERSION_MINOR=0 CONTAINER_BUILDVERSION_PATCH=0 CONTAINER_BUILDVERSION_BUILDNR=0 -CONTAINER_BUILDVERSION_DATE=20210111 +CONTAINER_BUILDVERSION_DATE=20210223 CONTAINER_BUILDVERSION_STATUS=Developer_Build #-------------------------------- End of File ----------------------------------- diff --git a/charts/azuremonitor-containers/Chart.yaml b/charts/azuremonitor-containers/Chart.yaml index a809a4e69..ce64fd1ce 100644 --- a/charts/azuremonitor-containers/Chart.yaml +++ b/charts/azuremonitor-containers/Chart.yaml @@ -2,7 +2,7 @@ apiVersion: v1 appVersion: 7.0.0-1 description: Helm chart for deploying Azure Monitor container monitoring agent in Kubernetes name: azuremonitor-containers -version: 2.8.0 +version: 2.8.1 kubeVersion: "^1.10.0-0" keywords: - monitoring diff --git a/charts/azuremonitor-containers/values.yaml b/charts/azuremonitor-containers/values.yaml index 5601a5738..410f5d3c2 100644 --- a/charts/azuremonitor-containers/values.yaml +++ b/charts/azuremonitor-containers/values.yaml @@ -15,10 +15,10 @@ Azure: omsagent: image: repo: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod" - tag: "ciprod01112021" - tagWindows: "win-ciprod01112021" + tag: "ciprod02232021" + tagWindows: "win-ciprod02232021" pullPolicy: IfNotPresent - dockerProviderVersion: "12.0.0-0" + dockerProviderVersion: "13.0.0-0" agentVersion: "1.10.0.1" # The priority used by the omsagent priority class for the daemonset pods diff --git a/kubernetes/linux/Dockerfile b/kubernetes/linux/Dockerfile index 2e1118922..bee718a31 100644 --- 
a/kubernetes/linux/Dockerfile +++ b/kubernetes/linux/Dockerfile @@ -2,7 +2,7 @@ FROM ubuntu:18.04 MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="Azure Monitor for containers" -ARG IMAGE_TAG=ciprod01112021 +ARG IMAGE_TAG=ciprod02232021 ENV AGENT_VERSION ${IMAGE_TAG} ENV tmpdir /opt ENV APPLICATIONINSIGHTS_AUTH NzAwZGM5OGYtYTdhZC00NThkLWI5NWMtMjA3ZjM3NmM3YmRi diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 67bd9cdde..cafd9b904 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -358,7 +358,7 @@ spec: tier: node annotations: agentVersion: "1.10.0.1" - dockerProviderVersion: "12.0.0-0" + dockerProviderVersion: "13.0.0-0" schema-versions: "v1" spec: serviceAccountName: omsagent @@ -368,7 +368,7 @@ spec: value: "3" containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod01112021" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod02232021" imagePullPolicy: IfNotPresent resources: limits: @@ -521,13 +521,13 @@ spec: rsName: "omsagent-rs" annotations: agentVersion: "1.10.0.1" - dockerProviderVersion: "12.0.0-0" + dockerProviderVersion: "13.0.0-0" schema-versions: "v1" spec: serviceAccountName: omsagent containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod01112021" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod02232021" imagePullPolicy: IfNotPresent resources: limits: @@ -675,7 +675,7 @@ spec: tier: node-win annotations: agentVersion: "1.10.0.1" - dockerProviderVersion: "12.0.0-0" + dockerProviderVersion: "13.0.0-0" schema-versions: "v1" spec: serviceAccountName: omsagent @@ -685,7 +685,7 @@ spec: value: "3" containers: - name: omsagent-win - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod01112021" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod02232021" imagePullPolicy: IfNotPresent resources: limits: diff --git a/kubernetes/windows/Dockerfile b/kubernetes/windows/Dockerfile index f852bd236..d4f118449 100644 --- a/kubernetes/windows/Dockerfile +++ b/kubernetes/windows/Dockerfile @@ -3,7 +3,7 @@ MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="Azure Monitor for containers" -ARG IMAGE_TAG=win-ciprod01112021 +ARG IMAGE_TAG=win-ciprod02232021 # Do not split this into multiple RUN! # Docker creates a layer for every RUN-Statement diff --git a/scripts/onboarding/managed/enable-monitoring.ps1 b/scripts/onboarding/managed/enable-monitoring.ps1 index 45ddb44b0..db035b13d 100644 --- a/scripts/onboarding/managed/enable-monitoring.ps1 +++ b/scripts/onboarding/managed/enable-monitoring.ps1 @@ -64,7 +64,7 @@ $isUsingServicePrincipal = $false # released chart version in mcr $mcr = "mcr.microsoft.com" -$mcrChartVersion = "2.8.0" +$mcrChartVersion = "2.8.1" $mcrChartRepoPath = "azuremonitor/containerinsights/preview/azuremonitor-containers" $helmLocalRepoName = "." 
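# Annotation (assumed composition): with these values the onboarding script
# would pull the chart from an OCI registry path along the lines of
#   mcr.microsoft.com/azuremonitor/containerinsights/preview/azuremonitor-containers:2.8.1
# i.e. $mcr/$mcrChartRepoPath:$mcrChartVersion, matching the
# HELM_EXPERIMENTAL_OCI-style `helm chart pull` used by the e2e helm utility
# later in this series.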
$omsAgentDomainName="opinsights.azure.com" diff --git a/scripts/onboarding/managed/enable-monitoring.sh b/scripts/onboarding/managed/enable-monitoring.sh index 2dc0a465f..9d0c0aca5 100644 --- a/scripts/onboarding/managed/enable-monitoring.sh +++ b/scripts/onboarding/managed/enable-monitoring.sh @@ -44,7 +44,7 @@ defaultAzureCloud="AzureCloud" omsAgentDomainName="opinsights.azure.com" # released chart version in mcr -mcrChartVersion="2.8.0" +mcrChartVersion="2.8.1" mcr="mcr.microsoft.com" mcrChartRepoPath="azuremonitor/containerinsights/preview/azuremonitor-containers" helmLocalRepoName="." diff --git a/scripts/onboarding/managed/upgrade-monitoring.sh b/scripts/onboarding/managed/upgrade-monitoring.sh index 8826b6df6..6d14dfa5f 100644 --- a/scripts/onboarding/managed/upgrade-monitoring.sh +++ b/scripts/onboarding/managed/upgrade-monitoring.sh @@ -20,7 +20,7 @@ set -e set -o pipefail # released chart version for Azure Arc enabled Kubernetes public preview -mcrChartVersion="2.8.0" +mcrChartVersion="2.8.1" mcr="mcr.microsoft.com" mcrChartRepoPath="azuremonitor/containerinsights/preview/azuremonitor-containers" From 31f0e5f50f63e08c70dab8a0e78a804b4a09e8bd Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 23 Feb 2021 15:38:14 -0800 Subject: [PATCH 074/194] Gangams/e2e test framework (#503) * add agent e2e fw and tests * doc and script updates * add validation script * doc updates * yaml updates * fix typo * doc updates * more doc updates * add ISTEST for helm chart to use arc conf * refactor test code * fix pr feedback * fix pr feedback * fix pr feedback * fix pr feedback --- .../update-place-holdres-in-e2e-tests.sh | 35 ++ .pipelines/validate-e2e-tests-results.sh | 71 +++ README.md | 31 ++ .../templates/omsagent-daemonset.yaml | 2 + .../templates/omsagent-deployment.yaml | 4 +- charts/azuremonitor-containers/values.yaml | 4 + kubernetes/omsagent.yaml | 6 + source/plugins/ruby/in_kube_events.rb | 4 + source/plugins/ruby/in_kube_nodes.rb | 21 + source/plugins/ruby/in_kube_podinventory.rb | 12 + source/plugins/ruby/in_kube_pvinventory.rb | 5 +- test/e2e/e2e-tests.yaml | 178 ++++++++ test/e2e/src/common/arm_rest_utility.py | 25 + test/e2e/src/common/constants.py | 119 +++++ test/e2e/src/common/helm_utility.py | 68 +++ .../common/kubernetes_configmap_utility.py | 8 + test/e2e/src/common/kubernetes_crd_utility.py | 27 ++ .../common/kubernetes_daemonset_utility.py | 36 ++ .../common/kubernetes_deployment_utility.py | 38 ++ .../common/kubernetes_namespace_utility.py | 32 ++ .../e2e/src/common/kubernetes_node_utility.py | 12 + test/e2e/src/common/kubernetes_pod_utility.py | 65 +++ .../src/common/kubernetes_secret_utility.py | 26 ++ .../src/common/kubernetes_service_utility.py | 19 + .../src/common/kubernetes_version_utility.py | 9 + test/e2e/src/common/results_utility.py | 24 + test/e2e/src/core/Dockerfile | 17 + test/e2e/src/core/conftest.py | 90 ++++ test/e2e/src/core/e2e_tests.sh | 26 ++ test/e2e/src/core/helper.py | 429 ++++++++++++++++++ test/e2e/src/core/pytest.ini | 4 + test/e2e/src/tests/test_ds_workflows.py | 60 +++ test/e2e/src/tests/test_e2e_workflows.py | 330 ++++++++++++++ .../tests/test_node_metrics_e2e_workflow.py | 420 +++++++++++++++++ .../tests/test_pod_metrics_e2e_workflow.py | 134 ++++++ test/e2e/src/tests/test_resource_status.py | 43 ++ test/e2e/src/tests/test_rs_workflows.py | 93 ++++ 37 files changed, 2525 insertions(+), 2 deletions(-) create mode 100755 .pipelines/update-place-holdres-in-e2e-tests.sh create mode 100644 .pipelines/validate-e2e-tests-results.sh create 
mode 100644 test/e2e/e2e-tests.yaml create mode 100644 test/e2e/src/common/arm_rest_utility.py create mode 100644 test/e2e/src/common/constants.py create mode 100644 test/e2e/src/common/helm_utility.py create mode 100644 test/e2e/src/common/kubernetes_configmap_utility.py create mode 100644 test/e2e/src/common/kubernetes_crd_utility.py create mode 100644 test/e2e/src/common/kubernetes_daemonset_utility.py create mode 100644 test/e2e/src/common/kubernetes_deployment_utility.py create mode 100644 test/e2e/src/common/kubernetes_namespace_utility.py create mode 100644 test/e2e/src/common/kubernetes_node_utility.py create mode 100644 test/e2e/src/common/kubernetes_pod_utility.py create mode 100644 test/e2e/src/common/kubernetes_secret_utility.py create mode 100644 test/e2e/src/common/kubernetes_service_utility.py create mode 100644 test/e2e/src/common/kubernetes_version_utility.py create mode 100644 test/e2e/src/common/results_utility.py create mode 100644 test/e2e/src/core/Dockerfile create mode 100644 test/e2e/src/core/conftest.py create mode 100644 test/e2e/src/core/e2e_tests.sh create mode 100755 test/e2e/src/core/helper.py create mode 100644 test/e2e/src/core/pytest.ini create mode 100755 test/e2e/src/tests/test_ds_workflows.py create mode 100755 test/e2e/src/tests/test_e2e_workflows.py create mode 100755 test/e2e/src/tests/test_node_metrics_e2e_workflow.py create mode 100755 test/e2e/src/tests/test_pod_metrics_e2e_workflow.py create mode 100755 test/e2e/src/tests/test_resource_status.py create mode 100755 test/e2e/src/tests/test_rs_workflows.py diff --git a/.pipelines/update-place-holdres-in-e2e-tests.sh b/.pipelines/update-place-holdres-in-e2e-tests.sh new file mode 100755 index 000000000..5fec73684 --- /dev/null +++ b/.pipelines/update-place-holdres-in-e2e-tests.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +echo "start: update placeholders of e2e-tests.yaml ..." + +for ARGUMENT in "$@" +do + KEY=$(echo $ARGUMENT | cut -f1 -d=) + VALUE=$(echo $ARGUMENT | cut -f2 -d=) + + case "$KEY" in + TENANT_ID) TENANT_ID=$VALUE ;; + *) + esac +done + +echo "start: read appid and appsecret" +# used the same SP which used for acr +CLIENT_ID=$(cat ~/acrappid) +CLIENT_SECRET=$(cat ~/acrappsecret) +echo "end: read appid and appsecret" + +echo "Service Principal CLIENT_ID:$CLIENT_ID" +echo "replace CLIENT_ID value" +sed -i "s=SP_CLIENT_ID_VALUE=$CLIENT_ID=g" e2e-tests.yaml + +# only uncomment for debug purpose +# echo "Service Principal CLIENT_SECRET:$CLIENT_SECRET" +echo "replace CLIENT_SECRET value" +sed -i "s=SP_CLIENT_SECRET_VALUE=$CLIENT_SECRET=g" e2e-tests.yaml + +echo "Service Principal TENANT_ID:$TENANT_ID" +echo "replace TENANT_ID value" +sed -i "s=SP_TENANT_ID_VALUE=$TENANT_ID=g" e2e-tests.yaml + +echo "end: update placeholders of e2e-tests.yaml." diff --git a/.pipelines/validate-e2e-tests-results.sh b/.pipelines/validate-e2e-tests-results.sh new file mode 100644 index 000000000..c38fa0f50 --- /dev/null +++ b/.pipelines/validate-e2e-tests-results.sh @@ -0,0 +1,71 @@ +#!/bin/bash +echo "start: validating results of e2e-tests ..." 
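# Annotation (flow sketch of the script below): it pins a sonobuoy CLI version,
# installs the binary to /usr/local/bin, then polls `sonobuoy status` once a
# minute (up to DEFAULT_TIME_OUT_IN_MINS) until the run reports "completed";
# a "failed" status flips IsSucceeded so the script exits non-zero and the
# calling pipeline stage fails.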
+DEFAULT_SONOBUOY_VERSION="0.20.0" +DEFAULT_TIME_OUT_IN_MINS=60 +for ARGUMENT in "$@" +do + KEY=$(echo $ARGUMENT | cut -f1 -d=) + VALUE=$(echo $ARGUMENT | cut -f2 -d=) + + case "$KEY" in + SONOBUOY_VERSION) SONOBUOY_VERSION=$VALUE ;; + *) + esac +done + +if [ -z $SONOBUOY_VERSION ]; then + SONOBUOY_VERSION=$DEFAULT_SONOBUOY_VERSION +fi + +echo "sonobuoy version: ${SONOBUOY_VERSION}" + +echo "start: downloading sonobuoy" +curl -LO https://github.com/vmware-tanzu/sonobuoy/releases/download/v${SONOBUOY_VERSION}/sonobuoy_${SONOBUOY_VERSION}_linux_amd64.tar.gz +echo "end: downloading sonobuoy" + +echo "start: extract sonobuoy tar file" +mkdir -p sonobuoy-install/ +tar -zxf sonobuoy_${SONOBUOY_VERSION}_*.tar.gz -C sonobuoy-install/ +echo "end: extract sonobuoy tar file" + +echo "start: move sonobuoy binaries to /usr/local/bin/" +mv -f sonobuoy-install/sonobuoy /usr/local/bin/ +echo "end: move sonobuoy binaries to /usr/local/bin/" + +rm -rf sonobuoy_${SONOBUOY_VERSION}_*.tar.gz sonobuoy-install/ + +results=$(sonobuoy retrieve) +mins=0 +IsSucceeded=true +while [ $mins -le $DEFAULT_TIME_OUT_IN_MINS ] +do + # check the status + echo "checking test status" + status=$(sonobuoy status) + status=$(echo $status | sed 's/`//g') + if [[ $status == *"completed"* ]]; then + echo "test run completed" + mins=$DEFAULT_TIME_OUT_IN_MINS + if [[ $status == *"failed"* ]]; then + IsSucceeded=false + fi + else + echo "sleep for 1m to check the status again" + sleep 1m + fi + mins=$(( $mins + 1 )) +done +echo "status:${IsSucceeded}" + +results=$(sonobuoy retrieve) +sonobuoy results $results + +if $IsSucceeded == true; then + echo "all test passed" + exit 0 +else + echo "tests are failed. please review the results by downloading tar file via sonobuoy retrieve command" + exit 1 +fi + +echo "end: validating results of e2e-tests ..." diff --git a/README.md b/README.md index 3eec1f344..3564345ee 100644 --- a/README.md +++ b/README.md @@ -91,6 +91,7 @@ The general directory structure is: │ │ | ... - plugins in, out and filters code in ruby │ ├── toml-parser/ - code for parsing of toml configuration files ├── test/ - source code for tests +│ ├── e2e/ - e2e tests to validate agent and e2e workflow(s) │ ├── unit-tests/ - unit tests code │ ├── scenario/ - scenario tests code ├── !_README.md - this file @@ -271,6 +272,36 @@ For DEV and PROD branches, automatically deployed latest yaml with latest agent # E2E Tests +## For executing tests + +1. Deploy the omsagent.yaml with your agent image. In the yaml, make sure `ISTEST` environment variable set to `true` if its not set already +2. Update the Service Principal CLIENT_ID, CLIENT_SECRET and TENANT_ID placeholder values and apply e2e-tests.yaml to execute the tests + > Note: Service Principal requires reader role on log analytics workspace and cluster resource to query LA and metrics + ``` + cd ~/Docker-Provider/test/e2e # based on your repo path + kubectl apply -f e2e-tests.yaml # this will trigger job to run the tests in sonobuoy namespace + kubectl get po -n sonobuoy # to check the pods and jobs associated to tests + ``` +3. Download (sonobuoy)[https://github.com/vmware-tanzu/sonobuoy/releases] on your dev box to view the results of the tests + ``` + results=$(sonobuoy retrieve) # downloads tar file which has logs and test results + sonobuoy results $results # get the summary of the results + tar -xzvf # extract downloaded tar file and look for pod logs, results and other k8s resources if there are any failures + ``` + +## For adding new tests + +1. 
Add the test python file with your test code under `tests` directory +2. Build the docker image, recommended to use ACR & MCR + ``` + cd ~/Docker-Provider/test/e2e/src # based on your repo path + docker login -u -p # login to acr + docker build -f ./core/Dockerfile -t /: . + docker push /: + ``` +3. update existing agentest image tag in e2e-tests.yaml with newly built image tag with MCR repo + +# Scenario Tests Clusters are used in release pipeline already has the yamls under test\scenario deployed. Make sure to validate these scenarios. If you have new interesting scenarios, please add/update them. diff --git a/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml b/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml index 0272c6263..615cd0485 100644 --- a/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml @@ -89,6 +89,8 @@ spec: - name: FBIT_TAIL_BUFFER_MAX_SIZE value: {{ .Values.omsagent.logsettings.tailbufmaxsizemegabytes | quote }} {{- end }} + - name: ISTEST + value: {{ .Values.omsagent.ISTEST | quote }} securityContext: privileged: true ports: diff --git a/charts/azuremonitor-containers/templates/omsagent-deployment.yaml b/charts/azuremonitor-containers/templates/omsagent-deployment.yaml index ecd0b705b..012dd2720 100644 --- a/charts/azuremonitor-containers/templates/omsagent-deployment.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-deployment.yaml @@ -72,7 +72,9 @@ spec: value: {{ .Values.Azure.Extension.Name | quote }} {{- end }} - name: USER_ASSIGNED_IDENTITY_CLIENT_ID - value: "" + value: "" + - name: ISTEST + value: {{ .Values.omsagent.ISTEST | quote }} securityContext: privileged: true ports: diff --git a/charts/azuremonitor-containers/values.yaml b/charts/azuremonitor-containers/values.yaml index 410f5d3c2..5831c9889 100644 --- a/charts/azuremonitor-containers/values.yaml +++ b/charts/azuremonitor-containers/values.yaml @@ -33,6 +33,10 @@ omsagent: # chance to build pod for the node and give it to the scheduler) # Should be some number greater than default (0) priority: 10 + + # This used for running agent pods in test mode. + # if set to true additional agent workflow logs will be emitted which are used for e2e and arc k8s conformance testing + ISTEST: false ## To get your workspace id and key do the following ## You can create a Azure Loganalytics workspace from portal.azure.com and get its ID & PRIMARY KEY from 'Advanced Settings' tab in the Ux. diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index cafd9b904..ebf0257af 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -383,6 +383,9 @@ spec: value: "VALUE_AKS_RESOURCE_ID_VALUE" - name: AKS_REGION value: "VALUE_AKS_RESOURCE_REGION_VALUE" + # this used for e2e test and setting this just emits some additional log statements which used for the e2e tests + - name: ISTEST + value: "true" #Uncomment below two lines for ACS clusters and set the cluster names manually. Also comment out the above two lines for ACS clusters #- name: ACS_RESOURCE_NAME # value: "my_acs_cluster_name" @@ -541,6 +544,9 @@ spec: value: "VALUE_AKS_RESOURCE_ID_VALUE" - name: AKS_REGION value: "VALUE_AKS_RESOURCE_REGION_VALUE" + # this used for e2e test and setting this just emits some additional log statements which used for the e2e tests + - name: ISTEST + value: "true" # Uncomment below two lines for ACS clusters and set the cluster names manually. 
Also comment out the above two lines for ACS clusters #- name: ACS_RESOURCE_NAME # value: "my_acs_cluster_name" diff --git a/source/plugins/ruby/in_kube_events.rb b/source/plugins/ruby/in_kube_events.rb index 4f6017cc5..f50019a01 100644 --- a/source/plugins/ruby/in_kube_events.rb +++ b/source/plugins/ruby/in_kube_events.rb @@ -129,6 +129,7 @@ def enumerate def parse_and_emit_records(events, eventQueryState, newEventQueryState, batchTime = Time.utc.iso8601) currentTime = Time.now emitTime = currentTime.to_f + @@istestvar = ENV["ISTEST"] begin eventStream = MultiEventStream.new events["items"].each do |items| @@ -171,6 +172,9 @@ def parse_and_emit_records(events, eventQueryState, newEventQueryState, batchTim @eventsCount += 1 end router.emit_stream(@tag, eventStream) if eventStream + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) + $log.info("kubeEventsInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end rescue => errorStr $log.debug_backtrace(errorStr.backtrace) ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index 0a4727077..c803c0fa2 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -188,6 +188,9 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) $log.info("in_kube_node::parse_and_emit_records: number of container node inventory records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") router.emit_stream(@@ContainerNodeInventoryTag, containerNodeInventoryEventStream) if containerNodeInventoryEventStream containerNodeInventoryEventStream = MultiEventStream.new + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) + $log.info("containerNodeInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end end # node metrics records @@ -217,6 +220,9 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) $log.info("in_kube_nodes::parse_and_emit_records: number of node perf metric records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") router.emit_stream(@@kubeperfTag, kubePerfEventStream) if kubePerfEventStream kubePerfEventStream = MultiEventStream.new + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) + $log.info("kubeNodePerfEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end end # node GPU metrics record @@ -249,6 +255,9 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) $log.info("in_kube_nodes::parse_and_emit_records: number of GPU node perf metric records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream insightsMetricsEventStream = MultiEventStream.new + if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) + $log.info("kubeNodeInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end end # Adding telemetry to send node telemetry every 10 minutes timeDifference = (DateTime.now.to_time.to_i - @@nodeTelemetryTimeTracker).abs @@ -300,23 +309,35 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) router.emit_stream(@tag, eventStream) if eventStream $log.info("in_kube_node::parse_and_emit_records: number of mdm node inventory records emitted #{eventStream.count} @ #{Time.now.utc.iso8601}") router.emit_stream(@@MDMKubeNodeInventoryTag, eventStream) if eventStream + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) + $log.info("kubeNodeInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end eventStream = nil end if containerNodeInventoryEventStream.count > 0 $log.info("in_kube_node::parse_and_emit_records: number of container node inventory records emitted #{containerNodeInventoryEventStream.count} @ #{Time.now.utc.iso8601}") router.emit_stream(@@ContainerNodeInventoryTag, containerNodeInventoryEventStream) if containerNodeInventoryEventStream containerNodeInventoryEventStream = nil + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) + $log.info("containerNodeInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end end if kubePerfEventStream.count > 0 $log.info("in_kube_nodes::parse_and_emit_records: number of node perf metric records emitted #{kubePerfEventStream.count} @ #{Time.now.utc.iso8601}") router.emit_stream(@@kubeperfTag, kubePerfEventStream) if kubePerfEventStream kubePerfEventStream = nil + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) + $log.info("kubeNodePerfInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end end if insightsMetricsEventStream.count > 0 $log.info("in_kube_nodes::parse_and_emit_records: number of GPU node perf metric records emitted #{insightsMetricsEventStream.count} @ #{Time.now.utc.iso8601}") router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream insightsMetricsEventStream = nil + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) + $log.info("kubeNodeInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end end rescue => errorStr $log.warn "Failed to retrieve node inventory: #{errorStr}" diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 0cff2eefe..5256eb159 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -265,6 +265,9 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && kubePerfEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE $log.info("in_kube_podinventory::parse_and_emit_records: number of container perf records emitted #{@PODS_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") router.emit_stream(@@kubeperfTag, kubePerfEventStream) if kubePerfEventStream + if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) + $log.info("kubeContainerPerfEventEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end kubePerfEventStream = MultiEventStream.new end @@ -306,6 +309,9 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc $log.info("in_kube_podinventory::parse_and_emit_records: number of perf records emitted #{kubePerfEventStream.count} @ #{Time.now.utc.iso8601}") router.emit_stream(@@kubeperfTag, kubePerfEventStream) if kubePerfEventStream kubePerfEventStream = nil + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) + $log.info("kubeContainerPerfEventEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end end if insightsMetricsEventStream.count > 0 @@ -345,6 +351,9 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc $log.info("in_kube_podinventory::parse_and_emit_records: number of service records emitted #{@PODS_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") router.emit_stream(@@kubeservicesTag, kubeServicesEventStream) if kubeServicesEventStream kubeServicesEventStream = MultiEventStream.new + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) + $log.info("kubeServicesEventEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end end end end @@ -352,6 +361,9 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc if kubeServicesEventStream.count > 0 $log.info("in_kube_podinventory::parse_and_emit_records : number of service records emitted #{kubeServicesEventStream.count} @ #{Time.now.utc.iso8601}") router.emit_stream(@@kubeservicesTag, kubeServicesEventStream) if kubeServicesEventStream + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) + $log.info("kubeServicesEventEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end end kubeServicesEventStream = nil end diff --git a/source/plugins/ruby/in_kube_pvinventory.rb b/source/plugins/ruby/in_kube_pvinventory.rb index 861b3a8e1..4efe86f61 100644 --- a/source/plugins/ruby/in_kube_pvinventory.rb +++ b/source/plugins/ruby/in_kube_pvinventory.rb @@ -106,7 +106,7 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) currentTime = Time.now emitTime = currentTime.to_f eventStream = MultiEventStream.new - + @@istestvar = ENV["ISTEST"] begin records = [] pvInventory["items"].each do |item| @@ -156,6 +156,9 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) end router.emit_stream(@tag, eventStream) if eventStream + if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) + $log.info("kubePVInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end rescue => errorStr $log.warn "Failed in parse_and_emit_record for in_kube_pvinventory: #{errorStr}" diff --git a/test/e2e/e2e-tests.yaml b/test/e2e/e2e-tests.yaml new file mode 100644 index 000000000..06dfa1fb0 --- /dev/null +++ b/test/e2e/e2e-tests.yaml @@ -0,0 +1,178 @@ + +--- +apiVersion: v1 +kind: Namespace +metadata: + name: sonobuoy +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + labels: + component: sonobuoy + name: sonobuoy-serviceaccount + namespace: sonobuoy +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + component: sonobuoy + namespace: sonobuoy + name: sonobuoy-serviceaccount-sonobuoy +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: sonobuoy-serviceaccount-sonobuoy +subjects: +- kind: ServiceAccount + name: sonobuoy-serviceaccount + namespace: sonobuoy +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + component: sonobuoy + namespace: sonobuoy + name: sonobuoy-serviceaccount-sonobuoy +rules: +- apiGroups: + - '*' + resources: + - '*' + verbs: + - '*' +- nonResourceURLs: + - '/metrics' + - '/logs' + - '/logs/*' + verbs: + - 'get' +--- +apiVersion: v1 +data: + config.json: | + {"Description":"DEFAULT","UUID":"bf5c02ed-1948-48f1-b12d-5a2d74435e46","Version":"v0.20.0","ResultsDir":"/tmp/sonobuoy","Resources":["apiservices","certificatesigningrequests","clusterrolebindings","clusterroles","componentstatuses","configmaps","controllerrevisions","cronjobs","customresourcedefinitions","daemonsets","deployments","endpoints","ingresses","jobs","leases","limitranges","mutatingwebhookconfigurations","namespaces","networkpolicies","nodes","persistentvolumeclaims","persistentvolumes","poddisruptionbudgets","pods","podlogs","podsecuritypolicies","podtemplates","priorityclasses","replicasets","replicationcontrollers","resourcequotas","rolebindings","roles","servergroups","serverversion","serviceaccounts","services","statefulsets","storageclasses","validatingwebhookconfigurations","volumeattachments"],"Filters":{"Namespaces":".*","LabelSelector":""},"Limits":{"PodLogs":{"Namespaces":"","SonobuoyNamespace":true,"FieldSelectors":[],"LabelSelector":"","Previous":false,"SinceSeconds":null,"SinceTime":null,"Timestamps":false,"TailLines":null,"LimitBytes":null,"LimitSize":"","LimitTime":""}},"QPS":30,"Burst":50,"Server":{"bindaddress":"0.0.0.0","bindport":8080,"advertiseaddress":"","timeoutseconds":10800},"Plugins":null,"PluginSearchPath":["./plugins.d","/etc/sonobuoy/plugins.d","~/sonobuoy/plugins.d"],"Namespace":"sonobuoy","WorkerImage":"sonobuoy/sonobuoy:v0.20.0","ImagePullPolicy":"IfNotPresent","ImagePullSecrets":"","ProgressUpdatesPort":"8099"} +kind: ConfigMap +metadata: + labels: + component: sonobuoy + name: sonobuoy-config-cm + namespace: sonobuoy +--- +apiVersion: v1 +data: + plugin-0.yaml: | + podSpec: + containers: [] + restartPolicy: Never + serviceAccountName: sonobuoy-serviceaccount + nodeSelector: + kubernetes.io/os: linux + tolerations: + - effect: NoSchedule + key: node-role.kubernetes.io/master + operator: Exists + - key: CriticalAddonsOnly + operator: Exists + - key: kubernetes.io/e2e-evict-taint-key + operator: Exists + sonobuoy-config: + driver: Job + plugin-name: agenttests + result-format: junit + spec: + env: + # Update values of CLIENT_ID, CLIENT_SECRET of the service principal which has permission to query LA ad Metrics API + # Update value 
of TENANT_ID corresponding your Azure Service principal + - name: CLIENT_ID + value: "SP_CLIENT_ID_VALUE" + - name: CLIENT_SECRET + value: "CLIENT_SECRET_VALUE" + - name: TENANT_ID + value: "SP_TENANT_ID_VALUE" + - name: DEFAULT_QUERY_TIME_INTERVAL_IN_MINUTES + value: "10" + - name: DEFAULT_METRICS_QUERY_TIME_INTERVAL_IN_MINUTES + value: "10" + - name: AGENT_POD_EXPECTED_RESTART_COUNT + value: "0" + - name: AZURE_CLOUD + value: "AZURE_PUBLIC_CLOUD" + # image tag should be updated if new tests being added after this image + image: mcr.microsoft.com/azuremonitor/containerinsights/cidev:ciagenttest02152021 + imagePullPolicy: IfNotPresent + name: plugin + resources: {} + volumeMounts: + - mountPath: /tmp/results + name: results +kind: ConfigMap +metadata: + labels: + component: sonobuoy + name: sonobuoy-plugins-cm + namespace: sonobuoy +--- +apiVersion: v1 +kind: Pod +metadata: + labels: + component: sonobuoy + run: sonobuoy-master + sonobuoy-component: aggregator + tier: analysis + name: sonobuoy + namespace: sonobuoy +spec: + containers: + - env: + - name: SONOBUOY_ADVERTISE_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + image: sonobuoy/sonobuoy:v0.20.0 + imagePullPolicy: IfNotPresent + name: kube-sonobuoy + volumeMounts: + - mountPath: /etc/sonobuoy + name: sonobuoy-config-volume + - mountPath: /plugins.d + name: sonobuoy-plugins-volume + - mountPath: /tmp/sonobuoy + name: output-volume + restartPolicy: Never + serviceAccountName: sonobuoy-serviceaccount + nodeSelector: + kubernetes.io/os: linux + tolerations: + - key: "kubernetes.io/e2e-evict-taint-key" + operator: "Exists" + volumes: + - configMap: + name: sonobuoy-config-cm + name: sonobuoy-config-volume + - configMap: + name: sonobuoy-plugins-cm + name: sonobuoy-plugins-volume + - emptyDir: {} + name: output-volume +--- +apiVersion: v1 +kind: Service +metadata: + labels: + component: sonobuoy + sonobuoy-component: aggregator + name: sonobuoy-aggregator + namespace: sonobuoy +spec: + ports: + - port: 8080 + protocol: TCP + targetPort: 8080 + selector: + sonobuoy-component: aggregator + type: ClusterIP + diff --git a/test/e2e/src/common/arm_rest_utility.py b/test/e2e/src/common/arm_rest_utility.py new file mode 100644 index 000000000..604f8b791 --- /dev/null +++ b/test/e2e/src/common/arm_rest_utility.py @@ -0,0 +1,25 @@ +import adal +import pytest + +from msrestazure.azure_active_directory import AADTokenCredentials + + +# Function to fetch aad token from spn id and password +def fetch_aad_token(client_id, client_secret, authority_uri, resource_uri): + """ + Authenticate using service principal w/ key. 
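    Annotation (sketch of intended usage): wraps ADAL's client-credentials
    flow; authority_uri is the AAD login endpoint for the target cloud (see
    AZURE_CLOUD_DICT in constants.py) and resource_uri is the audience being
    called, e.g. ARM or Log Analytics.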
+ """ + try: + context = adal.AuthenticationContext(authority_uri, api_version=None) + return context.acquire_token_with_client_credentials(resource_uri, client_id, client_secret) + except Exception as e: + pytest.fail("Error occured while fetching aad token: " + str(e)) + + +# Function that returns aad token credentials for a given spn +def fetch_aad_token_credentials(client_id, client_secret, authority_uri, resource_uri): + mgmt_token = fetch_aad_token(client_id, client_secret, authority_uri, resource_uri) + try: + return AADTokenCredentials(mgmt_token, client_id) + except Exception as e: + pytest.fail("Error occured while fetching credentials: " + str(e)) diff --git a/test/e2e/src/common/constants.py b/test/e2e/src/common/constants.py new file mode 100644 index 000000000..770964cb5 --- /dev/null +++ b/test/e2e/src/common/constants.py @@ -0,0 +1,119 @@ +AZURE_PUBLIC_CLOUD_ENDPOINTS = { + "activeDirectory": "https://login.microsoftonline.com/", + "activeDirectoryDataLakeResourceId": "https://datalake.azure.net/", + "activeDirectoryGraphResourceId": "https://graph.windows.net/", + "activeDirectoryResourceId": "https://management.core.windows.net/", + "appInsights": "https://api.applicationinsights.io", + "appInsightsTelemetryChannel": "https://dc.applicationinsights.azure.com/v2/track", + "batchResourceId": "https://batch.core.windows.net/", + "gallery": "https://gallery.azure.com/", + "logAnalytics": "https://api.loganalytics.io", + "management": "https://management.core.windows.net/", + "mediaResourceId": "https://rest.media.azure.net", + "microsoftGraphResourceId": "https://graph.microsoft.com/", + "ossrdbmsResourceId": "https://ossrdbms-aad.database.windows.net", + "resourceManager": "https://management.azure.com/", + "sqlManagement": "https://management.core.windows.net:8443/", + "vmImageAliasDoc": "https://raw.githubusercontent.com/Azure/azure-rest-api-specs/master/arm-compute/quickstart-templates/aliases.json" +} + +AZURE_DOGFOOD_ENDPOINTS = { + "activeDirectory": "https://login.windows-ppe.net/", + "activeDirectoryDataLakeResourceId": None, + "activeDirectoryGraphResourceId": "https://graph.ppe.windows.net/", + "activeDirectoryResourceId": "https://management.core.windows.net/", + "appInsights": None, + "appInsightsTelemetryChannel": None, + "batchResourceId": None, + "gallery": "https://df.gallery.azure-test.net/", + "logAnalytics": None, + "management": "https://management-preview.core.windows-int.net/", + "mediaResourceId": None, + "microsoftGraphResourceId": None, + "ossrdbmsResourceId": None, + "resourceManager": "https://api-dogfood.resources.windows-int.net/", + "sqlManagement": None, + "vmImageAliasDoc": None +} + +AZURE_CLOUD_DICT = {"AZURE_PUBLIC_CLOUD" : AZURE_PUBLIC_CLOUD_ENDPOINTS, "AZURE_DOGFOOD": AZURE_DOGFOOD_ENDPOINTS} + +TIMEOUT = 300 + +# Azure Monitor for Container Extension related +AGENT_RESOURCES_NAMESPACE = 'kube-system' +AGENT_DEPLOYMENT_NAME = 'omsagent-rs' +AGENT_DAEMONSET_NAME = 'omsagent' +AGENT_WIN_DAEMONSET_NAME = 'omsagent-win' + +AGENT_DEPLOYMENT_PODS_LABEL_SELECTOR = 'rsName=omsagent-rs' +AGENT_DAEMON_SET_PODS_LABEL_SELECTOR = 'component=oms-agent' +AGENT_OMSAGENT_LOG_PATH = '/var/opt/microsoft/omsagent/log/omsagent.log' +AGENT_REPLICASET_WORKFLOWS = ["kubePodInventoryEmitStreamSuccess", "kubeNodeInventoryEmitStreamSuccess"] + +# override this through setting enviornment variable if the expected restart count is > 0 for example applying configmap +AGENT_POD_EXPECTED_RESTART_COUNT = 0 + +# replicaset workflow streams +KUBE_POD_INVENTORY_EMIT_STREAM 
= "kubePodInventoryEmitStreamSuccess" +KUBE_NODE_INVENTORY_EMIT_STREAM = "kubeNodeInventoryEmitStreamSuccess" +KUBE_DEPLOYMENT_INVENTORY_EMIT_STREAM = "kubestatedeploymentsInsightsMetricsEmitStreamSuccess" +KUBE_CONTAINER_PERF_EMIT_STREAM = "kubeContainerPerfEventEmitStreamSuccess" +KUBE_SERVICES_EMIT_STREAM = "kubeServicesEventEmitStreamSuccess" +KUBE_CONTAINER_NODE_INVENTORY_EMIT_STREAM = "containerNodeInventoryEmitStreamSuccess" +KUBE_EVENTS_EMIT_STREAM = "kubeEventsInventoryEmitStreamSuccess" +# daemonset workflow streams +CONTAINER_PERF_EMIT_STREAM = "cAdvisorPerfEmitStreamSuccess" +CONTAINER_INVENTORY_EMIT_STREAM = "containerInventoryEmitStreamSuccess" + +# simple log analytics queries to validate for e2e workflows +DEFAULT_QUERY_TIME_INTERVAL_IN_MINUTES = 10 +KUBE_POD_INVENTORY_QUERY = "KubePodInventory | where TimeGenerated > ago({0}) | count" +KUBE_NODE_INVENTORY_QUERY = "KubeNodeInventory | where TimeGenerated > ago({0}) | count" +KUBE_SERVICES_QUERY = "KubeServices | where TimeGenerated > ago({0}) | count" +KUBE_EVENTS_QUERY = "KubeEvents | where TimeGenerated > ago({0}) | count" +CONTAINER_NODE_INVENTORY_QUERY = "ContainerNodeInventory | where TimeGenerated > ago({0}) | count" +CONTAINER_INVENTORY_QUERY = "ContainerInventory | where TimeGenerated > ago({0}) | count" +# node perf +NODE_PERF_CPU_CAPCITY_QUERY = "Perf | where ObjectName == 'K8SNode' | where CounterName == 'cpuCapacityNanoCores' | where TimeGenerated > ago({0}) | count" +NODE_PERF_MEMORY_CAPCITY_QUERY = "Perf | where ObjectName == 'K8SNode' | where CounterName == 'memoryCapacityBytes' | where TimeGenerated > ago({0}) | count" +NODE_PERF_CPU_ALLOCATABLE_QUERY = "Perf | where ObjectName == 'K8SNode' | where CounterName == 'cpuAllocatableNanoCores' | where TimeGenerated > ago({0}) | count" +NODE_PERF_MEMORY_ALLOCATABLE_QUERY = "Perf | where ObjectName == 'K8SNode' | where CounterName == 'memoryAllocatableBytes' | where TimeGenerated > ago({0}) | count" +NODE_PERF_CPU_USAGE_QUERY = "Perf | where ObjectName == 'K8SNode' | where CounterName == 'cpuUsageNanoCores' | where TimeGenerated > ago({0}) | count" +NODE_PERF_MEMORY_RSS_USAGE_QUERY = "Perf | where ObjectName == 'K8SNode' | where CounterName == 'memoryRssBytes' | where TimeGenerated > ago({0}) | count" +NODE_PERF_MEMORY_WS_USAGE_QUERY = "Perf | where ObjectName == 'K8SNode' | where CounterName =='memoryWorkingSetBytes' | where TimeGenerated > ago({0}) | count" +NODE_PERF_RESTART_TIME_EPOCH_QUERY = "Perf | where ObjectName == 'K8SNode' | where CounterName == 'restartTimeEpoch' | where TimeGenerated > ago({0}) | count" +# container perf +CONTAINER_PERF_CPU_LIMITS_QUERY = "Perf | where ObjectName == 'K8SContainer' | where CounterName == 'cpuLimitNanoCores' | where TimeGenerated > ago({0}) | count" +CONTAINER_PERF_MEMORY_LIMITS_QUERY = "Perf | where ObjectName == 'K8SContainer' | where CounterName == 'memoryLimitBytes' | where TimeGenerated > ago({0}) | count" +CONTAINER_PERF_CPU_REQUESTS_QUERY = "Perf | where ObjectName == 'K8SContainer' | where CounterName == 'cpuRequestNanoCores' | where TimeGenerated > ago({0}) | count" +CONTAINER_PERF_MEMORY_REQUESTS_QUERY = "Perf | where ObjectName == 'K8SContainer' | where CounterName == 'memoryRequestBytes' | where TimeGenerated > ago({0}) | count" +CONTAINER_PERF_CPU_USAGE_QUERY = "Perf | where ObjectName == 'K8SContainer' | where CounterName == 'cpuUsageNanoCores' | where TimeGenerated > ago({0}) | count" +CONTAINER_PERF_MEMORY_RSS_USAGE_QUERY = "Perf | where ObjectName == 'K8SContainer' | where CounterName == 'memoryRssBytes' 
| where TimeGenerated > ago({0}) | count" +CONTAINER_PERF_MEMORY_WS_USAGE_QUERY = "Perf | where ObjectName == 'K8SContainer' | where CounterName == 'memoryWorkingSetBytes' | where TimeGenerated > ago({0}) | count" +CONTAINER_PERF_RESTART_TIME_EPOCH_QUERY = "Perf | where ObjectName == 'K8SContainer' | where CounterName == 'restartTimeEpoch' | where TimeGenerated > ago({0}) | count" +# container log +CONTAINER_LOG_QUERY = "ContainerLog | where TimeGenerated > ago({0}) | count" +# insights metrics +INSIGHTS_METRICS_QUERY = "InsightsMetrics | where TimeGenerated > ago({0}) | count" + +# custom metrics +METRICS_API_VERSION = '2019-07-01' +DEFAULT_METRICS_QUERY_TIME_INTERVAL_IN_MINUTES = 10 + +# node metrics +NODE_METRICS_NAMESPACE = 'insights.container/nodes' +NODE_METRIC_METRIC_AGGREGATION = 'average' +NODE_CPU_USAGE_MILLI_CORES_METRIC_NAME = 'cpuUsageMilliCores' +NODE_CPU_USAGE_PERCENTAGE_METRIC_NAME = 'cpuUsagePercentage' +NODE_MEMORY_RSS_METRIC_NAME = 'memoryRssBytes' +NODE_MEMORY_RSS_PERCENTAGE_METRIC_NAME = 'memoryRssPercentage' +NODE_MEMORY_WS_METRIC_NAME = 'memoryWorkingSetBytes' +NODE_MEMORY_WS_PERCENTAGE_METRIC_NAME = 'memoryWorkingSetPercentage' +NODE_COUNT_METRIC_NAME = 'nodesCount' +NODE_DISK_USAGE_PERCENTAGE_METRIC_NAME = 'diskUsedPercentage(Preview)' + +# pod metrics +POD_METRICS_NAMESPACE = 'insights.container/pods' +POD_METRIC_METRIC_AGGREGATION = 'average' +POD_COUNT_METRIC_NAME = 'PodCount' diff --git a/test/e2e/src/common/helm_utility.py b/test/e2e/src/common/helm_utility.py new file mode 100644 index 000000000..6eac1e071 --- /dev/null +++ b/test/e2e/src/common/helm_utility.py @@ -0,0 +1,68 @@ +import os +import pytest +import subprocess + + +# Function to pull helm charts +def pull_helm_chart(registry_path): + os.environ['HELM_EXPERIMENTAL_OCI'] = '1' + cmd_helm_chart_pull = ["helm", "chart", "pull", registry_path] + response_helm_chart_pull = subprocess.Popen(cmd_helm_chart_pull, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + output_helm_chart_pull, error_helm_chart_pull = response_helm_chart_pull.communicate() + if response_helm_chart_pull.returncode != 0: + pytest.fail("Unable to pull helm chart from the registry '{}': ".format(registry_path) + error_helm_chart_pull.decode("ascii")) + return output_helm_chart_pull.decode("ascii") + + +# Function to export helm charts +def export_helm_chart(registry_path, destination): + cmd_helm_chart_export = ["helm", "chart", "export", registry_path, "--destination", destination] + response_helm_chart_export = subprocess.Popen(cmd_helm_chart_export, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + output_helm_chart_export, error_helm_chart_export = response_helm_chart_export.communicate() + if response_helm_chart_export.returncode != 0: + pytest.fail("Unable to export helm chart from the registry '{}': ".format(registry_path) + error_helm_chart_export.decode("ascii")) + return output_helm_chart_export.decode("ascii") + + +# Function to add a helm repository +def add_helm_repo(repo_name, repo_url): + cmd_helm_repo = ["helm", "repo", "add", repo_name, repo_url] + response_helm_repo = subprocess.Popen(cmd_helm_repo, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + output_helm_repo, error_helm_repo = response_helm_repo.communicate() + if response_helm_repo.returncode != 0: + pytest.fail("Unable to add repository {} to helm: ".format(repo_url) + error_helm_repo.decode("ascii")) + return output_helm_repo.decode("ascii") + + +# Function to install helm charts +def install_helm_chart(helm_release_name, helm_release_namespace, 
helm_chart_path, wait=False, **kwargs):
+    cmd_helm_install = ["helm", "install", helm_release_name, helm_chart_path, "--namespace", helm_release_namespace]
+    if wait:
+        cmd_helm_install.extend(["--wait"])
+    for key, value in kwargs.items():
+        cmd_helm_install.extend(["--set", "{}={}".format(key, value)])
+    response_helm_install = subprocess.Popen(cmd_helm_install, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    output_helm_install, error_helm_install = response_helm_install.communicate()
+    if response_helm_install.returncode != 0:
+        pytest.fail("Unable to install helm release: " + error_helm_install.decode("ascii"))
+    return output_helm_install.decode("ascii")
+
+
+# Function to delete a helm release
+def delete_helm_release(helm_release_name, helm_release_namespace):
+    cmd_helm_delete = ["helm", "delete", helm_release_name, "--namespace", helm_release_namespace]
+    response_helm_delete = subprocess.Popen(cmd_helm_delete, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    output_helm_delete, error_helm_delete = response_helm_delete.communicate()
+    if response_helm_delete.returncode != 0:
+        pytest.fail("Error occurred while deleting the helm release: " + error_helm_delete.decode("ascii"))
+    return output_helm_delete.decode("ascii")
+
+
+# Function to list helm releases
+def list_helm_release(helm_release_namespace):
+    cmd_helm_list = ["helm", "list", "--namespace", helm_release_namespace]
+    response_helm_list = subprocess.Popen(cmd_helm_list, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    output_helm_list, error_helm_list = response_helm_list.communicate()
+    if response_helm_list.returncode != 0:
+        pytest.fail("Error occurred while fetching the helm release: " + error_helm_list.decode("ascii"))
+    return output_helm_list.decode("ascii")
diff --git a/test/e2e/src/common/kubernetes_configmap_utility.py b/test/e2e/src/common/kubernetes_configmap_utility.py
new file mode 100644
index 000000000..caee9628e
--- /dev/null
+++ b/test/e2e/src/common/kubernetes_configmap_utility.py
@@ -0,0 +1,8 @@
+import pytest
+
+
+def get_namespaced_configmap(api_instance, namespace, configmap_name):
+    try:
+        return api_instance.read_namespaced_config_map(configmap_name, namespace)
+    except Exception as e:
+        pytest.fail("Error occurred when retrieving configmap: " + str(e))
diff --git a/test/e2e/src/common/kubernetes_crd_utility.py b/test/e2e/src/common/kubernetes_crd_utility.py
new file mode 100644
index 000000000..f84092878
--- /dev/null
+++ b/test/e2e/src/common/kubernetes_crd_utility.py
@@ -0,0 +1,27 @@
+import pytest
+
+from kubernetes import watch
+
+
+# Function to get the CRD instance
+def get_crd_instance(api_instance, group, version, namespace, plural, crd_name):
+    try:
+        return api_instance.get_namespaced_custom_object(group, version, namespace, plural, crd_name)
+    except Exception as e:
+        pytest.fail("Error occurred when retrieving crd information: " + str(e))
+
+
+# Function that watches events corresponding to a given CRD instance and passes the events to a callback function
+def watch_crd_instance(api_instance, group, version, namespace, plural, crd_name, timeout, callback=None):
+    if not callback:
+        pytest.fail("callback should be specified")
+
+    field_selector = "metadata.name={}".format(crd_name) if crd_name else ""
+    try:
+        w = watch.Watch()
+        for event in w.stream(api_instance.list_namespaced_custom_object, group, version, namespace, plural, field_selector=field_selector, timeout_seconds=timeout):
+            if callback(event):
+                return
+    except Exception as e:
+        pytest.fail("Error occurred when watching crd instance events: " + str(e))
+    pytest.fail("The watch on the crd instance events has timed out.")
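All of the watch utilities in these patches share one contract: the callback receives each raw event and returns True to stop the watch. A minimal, illustrative driver for watch_crd_instance is sketched below; the group/version/plural values and the status check are placeholders, not taken from this patch:

    from kubernetes import client, config

    def wait_for_crd_status(namespace, crd_name):
        # Illustrative only: substitute the real CRD coordinates for these placeholders.
        config.load_incluster_config()
        api_instance = client.CustomObjectsApi()

        def callback(event):
            # Stop watching as soon as the instance reports any status.
            return bool(event['raw_object'].get('status'))

        watch_crd_instance(api_instance, 'clusterconfig.azure.com', 'v1beta1',
                           namespace, 'connectedclusters', crd_name,
                           300, callback)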
diff --git a/test/e2e/src/common/kubernetes_daemonset_utility.py b/test/e2e/src/common/kubernetes_daemonset_utility.py
new file mode 100644
index 000000000..dd76a11d9
--- /dev/null
+++ b/test/e2e/src/common/kubernetes_daemonset_utility.py
@@ -0,0 +1,36 @@
+import pytest
+from kubernetes import watch
+
+# Returns a list of daemon_sets in a given namespace
+def list_daemon_set(api_instance, namespace, field_selector="", label_selector=""):
+    try:
+        return api_instance.list_namespaced_daemon_set(namespace, field_selector=field_selector, label_selector=label_selector)
+    except Exception as e:
+        pytest.fail("Error occurred when retrieving daemon_sets: " + str(e))
+
+# Deletes a daemon_set
+def delete_daemon_set(api_instance, namespace, daemon_set_name):
+    try:
+        return api_instance.delete_namespaced_daemon_set(daemon_set_name, namespace)
+    except Exception as e:
+        pytest.fail("Error occurred when deleting daemon_set: " + str(e))
+
+# Reads a daemon_set
+def read_daemon_set(api_instance, namespace, daemon_set_name):
+    try:
+        return api_instance.read_namespaced_daemon_set(daemon_set_name, namespace)
+    except Exception as e:
+        pytest.fail("Error occurred when reading daemon_set: " + str(e))
+
+# Function that watches events corresponding to daemon_sets in the given namespace and passes the events to a callback function
+def watch_daemon_set_status(api_instance, namespace, timeout, callback=None):
+    if not callback:
+        return
+    try:
+        w = watch.Watch()
+        for event in w.stream(api_instance.list_namespaced_daemon_set, namespace, timeout_seconds=timeout):
+            if callback(event):
+                return
+    except Exception as e:
+        print("Error occurred when checking daemon_set status: " + str(e))
+    print("The watch on the daemon_set status has timed out. Please see the pod logs for more info.")
diff --git a/test/e2e/src/common/kubernetes_deployment_utility.py b/test/e2e/src/common/kubernetes_deployment_utility.py
new file mode 100644
index 000000000..1be7a6b71
--- /dev/null
+++ b/test/e2e/src/common/kubernetes_deployment_utility.py
@@ -0,0 +1,38 @@
+import pytest
+from kubernetes import watch
+
+# Returns a list of deployments in a given namespace
+def list_deployment(api_instance, namespace, field_selector="", label_selector=""):
+    try:
+        return api_instance.list_namespaced_deployment(namespace, field_selector=field_selector, label_selector=label_selector)
+    except Exception as e:
+        pytest.fail("Error occurred when retrieving deployments: " + str(e))
+
+# Deletes a deployment
+def delete_deployment(api_instance, namespace, deployment_name):
+    try:
+        return api_instance.delete_namespaced_deployment(deployment_name, namespace)
+    except Exception as e:
+        pytest.fail("Error occurred when deleting deployment: " + str(e))
+
+
+# Reads a deployment
+def read_deployment(api_instance, namespace, deployment_name):
+    try:
+        return api_instance.read_namespaced_deployment(deployment_name, namespace)
+    except Exception as e:
+        pytest.fail("Error occurred when reading deployment: " + str(e))
+
+# Function that watches events corresponding to deployments in the given namespace and passes the events to a callback function
+def watch_deployment_status(api_instance, namespace, timeout, callback=None):
+    if not callback:
+        return
+    try:
+        w = watch.Watch()
+        for event in w.stream(api_instance.list_namespaced_deployment, namespace, timeout_seconds=timeout):
+            if callback(event):
+                return
+    except Exception as e:
+        print("Error occurred when checking deployment status: " + str(e))
+    print("The watch on the deployment status has timed out. Please see the pod logs for more info.")
+    
\ No newline at end of file
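The daemonset and deployment utilities above share the same list/read/delete/watch shape. As a hedged usage sketch (the namespace and label selector are placeholders for whatever identifies the agent deployment in the target cluster):

    from kubernetes import client, config

    config.load_incluster_config()
    api_instance = client.AppsV1Api()

    # List matching deployments, then block until an event reports all replicas ready.
    deployments = list_deployment(api_instance, 'kube-system', label_selector='rsName=omsagent-rs')
    print([d.metadata.name for d in deployments.items])

    def all_replicas_ready(event):
        status = event['raw_object'].get('status', {})
        return status.get('replicas') is not None and status.get('replicas') == status.get('readyReplicas')

    watch_deployment_status(api_instance, 'kube-system', 300, all_replicas_ready)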
diff --git a/test/e2e/src/common/kubernetes_namespace_utility.py b/test/e2e/src/common/kubernetes_namespace_utility.py
new file mode 100644
index 000000000..cea5788c5
--- /dev/null
+++ b/test/e2e/src/common/kubernetes_namespace_utility.py
@@ -0,0 +1,32 @@
+import pytest
+from kubernetes import watch
+
+
+# Function that watches events corresponding to kubernetes namespaces and passes the events to a callback function
+def watch_namespace(api_instance, timeout, callback=None):
+    if not callback:
+        return
+    try:
+        w = watch.Watch()
+        for event in w.stream(api_instance.list_namespace, timeout_seconds=timeout):
+            if callback(event):
+                return
+    except Exception as e:
+        pytest.fail("Error occurred when checking namespace status: " + str(e))
+    pytest.fail("The watch on the namespaces has timed out.")
+
+
+# Function to list all kubernetes namespaces
+def list_namespace(api_instance):
+    try:
+        return api_instance.list_namespace()
+    except Exception as e:
+        pytest.fail("Error occurred when retrieving namespaces: " + str(e))
+
+
+# Function to delete a kubernetes namespace
+def delete_namespace(api_instance, namespace_name):
+    try:
+        return api_instance.delete_namespace(namespace_name)
+    except Exception as e:
+        pytest.fail("Error occurred when deleting namespace: " + str(e))
diff --git a/test/e2e/src/common/kubernetes_node_utility.py b/test/e2e/src/common/kubernetes_node_utility.py
new file mode 100644
index 000000000..050ce8b87
--- /dev/null
+++ b/test/e2e/src/common/kubernetes_node_utility.py
@@ -0,0 +1,12 @@
+import pytest
+
+def get_kubernetes_node_count(api_instance):
+    node_list = list_kubernetes_nodes(api_instance)
+    return len(node_list.items)
+
+def list_kubernetes_nodes(api_instance):
+    try:
+        return api_instance.list_node()
+    except Exception as e:
+        pytest.fail("Error occurred while retrieving node information: " + str(e))
+
diff --git a/test/e2e/src/common/kubernetes_pod_utility.py b/test/e2e/src/common/kubernetes_pod_utility.py
new file mode 100644
index 000000000..27345fae7
--- /dev/null
+++ b/test/e2e/src/common/kubernetes_pod_utility.py
@@ -0,0 +1,65 @@
+import pytest
+import time
+
+from kubernetes import watch
+from kubernetes.stream import stream
+
+# Returns a kubernetes pod object in a given namespace. Object description at: https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1PodList.md
+def get_pod(api_instance, namespace, pod_name):
+    try:
+        return api_instance.read_namespaced_pod(pod_name, namespace)
+    except Exception as e:
+        pytest.fail("Error occurred when retrieving pod information: " + str(e))
+
+
+# Returns a list of kubernetes pod objects in a given namespace. Object description at: https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1PodList.md
+def get_pod_list(api_instance, namespace, label_selector=""):
+    try:
+        return api_instance.list_namespaced_pod(namespace, label_selector=label_selector)
+    except Exception as e:
+        pytest.fail("Error occurred when retrieving pod information: " + str(e))
+
+# Gets the content of a log file in the container via exec ('tar cf -' writes the file to stdout as a tar stream)
+def get_log_file_content(api_instance, namespace, podName, logfilePath):
+    try:
+        exec_command = ['tar', 'cf', '-', logfilePath]
+        return stream(api_instance.connect_get_namespaced_pod_exec, podName, namespace, command=exec_command, stderr=True, stdin=False, stdout=True, tty=False)
+    except Exception as e:
+        pytest.fail("Error occurred when retrieving log file content: " + str(e))
+
+# Function that watches events corresponding to pods in the given namespace and passes the events to a callback function
+def watch_pod_status(api_instance, namespace, timeout, callback=None):
+    if not callback:
+        return
+    try:
+        w = watch.Watch()
+        for event in w.stream(api_instance.list_namespaced_pod, namespace, timeout_seconds=timeout):
+            if callback(event):
+                return
+    except Exception as e:
+        pytest.fail("Error occurred when checking pod status: " + str(e))
+    pytest.fail("The watch on the pods has timed out. Please see the pod logs for more info.")
+
+
+# Function that watches events corresponding to pod logs and passes them to a callback function
+def watch_pod_logs(api_instance, namespace, pod_name, container_name, timeout_seconds, callback=None):
+    if not callback:
+        return
+    try:
+        w = watch.Watch()
+        timeout = time.time() + timeout_seconds
+        for event in w.stream(api_instance.read_namespaced_pod_log, pod_name, namespace, container=container_name):
+            if callback(event):
+                return
+            if time.time() > timeout:
+                pytest.fail("The watch on the pod logs has timed out.")
+    except Exception as e:
+        pytest.fail("Error occurred when checking pod logs: " + str(e))
+
+
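+# Illustrative usage of the exec-based reader above; because the exec command is
+# 'tar cf -', the returned string embeds the file, so tests typically scan it
+# for known markers (the pod name and log path here are placeholders):
+#
+#   content = get_log_file_content(api_instance, 'kube-system', 'omsagent-pod',
+#                                  '/var/opt/microsoft/omsagent/log/omsagent.log')
+#   assert 'cAdvisorPerfEmitStreamSuccess' in content
+
+# Function that returns the pod logs of a given container.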
+def get_pod_logs(api_instance, pod_namespace, pod_name, container_name):
+    try:
+        return api_instance.read_namespaced_pod_log(pod_name, pod_namespace, container=container_name)
+    except Exception as e:
+        pytest.fail("Error occurred when fetching pod logs: " + str(e))
diff --git a/test/e2e/src/common/kubernetes_secret_utility.py b/test/e2e/src/common/kubernetes_secret_utility.py
new file mode 100644
index 000000000..8cc07fd4d
--- /dev/null
+++ b/test/e2e/src/common/kubernetes_secret_utility.py
@@ -0,0 +1,26 @@
+import sys
+
+from kubernetes import watch
+
+
+# This function returns the kubernetes secret object present in a given namespace
+def get_kubernetes_secret(api_instance, namespace, secret_name):
+    try:
+        return api_instance.read_namespaced_secret(secret_name, namespace)
+    except Exception as e:
+        sys.exit("Error occurred when retrieving secret '{}': ".format(secret_name) + str(e))
+
+
+# Function that watches events corresponding to kubernetes secrets and passes the events to a callback function
+def watch_kubernetes_secret(api_instance, namespace, secret_name, timeout, callback=None):
+    if not callback:
+        return
+    field_selector = "metadata.name={}".format(secret_name) if secret_name else ""
+    try:
+        w = watch.Watch()
+        for event in w.stream(api_instance.list_namespaced_secret, namespace, field_selector=field_selector, timeout_seconds=timeout):
+            if callback(event):
+                return
+    except Exception as e:
+        sys.exit("Error occurred when watching kubernetes secret events: " + str(e))
+    sys.exit("The watch on the kubernetes secret events has timed out. Please see the pod logs for more info.")
diff --git a/test/e2e/src/common/kubernetes_service_utility.py b/test/e2e/src/common/kubernetes_service_utility.py
new file mode 100644
index 000000000..694af885a
--- /dev/null
+++ b/test/e2e/src/common/kubernetes_service_utility.py
@@ -0,0 +1,19 @@
+import pytest
+
+from kubernetes import watch
+
+
+# Returns a list of services in a given namespace
+def list_service(api_instance, namespace, field_selector="", label_selector=""):
+    try:
+        return api_instance.list_namespaced_service(namespace, field_selector=field_selector, label_selector=label_selector)
+    except Exception as e:
+        pytest.fail("Error occurred when retrieving services: " + str(e))
+
+
+# Deletes a service
+def delete_service(api_instance, namespace, service_name):
+    try:
+        return api_instance.delete_namespaced_service(service_name, namespace)
+    except Exception as e:
+        pytest.fail("Error occurred when deleting service: " + str(e))
diff --git a/test/e2e/src/common/kubernetes_version_utility.py b/test/e2e/src/common/kubernetes_version_utility.py
new file mode 100644
index 000000000..884d1df2f
--- /dev/null
+++ b/test/e2e/src/common/kubernetes_version_utility.py
@@ -0,0 +1,9 @@
+import pytest
+
+
+def get_kubernetes_server_version(api_instance):
+    try:
+        api_response = api_instance.get_code()
+        return api_response.git_version
+    except Exception as e:
+        pytest.fail("Error occurred when retrieving kubernetes server version: " + str(e))
diff --git a/test/e2e/src/common/results_utility.py b/test/e2e/src/common/results_utility.py
new file mode 100644
index 000000000..14066bf16
--- /dev/null
+++ b/test/e2e/src/common/results_utility.py
@@ -0,0 +1,24 @@
+import pytest
+import shutil
+import tarfile
+
+from pathlib import Path
+
+
+# Function to create the test result directory
+def create_results_dir(results_dir):
+    print(results_dir)
+    try:
+        Path(results_dir).mkdir(parents=True, exist_ok=True)
+    except Exception as e:
+        pytest.fail("Unable to
create the results directory: " + str(e)) + + +# Function to append logs from the test run into a result file +def append_result_output(message, result_file_path): + try: + with open(result_file_path, "a") as result_file: + result_file.write(message) + except Exception as e: + pytest.fail("Error while appending message '{}' to results file: ".format(message) + str(e)) diff --git a/test/e2e/src/core/Dockerfile b/test/e2e/src/core/Dockerfile new file mode 100644 index 000000000..9f85bdf4c --- /dev/null +++ b/test/e2e/src/core/Dockerfile @@ -0,0 +1,17 @@ +FROM python:3.6 + +RUN pip install --trusted-host pypi.org --trusted-host files.pythonhosted.org pytest pytest-xdist filelock requests kubernetes adal msrestazure + +RUN curl https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 | bash \ + && helm version + +COPY ./core/e2e_tests.sh / +COPY ./core/pytest.ini /e2etests/ +COPY ./core/conftest.py /e2etests/ +COPY ./core/helper.py /e2etests/ +COPY ./core/ /e2etests/ +COPY ./common/ /e2etests/ +COPY ./tests/ /e2etests/ + +RUN ["chmod", "+x", "/e2e_tests.sh"] +ENTRYPOINT ["./e2e_tests.sh"] diff --git a/test/e2e/src/core/conftest.py b/test/e2e/src/core/conftest.py new file mode 100644 index 000000000..e659d5189 --- /dev/null +++ b/test/e2e/src/core/conftest.py @@ -0,0 +1,90 @@ +import pytest +import os +import time +import pickle + +import constants + +from filelock import FileLock +from pathlib import Path +from results_utility import create_results_dir, append_result_output + +pytestmark = pytest.mark.agentests + +# Fixture to collect all the environment variables, install pre-requisites. It will be run before the tests. +@pytest.fixture(scope='session', autouse=True) +def env_dict(): + my_file = Path("env.pkl") # File to store the environment variables. + with FileLock(str(my_file) + ".lock"): # Locking the file since each test will be run in parallel as separate subprocesses and may try to access the file simultaneously. 
+ env_dict = {} + if not my_file.is_file(): + # Creating the results directory + create_results_dir('/tmp/results') + + # Setting some environment variables + env_dict['SETUP_LOG_FILE'] = '/tmp/results/setup' + env_dict['TEST_AGENT_LOG_FILE'] = '/tmp/results/containerinsights' + env_dict['NUM_TESTS_COMPLETED'] = 0 + + print("Starting setup...") + append_result_output("Starting setup...\n", env_dict['SETUP_LOG_FILE']) + + # Collecting environment variables + env_dict['TENANT_ID'] = os.getenv('TENANT_ID') + env_dict['CLIENT_ID'] = os.getenv('CLIENT_ID') + env_dict['CLIENT_SECRET'] = os.getenv('CLIENT_SECRET') + + # get default query time interval for log analytics queries + queryTimeInterval = int(os.getenv('DEFAULT_QUERY_TIME_INTERVAL_IN_MINUTES')) if os.getenv('DEFAULT_QUERY_TIME_INTERVAL_IN_MINUTES') else constants.DEFAULT_QUERY_TIME_INTERVAL_IN_MINUTES + # add minute suffix since this format required for LA queries + env_dict['DEFAULT_QUERY_TIME_INTERVAL_IN_MINUTES'] = str(queryTimeInterval) + "m" + + # get default query time interval for metrics queries + env_dict['DEFAULT_METRICS_QUERY_TIME_INTERVAL_IN_MINUTES'] = int(os.getenv('DEFAULT_METRICS_QUERY_TIME_INTERVAL_IN_MINUTES')) if os.getenv('DEFAULT_METRICS_QUERY_TIME_INTERVAL_IN_MINUTES') else constants.DEFAULT_METRICS_QUERY_TIME_INTERVAL_IN_MINUTES + + + # expected agent pod restart count + env_dict['AGENT_POD_EXPECTED_RESTART_COUNT'] = int(os.getenv('AGENT_POD_EXPECTED_RESTART_COUNT')) if os.getenv('AGENT_POD_EXPECTED_RESTART_COUNT') else constants.AGENT_POD_EXPECTED_RESTART_COUNT + + # default to azure public cloud if AZURE_CLOUD not specified + env_dict['AZURE_ENDPOINTS'] = constants.AZURE_CLOUD_DICT.get(os.getenv('AZURE_CLOUD')) if os.getenv('AZURE_CLOUD') else constants.AZURE_PUBLIC_CLOUD_ENDPOINTS + + if not env_dict.get('TENANT_ID'): + pytest.fail('ERROR: variable TENANT_ID is required.') + + if not env_dict.get('CLIENT_ID'): + pytest.fail('ERROR: variable CLIENT_ID is required.') + + if not env_dict.get('CLIENT_SECRET'): + pytest.fail('ERROR: variable CLIENT_SECRET is required.') + + print("Setup Complete.") + append_result_output("Setup Complete.\n", env_dict['SETUP_LOG_FILE']) + + with Path.open(my_file, "wb") as f: + pickle.dump(env_dict, f, pickle.HIGHEST_PROTOCOL) + else: + with Path.open(my_file, "rb") as f: + env_dict = pickle.load(f) + + yield env_dict + + my_file = Path("env.pkl") + with FileLock(str(my_file) + ".lock"): + with Path.open(my_file, "rb") as f: + env_dict = pickle.load(f) + + env_dict['NUM_TESTS_COMPLETED'] = 1 + env_dict.get('NUM_TESTS_COMPLETED') + if env_dict['NUM_TESTS_COMPLETED'] == int(os.getenv('NUM_TESTS')): + # Checking if cleanup is required. + if os.getenv('SKIP_CLEANUP'): + return + print('Starting cleanup...') + append_result_output("Starting Cleanup...\n", env_dict['SETUP_LOG_FILE']) + + print("Cleanup Complete.") + append_result_output("Cleanup Complete.\n", env_dict['SETUP_LOG_FILE']) + return + + with Path.open(my_file, "wb") as f: + pickle.dump(env_dict, f, pickle.HIGHEST_PROTOCOL) diff --git a/test/e2e/src/core/e2e_tests.sh b/test/e2e/src/core/e2e_tests.sh new file mode 100644 index 000000000..3bfafdce9 --- /dev/null +++ b/test/e2e/src/core/e2e_tests.sh @@ -0,0 +1,26 @@ +#!/bin/sh + +results_dir="${RESULTS_DIR:-/tmp/results}" + +# saveResults prepares the results for handoff to the Sonobuoy worker. +# See: https://github.com/vmware-tanzu/sonobuoy/blob/master/docs/plugins.md +saveResults() { + cd ${results_dir} + + # Sonobuoy worker expects a tar file. 
+  tar czf results.tar.gz *
+
+  # Signal to the worker that we are done and where to find the results.
+  printf ${results_dir}/results.tar.gz > ${results_dir}/done
+}
+
+# Ensure that we tell the Sonobuoy worker we are done regardless of results.
+trap saveResults EXIT
+
+# The variables 'TEST_NAME_LIST' and 'TEST_MARKER_LIST' select specific tests; if not provided, all tests are run.
+
+NUM_PROCESS=$(pytest /e2etests/ --collect-only -k "$TEST_NAME_LIST" -m "$TEST_MARKER_LIST" | grep " 0):
+            pytest.fail("numberMisscheduled shouldn't be greater than 0 for the daemonset {}.".format(
+                daemonset_name))
+
+    except Exception as e:
+        pytest.fail("Error occurred while checking daemonset status: " + str(e))
+
+# This function checks the status of kubernetes pods
+def check_kubernetes_pods_status(pod_namespace, label_selector, expectedPodRestartCount, outfile=None):
+    try:
+        api_instance = client.CoreV1Api()
+        pod_list = get_pod_list(api_instance, pod_namespace, label_selector)
+        append_result_output("podlist output {}\n".format(pod_list), outfile)
+        if not pod_list:
+            pytest.fail("pod_list shouldn't be null or empty")
+        pods = pod_list.items
+        if not pods:
+            pytest.fail("pod items shouldn't be null or empty")
+        if len(pods) <= 0:
+            pytest.fail("pod count should be greater than 0")
+        for pod in pods:
+            status = pod.status
+            podstatus = status.phase
+            if not podstatus:
+                pytest.fail("status should not be null or empty")
+            if podstatus != "Running":
+                pytest.fail("pod status should be in running state")
+            containerStatuses = status.container_statuses
+            if not containerStatuses:
+                pytest.fail("containerStatuses shouldn't be nil or empty")
+            if len(containerStatuses) <= 0:
+                pytest.fail("length of containerStatuses should be greater than 0")
+            for containerStatus in containerStatuses:
+                containerId = containerStatus.container_id
+                if not containerId:
+                    pytest.fail("containerId shouldn't be nil or empty")
+                image = containerStatus.image
+                if not image:
+                    pytest.fail("image shouldn't be nil or empty")
+                imageId = containerStatus.image_id
+                if not imageId:
+                    pytest.fail("imageId shouldn't be nil or empty")
+                restartCount = containerStatus.restart_count
+                if restartCount > expectedPodRestartCount:
+                    pytest.fail("restartCount shouldn't be greater than expected pod restart count: {}".format(expectedPodRestartCount))
+                ready = containerStatus.ready
+                if not ready:
+                    pytest.fail("container status should be in ready state")
+                containerState = containerStatus.state
+                if not containerState.running:
+                    pytest.fail("container state should be in running state")
+    except Exception as e:
+        pytest.fail("Error occurred while checking pods status: " + str(e))
+
+
+def check_namespace_status_using_watch(outfile=None, namespace_list=None, timeout=300):
+    namespace_dict = {}
+    for namespace in namespace_list:
+        namespace_dict[namespace] = 0
+    append_result_output(
+        "Namespace dict: {}\n".format(namespace_dict), outfile)
+    print("Generated the namespace dictionary.")
+
+    # The callback function to check the namespace status
+    def namespace_event_callback(event):
+        try:
+            append_result_output("{}\n".format(event), outfile)
+            namespace_name = event['raw_object'].get('metadata').get('name')
+            namespace_status = event['raw_object'].get('status')
+            if not namespace_status:
+                return False
+            if namespace_status.get('phase') == 'Active':
+                namespace_dict[namespace_name] = 1
+            if all(ele == 1 for ele in list(namespace_dict.values())):
+                return True
+            return False
+        except Exception as e:
+            pytest.fail(
+                "Error occurred while processing the namespace event: " + str(e))
+
+    # Checking the namespace status
+    api_instance = client.CoreV1Api()
+    watch_namespace(api_instance, timeout, namespace_event_callback)
+
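+# Illustrative usage of the helper above (the namespace list is a placeholder):
+#
+#   check_namespace_status_using_watch(outfile='/tmp/results/setup',
+#                                      namespace_list=['kube-system'],
+#                                      timeout=300)
+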
+# This function checks the status of daemonsets in a given namespace. The daemonsets to be monitored are identified using the daemonset label list parameter.
+def check_kubernetes_daemonset_status_using_watch(daemonset_namespace, outfile=None, daemonset_label_list=None, timeout=300):
+    daemonset_label_dict = {}
+    if daemonset_label_list:  # This parameter is a list of label values to identify the daemonsets that we want to monitor in the given namespace
+        for daemonset_label in daemonset_label_list:
+            daemonset_label_dict[daemonset_label] = 0
+        append_result_output("daemonset label dict: {}\n".format(
+            daemonset_label_dict), outfile)
+        print("Generated the daemonset dictionary.")
+
+    # The callback function to check whether the daemonset is fully scheduled and ready
+    def daemonset_event_callback(event):
+        try:
+            # append_result_output("{}\n".format(event), outfile)
+            daemonset_status = event['raw_object'].get('status')
+            daemonset_metadata = event['raw_object'].get('metadata')
+            daemonset_metadata_labels = daemonset_metadata.get('labels')
+            if not daemonset_metadata_labels:
+                return False
+
+            # It contains the list of all label values for the daemonset whose event was called.
+            daemonset_metadata_label_values = daemonset_metadata_labels.values()
+            # This label value will be common to the daemonset event and the label list provided, and will be monitored
+            current_label_value = None
+            for label_value in daemonset_metadata_label_values:
+                if label_value in daemonset_label_dict:
+                    current_label_value = label_value
+            if not current_label_value:
+                return False
+
+            currentNumberScheduled = daemonset_status.get(
+                'currentNumberScheduled')
+            desiredNumberScheduled = daemonset_status.get(
+                'desiredNumberScheduled')
+            numberAvailable = daemonset_status.get('numberAvailable')
+            numberReady = daemonset_status.get('numberReady')
+            numberMisscheduled = daemonset_status.get('numberMisscheduled')
+
+            if (currentNumberScheduled != desiredNumberScheduled):
+                pytest.fail("currentNumberScheduled doesn't match desiredNumberScheduled for the daemonset {}.".format(
+                    daemonset_metadata.get('name')))
+
+            if (numberAvailable != numberReady):
+                pytest.fail("numberAvailable doesn't match the expected numberReady for the daemonset {}.".format(
+                    daemonset_metadata.get('name')))
+
+            if (numberMisscheduled > 0):
+                pytest.fail("numberMisscheduled is greater than 0 for the daemonset {}.".format(
+                    daemonset_metadata.get('name')))
+
+            return True
+        except Exception as e:
+            print("Error occurred while processing the daemonset event: " + str(e))
+
+    # Checking status of all daemonsets
+    if daemonset_label_dict:
+        api_instance = client.AppsV1Api()
+        watch_daemon_set_status(
+            api_instance, daemonset_namespace, timeout, daemonset_event_callback)
+
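+# Illustrative usage of the helper above; the label value and namespace are
+# placeholders for whatever identifies the agent daemonset in the target cluster:
+#
+#   check_kubernetes_daemonset_status_using_watch('kube-system',
+#                                                 daemonset_label_list=['omsagent'],
+#                                                 timeout=300)
+
+# This function checks the status of deployments in a given namespace. The deployments to be monitored are identified using the deployment label list parameter.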
+def check_kubernetes_deployments_status_using_watch(deployment_namespace, outfile=None, deployment_label_list=None, timeout=300):
+    deployment_label_dict = {}
+    if deployment_label_list:  # This parameter is a list of label values to identify the deployments that we want to monitor in the given namespace
+        for deployment_label in deployment_label_list:
+            deployment_label_dict[deployment_label] = 0
+        append_result_output("Deployment label dict: {}\n".format(
+            deployment_label_dict), outfile)
+        print("Generated the deployment dictionary.")
+
+    # The callback function to check whether the deployment has all replicas available and ready
+    def deployment_event_callback(event):
+        try:
+            # append_result_output("{}\n".format(event), outfile)
+            deployment_status = event['raw_object'].get('status')
+            deployment_metadata = event['raw_object'].get('metadata')
+            deployment_metadata_labels = deployment_metadata.get('labels')
+            if not deployment_metadata_labels:
+                return False
+
+            # It contains the list of all label values for the deployment whose event was called.
+            deployment_metadata_label_values = deployment_metadata_labels.values()
+            # This label value will be common to the deployment event and the label list provided, and will be monitored
+            current_label_value = None
+            for label_value in deployment_metadata_label_values:
+                if label_value in deployment_label_dict:
+                    current_label_value = label_value
+            if not current_label_value:
+                return False
+
+            availableReplicas = deployment_status.get('availableReplicas')
+            readyReplicas = deployment_status.get('readyReplicas')
+            replicas = deployment_status.get('replicas')
+
+            if (replicas != availableReplicas):
+                pytest.fail("availableReplicas doesn't match the expected replicas for the deployment {}.".format(
+                    deployment_metadata.get('name')))
+
+            if (replicas != readyReplicas):
+                pytest.fail("readyReplicas doesn't match the expected replicas for the deployment {}.".format(
+                    deployment_metadata.get('name')))
+
+            return True
+        except Exception as e:
+            print("Error occurred while processing the deployment event: " + str(e))
+
+    # Checking status of all deployments
+    if deployment_label_dict:
+        api_instance = client.AppsV1Api()
+        watch_deployment_status(
+            api_instance, deployment_namespace, timeout, deployment_event_callback)
+
+# This function checks the status of pods in a given namespace. The pods to be monitored are identified using the pod label list parameter.
+def check_kubernetes_pods_status_using_watch(pod_namespace, outfile=None, pod_label_list=None, timeout=300):
+    pod_label_dict = {}
+    if pod_label_list:  # This parameter is a list of label values to identify the pods that we want to monitor in the given namespace
+        for pod_label in pod_label_list:
+            pod_label_dict[pod_label] = 0
+        append_result_output(
+            "Pod label dict: {}\n".format(pod_label_dict), outfile)
+        print("Generated the pods dictionary.")
+
+    # The callback function to check if the pod is in running state
+    def pod_event_callback(event):
+        try:
+            # append_result_output("{}\n".format(event), outfile)
+            pod_status = event['raw_object'].get('status')
+            pod_metadata = event['raw_object'].get('metadata')
+            pod_metadata_labels = pod_metadata.get('labels')
+            if not pod_metadata_labels:
+                return False
+
+            # It contains the list of all label values for the pod whose event was called.
+            pod_metadata_label_values = pod_metadata_labels.values()
+            # This label value will be common to the pod event and the label list provided, and will be monitored
+            current_label_value = None
+            for label_value in pod_metadata_label_values:
+                if label_value in pod_label_dict:
+                    current_label_value = label_value
+            if not current_label_value:
+                return False
+
+            if pod_status.get('containerStatuses'):
+                for container in pod_status.get('containerStatuses'):
+                    if container.get('restartCount') > 0:
+                        pytest.fail("The pod {} was restarted. Please see the pod logs for more info.".format(
+                            container.get('name')))
+                    if not container.get('state').get('running'):
+                        pod_label_dict[current_label_value] = 0
+                        return False
+                    else:
+                        pod_label_dict[current_label_value] = 1
+            if all(ele == 1 for ele in list(pod_label_dict.values())):
+                return True
+            return False
+        except Exception as e:
+            pytest.fail(
+                "Error occurred while processing the pod event: " + str(e))
+
+    # Checking status of all pods
+    if pod_label_dict:
+        api_instance = client.CoreV1Api()
+        watch_pod_status(api_instance, pod_namespace,
+                         timeout, pod_event_callback)
+
+
+# Function to check if the crd instance status has been updated with the status fields mentioned in the 'status_dict' parameter
+def check_kubernetes_crd_status_using_watch(crd_group, crd_version, crd_namespace, crd_plural, crd_name, status_dict={}, outfile=None, timeout=300):
+    # The callback function to check if the crd event received has been updated with the status fields
+    def crd_event_callback(event):
+        try:
+            append_result_output("{}\n".format(event), outfile)
+            crd_status = event['raw_object'].get('status')
+            if not crd_status:
+                return False
+            for status_field in status_dict:
+                if not crd_status.get(status_field):
+                    return False
+                if crd_status.get(status_field) != status_dict.get(status_field):
+                    pytest.fail(
+                        "The CRD instance status has been updated with an incorrect value for the '{}' field.".format(status_field))
+            return True
+        except Exception as e:
+            pytest.fail("Error occurred while processing crd event: " + str(e))
+
+    # Checking if the CRD instance has been updated with the status fields
+    api_instance = client.CustomObjectsApi()
+    watch_crd_instance(api_instance, crd_group, crd_version, crd_namespace,
+                       crd_plural, crd_name, timeout, crd_event_callback)
+
+
+# Function to monitor the pod logs. It will ensure that all logs passed in the 'logs_list' parameter are present in the container logs.
+def check_kubernetes_pod_logs_using_watch(pod_namespace, pod_name, container_name, logs_list=None, error_logs_list=None, outfile=None, timeout=300):
+    logs_dict = {}
+    for log in logs_list:
+        logs_dict[log] = 0
+    print("Generated the logs dictionary.")
+
+    # The callback function to examine the pod log
+    def pod_log_event_callback(event):
+        try:
+            append_result_output("{}\n".format(event), outfile)
+            for error_log in error_logs_list:
+                if error_log in event:
+                    pytest.fail("Error log found: " + event)
+            for log in logs_dict:
+                if log in event:
+                    logs_dict[log] = 1
+            if all(ele == 1 for ele in list(logs_dict.values())):
+                return True
+            return False
+        except Exception as e:
+            pytest.fail(
+                "Error occurred while processing pod log event: " + str(e))
+
+    # Checking the pod logs
+    api_instance = client.CoreV1Api()
+    watch_pod_logs(api_instance, pod_namespace, pod_name,
+                   container_name, timeout, pod_log_event_callback)
+
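+# Illustrative usage of the helper above, wired to the daemonset emit-stream
+# markers defined in the constants module (the pod and container names are
+# placeholders):
+#
+#   check_kubernetes_pod_logs_using_watch('kube-system', 'omsagent-pod', 'omsagent',
+#                                         logs_list=[constants.CONTAINER_PERF_EMIT_STREAM],
+#                                         error_logs_list=['error'],
+#                                         timeout=300)
+
+# Function to monitor the kubernetes secret. It will determine if the secret has been successfully created.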
+def check_kubernetes_secret_using_watch(secret_namespace, secret_name, timeout=300):
+    # The callback function to check if the secret event received has secret data
+    def secret_event_callback(event):
+        try:
+            secret_data = event['raw_object'].get('data')
+            if not secret_data:
+                return False
+            return True
+        except Exception as e:
+            pytest.fail(
+                "Error occurred while processing secret event: " + str(e))
+
+    # Checking the kubernetes secret
+    api_instance = client.CoreV1Api()
+    watch_kubernetes_secret(api_instance, secret_namespace,
+                            secret_name, timeout, secret_event_callback)
diff --git a/test/e2e/src/core/pytest.ini b/test/e2e/src/core/pytest.ini
new file mode 100644
index 000000000..f4dc462f0
--- /dev/null
+++ b/test/e2e/src/core/pytest.ini
@@ -0,0 +1,4 @@
+[pytest]
+markers =
+    agentests: marks tests as part of the arc agent conformance tests (deselect with '-m "not agentests"')
+    
\ No newline at end of file
diff --git a/test/e2e/src/tests/test_ds_workflows.py b/test/e2e/src/tests/test_ds_workflows.py
new file mode 100755
index 000000000..81ef08325
--- /dev/null
+++ b/test/e2e/src/tests/test_ds_workflows.py
@@ -0,0 +1,60 @@
+import pytest
+import constants
+
+from kubernetes import client, config
+from kubernetes_pod_utility import get_pod_list, get_log_file_content
+from results_utility import append_result_output
+from helper import check_kubernetes_deployment_status
+from helper import check_kubernetes_daemonset_status
+from helper import check_kubernetes_pods_status
+from kubernetes.stream import stream
+
+pytestmark = pytest.mark.agentests
+
+# validation of daemonset agent workflows
+def test_ds_workflows(env_dict):
+    print("Starting daemonset agent workflows test.")
+    append_result_output("test_ds_workflows start \n",
+                         env_dict['TEST_AGENT_LOG_FILE'])
+    # Loading in-cluster kube-config
+    try:
+        config.load_incluster_config()
+    except Exception as e:
+        pytest.fail("Error loading the in-cluster config: " + str(e))
+
+    print("getting daemonset pod list")
+    api_instance = client.CoreV1Api()
+    pod_list = get_pod_list(api_instance, constants.AGENT_RESOURCES_NAMESPACE,
+                            constants.AGENT_DAEMON_SET_PODS_LABEL_SELECTOR)
+    if not pod_list:
+        pytest.fail("daemonset pod_list shouldn't be null or empty")
+
+    if len(pod_list.items) <= 0:
+        pytest.fail("number of items in daemonset pod list should be greater than 0")
+
+    for podItem in pod_list.items:
+        podName = podItem.metadata.name
+        logcontent = get_log_file_content(
+            api_instance, constants.AGENT_RESOURCES_NAMESPACE, podName, constants.AGENT_OMSAGENT_LOG_PATH)
+        if not logcontent:
+            pytest.fail("logcontent should not be null or empty for pod: " + podName)
+        loglines = logcontent.split("\n")
+        if len(loglines) <= 0:
+            pytest.fail("number of log lines should be greater than 0 for pod: " + podName)
+
+        IsContainerPerfEmitStream = False
+        IsContainerInventoryStream = False
+        for line in loglines:
+            if line.find(constants.CONTAINER_PERF_EMIT_STREAM) >= 0:
+                IsContainerPerfEmitStream = True
+            if line.find(constants.CONTAINER_INVENTORY_EMIT_STREAM) >= 0:
+                IsContainerInventoryStream = True
+
+        if not IsContainerPerfEmitStream:
+            pytest.fail("ContainerPerf stream not emitted successfully from pod: " + podName)
+        if not IsContainerInventoryStream:
+            pytest.fail("ContainerInventory stream not emitted successfully from pod: " + podName)
+
+    append_result_output("test_ds_workflows end \n",
+                         env_dict['TEST_AGENT_LOG_FILE'])
+    print("Successfully completed daemonset workflows test.")
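The next patch authenticates with fetch_aad_token from the arm_rest_utility module, which is not part of this excerpt. Assuming it wraps the adal package that the test image installs (see the Dockerfile above), an equivalent client-credentials call would look roughly like this; a sketch, not the module's actual code:

    import adal

    def fetch_aad_token_sketch(client_id, client_secret, authority_uri, resource):
        # Returns the ADAL token dictionary; the tests read its 'accessToken' key.
        context = adal.AuthenticationContext(authority_uri)
        return context.acquire_token_with_client_credentials(resource, client_id, client_secret)
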
diff --git a/test/e2e/src/tests/test_e2e_workflows.py b/test/e2e/src/tests/test_e2e_workflows.py
new file mode 100755
index 000000000..11a8e18e3
--- /dev/null
+++ b/test/e2e/src/tests/test_e2e_workflows.py
@@ -0,0 +1,330 @@
+import pytest
+import constants
+import requests
+
+from arm_rest_utility import fetch_aad_token
+from kubernetes import client, config
+from kubernetes_pod_utility import get_pod_list
+from results_utility import append_result_output
+
+
+pytestmark = pytest.mark.agentests
+
+# validation of workflows e2e
+def test_e2e_workflows(env_dict):
+    print("Starting e2e workflows test.")
+    append_result_output("test_e2e_workflows start \n",
+                         env_dict['TEST_AGENT_LOG_FILE'])
+    # Loading in-cluster kube-config
+    try:
+        config.load_incluster_config()
+    except Exception as e:
+        pytest.fail("Error loading the in-cluster config: " + str(e))
+
+    # query time interval for LA queries
+    queryTimeInterval = env_dict['DEFAULT_QUERY_TIME_INTERVAL_IN_MINUTES']
+    if not queryTimeInterval:
+        pytest.fail("DEFAULT_QUERY_TIME_INTERVAL_IN_MINUTES should not be null or empty")
+
+    # get the cluster resource id from replicaset pod envvars
+    api_instance = client.CoreV1Api()
+    pod_list = get_pod_list(api_instance, constants.AGENT_RESOURCES_NAMESPACE,
+                            constants.AGENT_DEPLOYMENT_PODS_LABEL_SELECTOR)
+
+    if not pod_list:
+        pytest.fail("pod_list shouldn't be null or empty")
+
+    if len(pod_list.items) <= 0:
+        pytest.fail("number of items in pod list should be greater than 0")
+
+    envVars = pod_list.items[0].spec.containers[0].env
+    if not envVars:
+        pytest.fail("environment variables should be defined in the replicaset pod")
+
+    clusterResourceId = ''
+    for env in envVars:
+        if env.name == "AKS_RESOURCE_ID":
+            clusterResourceId = env.value
+    print("cluster resource id: {}".format(clusterResourceId))
+
+    if not clusterResourceId:
+        pytest.fail("failed to get clusterResourceId from replicaset pod environment variables")
+
+    # fetch AAD token for log analytics resource for the queries
+    tenant_id = env_dict.get('TENANT_ID')
+    authority_uri = env_dict.get('AZURE_ENDPOINTS').get('activeDirectory') + tenant_id
+    client_id = env_dict.get('CLIENT_ID')
+    client_secret = env_dict.get('CLIENT_SECRET')
+    resource = env_dict.get('AZURE_ENDPOINTS').get('logAnalytics')
+    aad_token = fetch_aad_token(client_id, client_secret, authority_uri, resource)
+    if not aad_token:
+        pytest.fail("failed to fetch AAD token")
+
+    access_token = aad_token.get('accessToken')
+    if not access_token:
+        pytest.fail("access_token shouldn't be null or empty")
+
+    # validate e2e workflows by checking data in the log analytics workspace through resource-centric
+    # queries; every workflow follows the same pattern, so the queries are validated in a single loop
+    queryUrl = resource + "/v1" + clusterResourceId + "/query"
+    Headers = {
+        "Authorization": str("Bearer " + access_token),
+        "Content-Type": "application/json"
+    }
+    workflow_queries = [
+        # inventory and events
+        (constants.KUBE_POD_INVENTORY_QUERY, 'KUBE_POD_INVENTORY'),
+        (constants.KUBE_NODE_INVENTORY_QUERY, 'KUBE_NODE_INVENTORY'),
+        (constants.KUBE_SERVICES_QUERY, 'KUBE_SERVICES'),
+        (constants.KUBE_EVENTS_QUERY, 'KUBE_EVENTS'),
+        (constants.CONTAINER_NODE_INVENTORY_QUERY, 'CONTAINER_NODE_INVENTORY'),
+        # node perf
+        (constants.NODE_PERF_CPU_CAPCITY_QUERY, 'NODE_PERF_CPU_CAPCITY'),
+        (constants.NODE_PERF_MEMORY_CAPCITY_QUERY, 'NODE_PERF_MEMORY_CAPCITY'),
+        (constants.NODE_PERF_CPU_ALLOCATABLE_QUERY, 'NODE_PERF_CPU_ALLOCATABLE'),
+        (constants.NODE_PERF_MEMORY_ALLOCATABLE_QUERY, 'NODE_PERF_MEMORY_ALLOCATABLE'),
+        (constants.NODE_PERF_CPU_USAGE_QUERY, 'NODE_PERF_CPU_USAGE'),
+        (constants.NODE_PERF_MEMORY_RSS_USAGE_QUERY, 'NODE_PERF_MEMORY_RSS_USAGE'),
+        (constants.NODE_PERF_MEMORY_WS_USAGE_QUERY, 'NODE_PERF_MEMORY_WS_USAGE'),
+        (constants.NODE_PERF_RESTART_TIME_EPOCH_QUERY, 'NODE_PERF_RESTART_TIME_EPOCH'),
+        # container perf
+        (constants.CONTAINER_PERF_CPU_LIMITS_QUERY, 'CONTAINER_PERF_CPU_LIMITS'),
+        (constants.CONTAINER_PERF_MEMORY_LIMITS_QUERY, 'CONTAINER_PERF_MEMORY_LIMITS'),
+        (constants.CONTAINER_PERF_CPU_REQUESTS_QUERY, 'CONTAINER_PERF_CPU_REQUESTS'),
+        (constants.CONTAINER_PERF_MEMORY_REQUESTS_QUERY, 'CONTAINER_PERF_MEMORY_REQUESTS'),
+        (constants.CONTAINER_PERF_CPU_USAGE_QUERY, 'CONTAINER_PERF_CPU_USAGE'),
+        (constants.CONTAINER_PERF_MEMORY_RSS_USAGE_QUERY, 'CONTAINER_PERF_MEMORY_RSS_USAGE'),
+        (constants.CONTAINER_PERF_MEMORY_WS_USAGE_QUERY, 'CONTAINER_PERF_MEMORY_WS_USAGE'),
+        (constants.CONTAINER_PERF_RESTART_TIME_EPOCH_QUERY, 'CONTAINER_PERF_RESTART_TIME_EPOCH'),
+        # container log and insights metrics
+        (constants.CONTAINER_LOG_QUERY, 'CONTAINER_LOG'),
+        (constants.INSIGHTS_METRICS_QUERY, 'INSIGHTS_METRICS'),
+    ]
+
+    for queryTemplate, workflowName in workflow_queries:
+        query = queryTemplate.format(queryTimeInterval)
+        params = {'query': query}
+        result = requests.get(queryUrl, params=params, headers=Headers, verify=False)
+        if not result:
+            pytest.fail("log analytics query response shouldn't be null or empty for workflow: {0}".format(workflowName))
+
+        rowCount = result.json()['tables'][0]['rows'][0][0]
+        if not rowCount:
+            pytest.fail("rowCount should be greater than 0 for cluster: {0} and workflow: {1}".format(clusterResourceId, workflowName))
+
+    append_result_output("test_e2e_workflows end \n",
+                         env_dict['TEST_AGENT_LOG_FILE'])
+    print("Successfully completed e2e workflows test.")
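The next patch queries the Azure Monitor custom-metrics REST API, whose timespan parameter is a start/end pair in ISO-8601 UTC. The construction it uses is easy to misread, so here it is in isolation (the [:-3] trims microseconds to milliseconds before the UTC 'Z' designator is appended; the 10-minute window is only an example):

    from datetime import datetime, timedelta

    now = datetime.utcnow()
    endtime = now.isoformat()[:-3] + 'Z'      # e.g. 2020-08-05T17:45:14.123Z
    starttime = (now - timedelta(minutes=10)).isoformat()[:-3] + 'Z'
    timespan = '{0}/{1}'.format(starttime, endtime)
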
if env.name == "AKS_RESOURCE_ID": + clusterResourceId = env.value + print("cluster resource id: {}".format(clusterResourceId)) + + if not clusterResourceId: + pytest.fail( + "failed to get clusterResourceId from replicaset pod environment variables") + + # fetch AAD token for metric queries + tenant_id = env_dict.get('TENANT_ID') + authority_uri = env_dict.get('AZURE_ENDPOINTS').get( + 'activeDirectory') + tenant_id + client_id = env_dict.get('CLIENT_ID') + client_secret = env_dict.get('CLIENT_SECRET') + resourceManager = env_dict.get('AZURE_ENDPOINTS').get('resourceManager') + aad_token = fetch_aad_token( + client_id, client_secret, authority_uri, resourceManager) + if not aad_token: + pytest.fail("failed to fetch AAD token") + + access_token = aad_token.get('accessToken') + if not access_token: + pytest.fail("access_token shouldnt be null or empty") + + # validate metrics e2e workflow + now = datetime.utcnow() + endtime = now.isoformat()[:-3]+'Z' + starttime = (now - timedelta(hours=0, + minutes=constants.DEFAULT_METRICS_QUERY_TIME_INTERVAL_IN_MINUTES)).isoformat()[:-3]+'Z' + Headers = { + "Authorization": str("Bearer " + access_token), + "Content-Type": "application/json", + "content-length": "0" + } + params = {} + # node metric - memoryRssBytes + custommetricsUrl = '{0}{1}/providers/microsoft.Insights/metrics?timespan={2}/{3}&interval=FULL&metricnames={4}&aggregation={5}&metricNamespace={6}&validatedimensions=false&api-version={7}'.format( + resourceManager.rstrip("/"), + clusterResourceId, + starttime, + endtime, + constants.NODE_MEMORY_RSS_METRIC_NAME, + constants.NODE_METRIC_METRIC_AGGREGATION, + constants.NODE_METRICS_NAMESPACE, + constants.METRICS_API_VERSION) + + response = requests.get(custommetricsUrl, params=params, + headers=Headers, verify=False) + + if not response: + pytest.fail( + "response of the metrics query API shouldnt be null or empty") + + if response.status_code != 200: + pytest.fail("metrics query API failed with an error code: {}".format( + response.status_code)) + + responseJSON = response.json() + if not responseJSON: + pytest.fail("response JSON shouldnt be null or empty") + + namespace = responseJSON['namespace'] + if namespace != constants.NODE_METRICS_NAMESPACE: + pytest.fail("got the namespace: {0} but expected namespace:{1} in the response".format( + namespace, constants.NODE_METRICS_NAMESPACE)) + + responseValues = responseJSON['value'] + if not responseValues: + pytest.fail("response JSON shouldnt be null or empty") + + if len(responseValues) <= 0: + pytest.fail("length of value array in the response should be greater than 0") + + for responseVal in responseValues: + metricName = responseVal['name']['value'] + if metricName != constants.NODE_MEMORY_RSS_METRIC_NAME: + pytest.fail("got the metricname: {0} but expected metricname:{1} in the response".format(metricName, constants.NODE_MEMORY_RSS_METRIC_NAME)) + timeseries = responseVal['timeseries'] + if not timeseries: + pytest.fail("metric series shouldnt be null or empty for metric:{0} in namespace: {1}".format( + constants.NODE_MEMORY_RSS_METRIC_NAME, constants.NODE_METRICS_NAMESPACE)) + if len(timeseries) <= 0: + pytest.fail("length of timeseries should be greater than for 0 for metric: {0} in namespace :{1}".format(constants.NODE_MEMORY_RSS_METRIC_NAME, constants.NODE_METRICS_NAMESPACE)) + + # node metric - memoryRssPercentage + custommetricsUrl = 
'{0}{1}/providers/microsoft.Insights/metrics?timespan={2}/{3}&interval=FULL&metricnames={4}&aggregation={5}&metricNamespace={6}&validatedimensions=false&api-version={7}'.format( + resourceManager.rstrip("/"), + clusterResourceId, + starttime, + endtime, + constants.NODE_MEMORY_RSS_PERCENTAGE_METRIC_NAME, + constants.NODE_METRIC_METRIC_AGGREGATION, + constants.NODE_METRICS_NAMESPACE, + constants.METRICS_API_VERSION) + + response = requests.get(custommetricsUrl, params=params, + headers=Headers, verify=False) + + if not response: + pytest.fail( + "response of the metrics query API shouldnt be null or empty") + + if response.status_code != 200: + pytest.fail("metrics query API failed with an error code: {}".format( + response.status_code)) + + responseJSON = response.json() + if not responseJSON: + pytest.fail("response JSON shouldnt be null or empty") + + namespace = responseJSON['namespace'] + if namespace != constants.NODE_METRICS_NAMESPACE: + pytest.fail("got the namespace: {0} but expected namespace:{1} in the response".format( + namespace, constants.NODE_METRICS_NAMESPACE)) + + responseValues = responseJSON['value'] + if not responseValues: + pytest.fail("response JSON shouldnt be null or empty") + + if len(responseValues) <= 0: + pytest.fail("length of value array in the response should be greater than 0") + + for responseVal in responseValues: + metricName = responseVal['name']['value'] + if metricName != constants.NODE_MEMORY_RSS_PERCENTAGE_METRIC_NAME: + pytest.fail("got the metricname: {0} but expected metricname:{1} in the response".format(metricName, constants.NODE_MEMORY_RSS_PERCENTAGE_METRIC_NAME)) + timeseries = responseVal['timeseries'] + if not timeseries: + pytest.fail("metric series shouldnt be null or empty for metric:{0} in namespace: {1}".format( + constants.NODE_MEMORY_RSS_PERCENTAGE_METRIC_NAME, constants.NODE_METRICS_NAMESPACE)) + if len(timeseries) <= 0: + pytest.fail("length of timeseries should be greater than for 0 for metric: {0} in namespace :{1}".format(constants.NODE_MEMORY_RSS_PERCENTAGE_METRIC_NAME, constants.NODE_METRICS_NAMESPACE)) + + # node metric - memoryWorkingSetBytes + custommetricsUrl = '{0}{1}/providers/microsoft.Insights/metrics?timespan={2}/{3}&interval=FULL&metricnames={4}&aggregation={5}&metricNamespace={6}&validatedimensions=false&api-version={7}'.format( + resourceManager.rstrip("/"), + clusterResourceId, + starttime, + endtime, + constants.NODE_MEMORY_WS_METRIC_NAME, + constants.NODE_METRIC_METRIC_AGGREGATION, + constants.NODE_METRICS_NAMESPACE, + constants.METRICS_API_VERSION) + + response = requests.get(custommetricsUrl, params=params, + headers=Headers, verify=False) + + if not response: + pytest.fail("response of the metrics query API shouldnt be null or empty") + + if response.status_code != 200: + pytest.fail("metrics query API failed with an error code: {}".format( + response.status_code)) + + responseJSON = response.json() + if not responseJSON: + pytest.fail("response JSON shouldnt be null or empty") + + namespace = responseJSON['namespace'] + if namespace != constants.NODE_METRICS_NAMESPACE: + pytest.fail("got the namespace: {0} but expected namespace:{1} in the response".format( + namespace, constants.NODE_METRICS_NAMESPACE)) + + responseValues = responseJSON['value'] + if not responseValues: + pytest.fail("response JSON shouldnt be null or empty") + + if len(responseValues) <= 0: + pytest.fail("length of value array in the response should be greater than 0") + + for responseVal in responseValues: + metricName = 
responseVal['name']['value']
+        if metricName != constants.NODE_MEMORY_WS_METRIC_NAME:
+            pytest.fail("got the metricname: {0} but expected metricname: {1} in the response".format(metricName, constants.NODE_MEMORY_WS_METRIC_NAME))
+        timeseries = responseVal['timeseries']
+        if not timeseries:
+            pytest.fail("metric series shouldn't be null or empty for metric: {0} in namespace: {1}".format(
+                constants.NODE_MEMORY_WS_METRIC_NAME, constants.NODE_METRICS_NAMESPACE))
+        if len(timeseries) <= 0:
+            pytest.fail("length of timeseries should be greater than 0 for metric: {0} in namespace: {1}".format(constants.NODE_MEMORY_WS_METRIC_NAME, constants.NODE_METRICS_NAMESPACE))
+
+    # node metric - memoryWorkingSetPercentage
+    custommetricsUrl = '{0}{1}/providers/microsoft.Insights/metrics?timespan={2}/{3}&interval=FULL&metricnames={4}&aggregation={5}&metricNamespace={6}&validatedimensions=false&api-version={7}'.format(
+        resourceManager.rstrip("/"),
+        clusterResourceId,
+        starttime,
+        endtime,
+        constants.NODE_MEMORY_WS_PERCENTAGE_METRIC_NAME,
+        constants.NODE_METRIC_METRIC_AGGREGATION,
+        constants.NODE_METRICS_NAMESPACE,
+        constants.METRICS_API_VERSION)
+
+    response = requests.get(custommetricsUrl, params=params,
+                            headers=Headers, verify=False)
+
+    if not response:
+        pytest.fail("response of the metrics query API shouldn't be null or empty")
+
+    if response.status_code != 200:
+        pytest.fail("metrics query API failed with an error code: {}".format(
+            response.status_code))
+
+    responseJSON = response.json()
+    if not responseJSON:
+        pytest.fail("response JSON shouldn't be null or empty")
+
+    namespace = responseJSON['namespace']
+    if namespace != constants.NODE_METRICS_NAMESPACE:
+        pytest.fail("got the namespace: {0} but expected namespace: {1} in the response".format(
+            namespace, constants.NODE_METRICS_NAMESPACE))
+
+    responseValues = responseJSON['value']
+    if not responseValues:
+        pytest.fail("response JSON shouldn't be null or empty")
+
+    if len(responseValues) <= 0:
+        pytest.fail("length of value array in the response should be greater than 0")
+
+    for responseVal in responseValues:
+        metricName = responseVal['name']['value']
+        if metricName != constants.NODE_MEMORY_WS_PERCENTAGE_METRIC_NAME:
+            pytest.fail("got the metricname: {0} but expected metricname: {1} in the response".format(metricName, constants.NODE_MEMORY_WS_PERCENTAGE_METRIC_NAME))
+        timeseries = responseVal['timeseries']
+        if not timeseries:
+            pytest.fail("metric series shouldn't be null or empty for metric: {0} in namespace: {1}".format(
+                constants.NODE_MEMORY_WS_PERCENTAGE_METRIC_NAME, constants.NODE_METRICS_NAMESPACE))
+        if len(timeseries) <= 0:
+            pytest.fail("length of timeseries should be greater than 0 for metric: {0} in namespace: {1}".format(constants.NODE_MEMORY_WS_PERCENTAGE_METRIC_NAME, constants.NODE_METRICS_NAMESPACE))
+
+    # node metric - cpuUsageMilliCores
+    custommetricsUrl = '{0}{1}/providers/microsoft.Insights/metrics?timespan={2}/{3}&interval=FULL&metricnames={4}&aggregation={5}&metricNamespace={6}&validatedimensions=false&api-version={7}'.format(
+        resourceManager.rstrip("/"),
+        clusterResourceId,
+        starttime,
+        endtime,
+        constants.NODE_CPU_USAGE_MILLI_CORES_METRIC_NAME,
+        constants.NODE_METRIC_METRIC_AGGREGATION,
+        constants.NODE_METRICS_NAMESPACE,
+        constants.METRICS_API_VERSION)
+
+    response = requests.get(custommetricsUrl, params=params,
+                            headers=Headers, verify=False)
+
+    if not response:
+        pytest.fail("response of the metrics query API shouldn't be null or empty")
+
+    if response.status_code != 200:
pytest.fail("metrics query API failed with an error code: {}".format(response.status_code)) + + responseJSON = response.json() + if not responseJSON: + pytest.fail("response JSON shouldnt be null or empty") + + namespace = responseJSON['namespace'] + if namespace != constants.NODE_METRICS_NAMESPACE: + pytest.fail("got the namespace: {0} but expected namespace:{1} in the response".format( + namespace, constants.NODE_METRICS_NAMESPACE)) + + responseValues = responseJSON['value'] + if not responseValues: + pytest.fail("response JSON shouldnt be null or empty") + + if len(responseValues) <= 0: + pytest.fail("length of value array in the response should be greater than 0") + + for responseVal in responseValues: + metricName = responseVal['name']['value'] + if metricName != constants.NODE_CPU_USAGE_MILLI_CORES_METRIC_NAME: + pytest.fail("got the metricname: {0} but expected metricname:{1} in the response".format(metricName, constants.NODE_CPU_USAGE_MILLI_CORES_METRIC_NAME)) + timeseries = responseVal['timeseries'] + if not timeseries: + pytest.fail("metric series shouldnt be null or empty for metric:{0} in namespace: {1}".format( + constants.NODE_CPU_USAGE_MILLI_CORES_METRIC_NAME, constants.NODE_METRICS_NAMESPACE)) + if len(timeseries) <= 0: + pytest.fail("length of timeseries should be greater than for 0 for metric: {0} in namespace :{1}".format(constants.NODE_CPU_USAGE_MILLI_CORES_METRIC_NAME, constants.NODE_METRICS_NAMESPACE)) + + # node metric - cpuUsagePercentage + custommetricsUrl = '{0}{1}/providers/microsoft.Insights/metrics?timespan={2}/{3}&interval=FULL&metricnames={4}&aggregation={5}&metricNamespace={6}&validatedimensions=false&api-version={7}'.format( + resourceManager.rstrip("/"), + clusterResourceId, + starttime, + endtime, + constants.NODE_CPU_USAGE_PERCENTAGE_METRIC_NAME, + constants.NODE_METRIC_METRIC_AGGREGATION, + constants.NODE_METRICS_NAMESPACE, + constants.METRICS_API_VERSION) + + response = requests.get(custommetricsUrl, params=params, + headers=Headers, verify=False) + + if not response: + pytest.fail("response of the metrics query API shouldnt be null or empty") + + if response.status_code != 200: + pytest.fail("metrics query API failed with an error code: {}".format(response.status_code)) + + responseJSON = response.json() + if not responseJSON: + pytest.fail("response JSON shouldnt be null or empty") + + namespace = responseJSON['namespace'] + if namespace != constants.NODE_METRICS_NAMESPACE: + pytest.fail("got the namespace: {0} but expected namespace:{1} in the response".format( + namespace, constants.NODE_METRICS_NAMESPACE)) + + responseValues = responseJSON['value'] + if not responseValues: + pytest.fail("response JSON shouldnt be null or empty") + + if len(responseValues) <= 0: + pytest.fail("length of value array in the response should be greater than 0") + + for responseVal in responseValues: + metricName = responseVal['name']['value'] + if metricName != constants.NODE_CPU_USAGE_PERCENTAGE_METRIC_NAME: + pytest.fail("got the metricname: {0} but expected metricname:{1} in the response".format(metricName, constants.NODE_CPU_USAGE_PERCENTAGE_METRIC_NAME)) + timeseries = responseVal['timeseries'] + if not timeseries: + pytest.fail("metric series shouldnt be null or empty for metric:{0} in namespace: {1}".format( + constants.NODE_CPU_USAGE_PERCENTAGE_METRIC_NAME, constants.NODE_METRICS_NAMESPACE)) + if len(timeseries) <= 0: + pytest.fail("length of timeseries should be greater than for 0 for metric: {0} in namespace 
:{1}".format(constants.NODE_CPU_USAGE_PERCENTAGE_METRIC_NAME, constants.NODE_METRICS_NAMESPACE)) + + # node metric - nodesCount + custommetricsUrl = '{0}{1}/providers/microsoft.Insights/metrics?timespan={2}/{3}&interval=FULL&metricnames={4}&aggregation={5}&metricNamespace={6}&validatedimensions=false&api-version={7}'.format( + resourceManager.rstrip("/"), + clusterResourceId, + starttime, + endtime, + constants.NODE_COUNT_METRIC_NAME, + constants.NODE_METRIC_METRIC_AGGREGATION, + constants.NODE_METRICS_NAMESPACE, + constants.METRICS_API_VERSION) + + response = requests.get(custommetricsUrl, params=params, + headers=Headers, verify=False) + + if not response: + pytest.fail("response of the metrics query API shouldnt be null or empty") + + if response.status_code != 200: + pytest.fail("metrics query API failed with an error code: {}".format(response.status_code)) + + responseJSON = response.json() + if not responseJSON: + pytest.fail("response JSON shouldnt be null or empty") + + namespace = responseJSON['namespace'] + if namespace != constants.NODE_METRICS_NAMESPACE: + pytest.fail("got the namespace: {0} but expected namespace:{1} in the response".format( + namespace, constants.NODE_METRICS_NAMESPACE)) + + responseValues = responseJSON['value'] + if not responseValues: + pytest.fail("response JSON shouldnt be null or empty") + + if len(responseValues) <= 0: + pytest.fail("length of value array in the response should be greater than 0") + + for responseVal in responseValues: + metricName = responseVal['name']['value'] + if metricName != constants.NODE_COUNT_METRIC_NAME: + pytest.fail("got the metricname: {0} but expected metricname:{1} in the response".format(metricName, constants.NODE_COUNT_METRIC_NAME)) + timeseries = responseVal['timeseries'] + if not timeseries: + pytest.fail("metric series shouldnt be null or empty for metric:{0} in namespace: {1}".format( + constants.NODE_COUNT_METRIC_NAME, constants.NODE_METRICS_NAMESPACE)) + if len(timeseries) <= 0: + pytest.fail("length of timeseries should be greater than for 0 for metric: {0} in namespace :{1}".format(constants.NODE_COUNT_METRIC_NAME, constants.NODE_METRICS_NAMESPACE)) + + append_result_output("test_node_metrics_e2e_workflow end \n", + env_dict['TEST_AGENT_LOG_FILE']) + print("Successfully completed node metrics e2e workflow test.") diff --git a/test/e2e/src/tests/test_pod_metrics_e2e_workflow.py b/test/e2e/src/tests/test_pod_metrics_e2e_workflow.py new file mode 100755 index 000000000..cd4260f76 --- /dev/null +++ b/test/e2e/src/tests/test_pod_metrics_e2e_workflow.py @@ -0,0 +1,134 @@ +import pytest +import constants +import requests + +from arm_rest_utility import fetch_aad_token +from kubernetes import client, config +from kubernetes_pod_utility import get_pod_list +from results_utility import append_result_output +from datetime import datetime, timedelta + +pytestmark = pytest.mark.agentests + +# validation of pod metrics e2e workflows +def test_pod_metrics_e2e_workflow(env_dict): + print("Starting pod metrics e2e workflows test.") + append_result_output("test_pod_metrics_e2e_workflow start \n", + env_dict['TEST_AGENT_LOG_FILE']) + # Loading in-cluster kube-config + try: + config.load_incluster_config() + except Exception as e: + pytest.fail("Error loading the in-cluster config: " + str(e)) + + # query time interval for metrics queries + metricQueryIntervalInMins = env_dict['DEFAULT_METRICS_QUERY_TIME_INTERVAL_IN_MINUTES'] + if not metricQueryIntervalInMins: + pytest.fail( + "DEFAULT_METRICS_QUERY_TIME_INTERVAL_IN_MINUTES 
should not be null or empty or 0") + + # get the cluster resource id from replicaset pod envvars + api_instance = client.CoreV1Api() + pod_list = get_pod_list(api_instance, constants.AGENT_RESOURCES_NAMESPACE, + constants.AGENT_DEPLOYMENT_PODS_LABEL_SELECTOR) + + if not pod_list: + pytest.fail("pod_list shouldnt be null or empty") + + if len(pod_list.items) <= 0: + pytest.fail("number of items in pod list should be greater than 0") + + envVars = pod_list.items[0].spec.containers[0].env + if not envVars: + pytest.fail( + "environment variables should be defined in the replicaset pod") + + clusterResourceId = '' + for env in envVars: + if env.name == "AKS_RESOURCE_ID": + clusterResourceId = env.value + print("cluster resource id: {}".format(clusterResourceId)) + + if not clusterResourceId: + pytest.fail( + "failed to get clusterResourceId from replicaset pod environment variables") + + # fetch AAD token for metrics queries + tenant_id = env_dict.get('TENANT_ID') + authority_uri = env_dict.get('AZURE_ENDPOINTS').get( + 'activeDirectory') + tenant_id + client_id = env_dict.get('CLIENT_ID') + client_secret = env_dict.get('CLIENT_SECRET') + resourceManager = env_dict.get('AZURE_ENDPOINTS').get('resourceManager') + aad_token = fetch_aad_token( + client_id, client_secret, authority_uri, resourceManager) + if not aad_token: + pytest.fail("failed to fetch AAD token") + + access_token = aad_token.get('accessToken') + if not access_token: + pytest.fail("access_token shouldnt be null or empty") + + # validate metrics e2e workflow + now = datetime.utcnow() + endtime = now.isoformat()[:-3]+'Z' + starttime = (now - timedelta(hours=0, + minutes=constants.DEFAULT_METRICS_QUERY_TIME_INTERVAL_IN_MINUTES)).isoformat()[:-3]+'Z' + Headers = { + "Authorization": str("Bearer " + access_token), + "Content-Type": "application/json", + "content-length": "0" + } + params = {} + # pod metric - PodCount + custommetricsUrl = '{0}{1}/providers/microsoft.Insights/metrics?timespan={2}/{3}&interval=FULL&metricnames={4}&aggregation={5}&metricNamespace={6}&validatedimensions=false&api-version={7}'.format( + resourceManager.rstrip("/"), + clusterResourceId, + starttime, + endtime, + constants.POD_COUNT_METRIC_NAME, + constants.POD_METRIC_METRIC_AGGREGATION, + constants.POD_METRICS_NAMESPACE, + constants.METRICS_API_VERSION) + + response = requests.get(custommetricsUrl, params=params, + headers=Headers, verify=False) + + if not response: + pytest.fail( + "response of the metrics query API shouldnt be null or empty") + + if response.status_code != 200: + pytest.fail("metrics query API failed with an error code: {}".format( + response.status_code)) + + responseJSON = response.json() + if not responseJSON: + pytest.fail("response JSON shouldnt be null or empty") + + namespace = responseJSON['namespace'] + if namespace != constants.POD_METRICS_NAMESPACE: + pytest.fail("got the namespace: {0} but expected namespace:{1} in the response".format( + namespace, constants.POD_METRICS_NAMESPACE)) + + responseValues = responseJSON['value'] + if not responseValues: + pytest.fail("response JSON shouldnt be null or empty") + + if len(responseValues) <= 0: + pytest.fail("length of value array in the response should be greater than 0") + + for responseVal in responseValues: + metricName = responseVal['name']['value'] + if metricName != constants.POD_COUNT_METRIC_NAME: + pytest.fail("got the metricname: {0} but expected metricname:{1} in the response".format(metricName, constants.POD_COUNT_METRIC_NAME)) + timeseries = responseVal['timeseries'] + 
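Every metric check in these tests builds the same Azure Monitor custom-metrics URL inline. A minimal sketch of a helper that factors that construction out (hypothetical, not part of the test files; the argument names mirror the values used above):

```python
# Hypothetical helper: build the custom-metrics query URL used repeatedly above.
def build_custom_metrics_url(resource_manager, cluster_resource_id, starttime, endtime,
                             metric_name, aggregation, metric_namespace, api_version):
    return ('{0}{1}/providers/microsoft.Insights/metrics?timespan={2}/{3}'
            '&interval=FULL&metricnames={4}&aggregation={5}&metricNamespace={6}'
            '&validatedimensions=false&api-version={7}').format(
        resource_manager.rstrip("/"), cluster_resource_id, starttime, endtime,
        metric_name, aggregation, metric_namespace, api_version)
```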
if not timeseries:
+            pytest.fail("metric series shouldn't be null or empty for metric: {0} in namespace: {1}".format(
+                constants.POD_COUNT_METRIC_NAME, constants.POD_METRICS_NAMESPACE))
+        if len(timeseries) <= 0:
+            pytest.fail("length of timeseries should be greater than 0 for metric: {0} in namespace: {1}".format(constants.POD_COUNT_METRIC_NAME, constants.POD_METRICS_NAMESPACE))
+
+    append_result_output("test_pod_metrics_e2e_workflow end \n",
+                         env_dict['TEST_AGENT_LOG_FILE'])
+    print("Successfully completed pod metrics e2e workflow test.")
diff --git a/test/e2e/src/tests/test_resource_status.py b/test/e2e/src/tests/test_resource_status.py
new file mode 100755
index 000000000..bb63dac7c
--- /dev/null
+++ b/test/e2e/src/tests/test_resource_status.py
@@ -0,0 +1,43 @@
+import pytest
+import constants
+
+from kubernetes import client, config
+from results_utility import append_result_output
+from helper import check_kubernetes_deployment_status
+from helper import check_kubernetes_daemonset_status
+from helper import check_kubernetes_pods_status
+
+pytestmark = pytest.mark.agentests
+
+# validate all the critical resources such as ds, rs, ds pods and rs pods etc. are up and running
+def test_resource_status(env_dict):
+    print("Starting resource status check.")
+    append_result_output("test_resource_status start \n",
+                         env_dict['TEST_AGENT_LOG_FILE'])
+    # Loading in-cluster kube-config
+    try:
+        config.load_incluster_config()
+        #config.load_kube_config()
+    except Exception as e:
+        pytest.fail("Error loading the in-cluster config: " + str(e))
+
+    # checking the deployment status
+    check_kubernetes_deployment_status(
+        constants.AGENT_RESOURCES_NAMESPACE, constants.AGENT_DEPLOYMENT_NAME, env_dict['TEST_AGENT_LOG_FILE'])
+
+    # checking the daemonset status
+    check_kubernetes_daemonset_status(
+        constants.AGENT_RESOURCES_NAMESPACE, constants.AGENT_DAEMONSET_NAME, env_dict['TEST_AGENT_LOG_FILE'])
+
+    expectedPodRestartCount = env_dict['AGENT_POD_EXPECTED_RESTART_COUNT']
+    # checking deployment pod status
+    check_kubernetes_pods_status(constants.AGENT_RESOURCES_NAMESPACE,
+                                 constants.AGENT_DEPLOYMENT_PODS_LABEL_SELECTOR, expectedPodRestartCount, env_dict['TEST_AGENT_LOG_FILE'])
+
+    # checking daemonset pod status
+    check_kubernetes_pods_status(constants.AGENT_RESOURCES_NAMESPACE,
+                                 constants.AGENT_DAEMON_SET_PODS_LABEL_SELECTOR, expectedPodRestartCount, env_dict['TEST_AGENT_LOG_FILE'])
+
+    append_result_output("test_resource_status end \n",
+                         env_dict['TEST_AGENT_LOG_FILE'])
+    print("Successfully completed resource status check.")
diff --git a/test/e2e/src/tests/test_rs_workflows.py b/test/e2e/src/tests/test_rs_workflows.py
new file mode 100755
index 000000000..aef422171
--- /dev/null
+++ b/test/e2e/src/tests/test_rs_workflows.py
@@ -0,0 +1,93 @@
+import pytest
+import constants
+
+from kubernetes import client, config
+from kubernetes_pod_utility import get_pod_list, get_log_file_content
+from results_utility import append_result_output
+from helper import check_kubernetes_deployment_status
+from helper import check_kubernetes_daemonset_status
+from helper import check_kubernetes_pods_status
+from kubernetes.stream import stream
+
+pytestmark = pytest.mark.agentests
+
+# validation of replicaset agent workflows
+def test_rs_workflows(env_dict):
+    print("Starting replicaset agent workflows test.")
+    append_result_output("test_rs_workflows start \n",
+                         env_dict['TEST_AGENT_LOG_FILE'])
+    # Loading in-cluster kube-config
+    try:
+        config.load_incluster_config()
+    except Exception as e:
+        pytest.fail("Error loading
the in-cluster config: " + str(e)) + + print("getting pod list") + api_instance = client.CoreV1Api() + pod_list = get_pod_list(api_instance, constants.AGENT_RESOURCES_NAMESPACE, + constants.AGENT_DEPLOYMENT_PODS_LABEL_SELECTOR) + if not pod_list: + pytest.fail("pod_list shouldnt be null or empty") + + if len(pod_list.items) <= 0: + pytest.fail("number of items in pod list should be greater than 0") + + rspodName = pod_list.items[0].metadata.name + if not rspodName: + pytest.fail("replicaset pod name should not be null or empty") + + logcontent = get_log_file_content( + api_instance, constants.AGENT_RESOURCES_NAMESPACE, rspodName, constants.AGENT_OMSAGENT_LOG_PATH) + if not logcontent: + pytest.fail("logcontent should not be null or empty for rs pod: {}".format(rspodName)) + loglines = logcontent.split("\n") + if len(loglines) <= 0: + pytest.fail("number of log lines should be greater than 0") + + IsKubePodInventorySuccessful = False + IsKubeNodeInventorySuccessful = False + IsKubeDeploymentInventorySuccessful = False + IsKubeContainerPerfInventorySuccessful = False + IsKubeServicesInventorySuccessful = False + IsContainerNodeInventorySuccessful = False + IsKubeEventsSuccessful = False + for line in loglines: + if line.find(constants.KUBE_POD_INVENTORY_EMIT_STREAM) >= 0: + IsKubePodInventorySuccessful = True + if line.find(constants.KUBE_NODE_INVENTORY_EMIT_STREAM) >= 0: + IsKubeNodeInventorySuccessful = True + if line.find(constants.KUBE_DEPLOYMENT_INVENTORY_EMIT_STREAM) >= 0: + IsKubeDeploymentInventorySuccessful = True + if line.find(constants.KUBE_CONTAINER_PERF_EMIT_STREAM) >= 0: + IsKubeContainerPerfInventorySuccessful = True + if line.find(constants.KUBE_SERVICES_EMIT_STREAM) >= 0: + IsKubeServicesInventorySuccessful = True + if line.find(constants.KUBE_CONTAINER_NODE_INVENTORY_EMIT_STREAM) >= 0: + IsContainerNodeInventorySuccessful = True + if line.find(constants.KUBE_EVENTS_EMIT_STREAM) >= 0: + IsKubeEventsSuccessful = True + + if IsKubePodInventorySuccessful == False: + pytest.fail("KubePodInventory stream not emitted successfully from pod:" + rspodName) + + if IsKubeNodeInventorySuccessful == False: + pytest.fail("KubeNodeInventory stream not emitted successfully from pod:" + rspodName) + + if IsKubeDeploymentInventorySuccessful == False: + pytest.fail("KubeDeploymentInventory stream not emitted successfully from pod:" + rspodName) + + if IsKubeContainerPerfInventorySuccessful == False: + pytest.fail("KubeContainerPerfInventory stream not emitted successfully from pod:" + rspodName) + + if IsKubeServicesInventorySuccessful == False: + pytest.fail("KubeServicesInventory stream not emitted successfully from pod:" + rspodName) + + if IsContainerNodeInventorySuccessful == False: + pytest.fail("ContainerNodeInventory stream not emitted successfully from pod:" + rspodName) + + if IsKubeEventsSuccessful == False: + pytest.fail("KubeEventsInventory stream not emitted successfully from rs pod:" + rspodName) + + append_result_output("test_rs_workflows end \n", + env_dict['TEST_AGENT_LOG_FILE']) + print("Successfully completed replicaset workflows test.") From 91f954f07ee7552673915570f53987457b7dcfc4 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Thu, 25 Feb 2021 09:08:28 -0800 Subject: [PATCH 075/194] scrape new kubelet pod count metric name (#508) --- build/linux/installer/conf/telegraf.conf | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/build/linux/installer/conf/telegraf.conf b/build/linux/installer/conf/telegraf.conf index 202ac9741..5a5bb2d8c 100644 --- 
a/build/linux/installer/conf/telegraf.conf +++ b/build/linux/installer/conf/telegraf.conf @@ -675,7 +675,9 @@ ## An array of urls to scrape metrics from. urls = ["$CADVISOR_METRICS_URL"] - fieldpass = ["kubelet_running_pod_count","volume_manager_total_volumes", "kubelet_node_config_error", "process_resident_memory_bytes", "process_cpu_seconds_total"] + # <= 1.18: metric name is kubelet_running_pod_count + # >= 1.19: metric name changed to kubelet_running_pods + fieldpass = ["kubelet_running_pod_count","kubelet_running_pods","volume_manager_total_volumes", "kubelet_node_config_error", "process_resident_memory_bytes", "process_cpu_seconds_total"] metric_version = 2 url_tag = "scrapeUrl" From 4a8ff2328210ace141834b0cacb616dfcee801e7 Mon Sep 17 00:00:00 2001 From: Nicolas Yuen Date: Sun, 21 Mar 2021 03:45:59 +0800 Subject: [PATCH 076/194] Adding explicit json output to az commands as the script fails if az is configured with Table output #409 (#513) --- scripts/onboarding/managed/disable-monitoring.sh | 2 +- scripts/onboarding/managed/enable-monitoring.sh | 10 +++++----- scripts/onboarding/managed/upgrade-monitoring.sh | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/scripts/onboarding/managed/disable-monitoring.sh b/scripts/onboarding/managed/disable-monitoring.sh index d43a79f51..29b755331 100644 --- a/scripts/onboarding/managed/disable-monitoring.sh +++ b/scripts/onboarding/managed/disable-monitoring.sh @@ -127,7 +127,7 @@ remove_monitoring_tags() # validate cluster identity for Azure Arc enabled Kubernetes cluster if [ "$isArcK8sCluster" = true ] ; then - identitytype=$(az resource show -g ${clusterResourceGroup} -n ${clusterName} --resource-type $resourceProvider --query identity.type) + identitytype=$(az resource show -g ${clusterResourceGroup} -n ${clusterName} --resource-type $resourceProvider --query identity.type -o json) identitytype=$(echo $identitytype | tr "[:upper:]" "[:lower:]" | tr -d '"') echo "cluster identity type:" $identitytype if [[ "$identitytype" != "systemassigned" ]]; then diff --git a/scripts/onboarding/managed/enable-monitoring.sh b/scripts/onboarding/managed/enable-monitoring.sh index 9d0c0aca5..1162ba0d3 100644 --- a/scripts/onboarding/managed/enable-monitoring.sh +++ b/scripts/onboarding/managed/enable-monitoring.sh @@ -339,7 +339,7 @@ validate_cluster_identity() { local rgName="$(echo ${1})" local clusterName="$(echo ${2})" - local identitytype=$(az resource show -g ${rgName} -n ${clusterName} --resource-type $resourceProvider --query identity.type) + local identitytype=$(az resource show -g ${rgName} -n ${clusterName} --resource-type $resourceProvider --query identity.type -o json) identitytype=$(echo $identitytype | tr "[:upper:]" "[:lower:]" | tr -d '"') echo "cluster identity type:" $identitytype @@ -454,7 +454,7 @@ create_default_log_analytics_workspace() { echo "using existing default workspace:"$workspaceName fi - workspaceResourceId=$(az resource show -g $workspaceResourceGroup -n $workspaceName --resource-type $workspaceResourceProvider --query id) + workspaceResourceId=$(az resource show -g $workspaceResourceGroup -n $workspaceName --resource-type $workspaceResourceProvider --query id -o json) workspaceResourceId=$(echo $workspaceResourceId | tr -d '"') echo "workspace resource Id: ${workspaceResourceId}" } @@ -477,12 +477,12 @@ get_workspace_guid_and_key() { local wsName="$(echo ${resourceId} | cut -d'/' -f9)" # get the workspace guid - workspaceGuid=$(az resource show -g $rgName -n $wsName --resource-type 
$workspaceResourceProvider --query properties.customerId) + workspaceGuid=$(az resource show -g $rgName -n $wsName --resource-type $workspaceResourceProvider --query properties.customerId -o json) workspaceGuid=$(echo $workspaceGuid | tr -d '"') echo "workspaceGuid:"$workspaceGuid echo "getting workspace primaryshared key" - workspaceKey=$(az rest --method post --uri $workspaceResourceId/sharedKeys?api-version=2015-11-01-preview --query primarySharedKey) + workspaceKey=$(az rest --method post --uri $workspaceResourceId/sharedKeys?api-version=2015-11-01-preview --query primarySharedKey -o json) workspaceKey=$(echo $workspaceKey | tr -d '"') } @@ -621,7 +621,7 @@ else set_azure_subscription $workspaceSubscriptionId fi - workspaceRegion=$(az resource show --ids ${workspaceResourceId} --query location) + workspaceRegion=$(az resource show --ids ${workspaceResourceId} --query location -o json) workspaceRegion=$(echo $workspaceRegion | tr -d '"') echo "Workspace Region:"$workspaceRegion fi diff --git a/scripts/onboarding/managed/upgrade-monitoring.sh b/scripts/onboarding/managed/upgrade-monitoring.sh index 6d14dfa5f..e54822f74 100644 --- a/scripts/onboarding/managed/upgrade-monitoring.sh +++ b/scripts/onboarding/managed/upgrade-monitoring.sh @@ -202,7 +202,7 @@ validate_cluster_identity() { local rgName="$(echo ${1})" local clusterName="$(echo ${2})" - local identitytype=$(az resource show -g ${rgName} -n ${clusterName} --resource-type $resourceProvider --query identity.type) + local identitytype=$(az resource show -g ${rgName} -n ${clusterName} --resource-type $resourceProvider --query identity.type -o json) identitytype=$(echo $identitytype | tr "[:upper:]" "[:lower:]" | tr -d '"') echo "cluster identity type:" $identitytype @@ -216,7 +216,7 @@ validate_cluster_identity() { validate_monitoring_tags() { echo "get loganalyticsworkspaceResourceId tag on to cluster resource" - logAnalyticsWorkspaceResourceIdTag=$(az resource show --query tags.logAnalyticsWorkspaceResourceId -g $clusterResourceGroup -n $clusterName --resource-type $resourceProvider) + logAnalyticsWorkspaceResourceIdTag=$(az resource show --query tags.logAnalyticsWorkspaceResourceId -g $clusterResourceGroup -n $clusterName --resource-type $resourceProvider -o json) echo "configured log analytics workspace: ${logAnalyticsWorkspaceResourceIdTag}" echo "successfully got logAnalyticsWorkspaceResourceId tag on the cluster resource" if [ -z "$logAnalyticsWorkspaceResourceIdTag" ]; then From 512e5c0df258d67ba1c15c49d3650529a61ec9aa Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 22 Mar 2021 11:08:49 -0700 Subject: [PATCH 077/194] Gangams/arc proxy contract and token renewal updates (#511) * fix issue with crd status updates * handle renewal token delays * add proxy contract * updates for proxy cert for linux * remove proxycert related changes * fix whitespace issue * fix whitespace issue * remove proxy in arm template --- .../templates/omsagent-deployment.yaml | 2 +- .../templates/omsagent-rbac.yaml | 2 +- .../templates/omsagent-secret.yaml | 14 +++++- charts/azuremonitor-containers/values.yaml | 6 +++ .../existingClusterOnboarding.json | 12 +---- .../existingClusterParam.json | 3 -- .../plugins/ruby/arc_k8s_cluster_identity.rb | 45 ++++++++++++++----- 7 files changed, 56 insertions(+), 28 deletions(-) diff --git a/charts/azuremonitor-containers/templates/omsagent-deployment.yaml b/charts/azuremonitor-containers/templates/omsagent-deployment.yaml index 012dd2720..37b8faacc 100644 --- 
a/charts/azuremonitor-containers/templates/omsagent-deployment.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-deployment.yaml @@ -157,5 +157,5 @@ spec: - name: omsagent-adx-secret secret: secretName: omsagent-adx-secret - optional: true + optional: true {{- end }} diff --git a/charts/azuremonitor-containers/templates/omsagent-rbac.yaml b/charts/azuremonitor-containers/templates/omsagent-rbac.yaml index 5db5c2dab..c0a6e3722 100644 --- a/charts/azuremonitor-containers/templates/omsagent-rbac.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-rbac.yaml @@ -28,7 +28,7 @@ rules: resources: ["healthstates"] verbs: ["get", "create", "patch"] - apiGroups: ["clusterconfig.azure.com"] - resources: ["azureclusteridentityrequests"] + resources: ["azureclusteridentityrequests", "azureclusteridentityrequests/status"] resourceNames: ["container-insights-clusteridentityrequest"] verbs: ["get", "create", "patch"] - nonResourceURLs: ["/metrics"] diff --git a/charts/azuremonitor-containers/templates/omsagent-secret.yaml b/charts/azuremonitor-containers/templates/omsagent-secret.yaml index 1a7f087ed..8c245338c 100644 --- a/charts/azuremonitor-containers/templates/omsagent-secret.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-secret.yaml @@ -13,7 +13,19 @@ data: WSID: {{ required "A valid workspace id is required!" .Values.omsagent.secret.wsid | b64enc | quote }} KEY: {{ required "A valid workspace key is required!" .Values.omsagent.secret.key | b64enc | quote }} DOMAIN: {{ .Values.omsagent.domain | b64enc | quote }} - {{- if ne .Values.omsagent.proxy "" }} + {{- $httpsProxyDict := urlParse .Values.Azure.proxySettings.httpsProxy -}} + {{- $httpProxyDict := urlParse .Values.Azure.proxySettings.httpProxy -}} + {{- if and (and (.Values.Azure.proxySettings.isProxyEnabled) (.Values.Azure.proxySettings.httpsProxy)) ($httpsProxyDict.userinfo) }} + PROXY: {{ .Values.Azure.proxySettings.httpsProxy | b64enc | quote }} + {{- else if and (and (.Values.Azure.proxySettings.isProxyEnabled) (.Values.Azure.proxySettings.httpsProxy)) (empty $httpsProxyDict.userinfo) }} + # adding arbitrary creds since omsagent expects arbitrary creds in case of no auth + PROXY: {{ urlJoin (dict "scheme" $httpsProxyDict.scheme "userinfo" "admin:secret" "host" $httpsProxyDict.host) | b64enc | quote }} + {{- else if and (and (.Values.Azure.proxySettings.isProxyEnabled) (.Values.Azure.proxySettings.httpProxy)) ($httpProxyDict.userinfo) }} + PROXY: {{ .Values.Azure.proxySettings.httpProxy | b64enc | quote }} + {{- else if and (and (.Values.Azure.proxySettings.isProxyEnabled) (.Values.Azure.proxySettings.httpProxy)) (empty $httpProxyDict.userinfo) }} + # adding arbitrary creds since omsagent expects arbitrary creds in case of no auth + PROXY: {{ urlJoin (dict "scheme" $httpProxyDict.scheme "userinfo" "admin:secret" "host" $httpProxyDict.host) | b64enc | quote }} + {{- else if ne .Values.omsagent.proxy "" }} PROXY: {{ .Values.omsagent.proxy | b64enc | quote }} {{- end }} {{- end }} diff --git a/charts/azuremonitor-containers/values.yaml b/charts/azuremonitor-containers/values.yaml index 5831c9889..caf0217c3 100644 --- a/charts/azuremonitor-containers/values.yaml +++ b/charts/azuremonitor-containers/values.yaml @@ -12,6 +12,12 @@ Azure: Extension: Name: "" ResourceId: "" + proxySettings: + isProxyEnabled: false + httpProxy: "" + httpsProxy: "" + noProxy: "" + proxyCert: "" omsagent: image: repo: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod" diff --git 
a/scripts/onboarding/templates/arc-k8s-extension/existingClusterOnboarding.json b/scripts/onboarding/templates/arc-k8s-extension/existingClusterOnboarding.json index 8ebef232a..95e7ba5d0 100644 --- a/scripts/onboarding/templates/arc-k8s-extension/existingClusterOnboarding.json +++ b/scripts/onboarding/templates/arc-k8s-extension/existingClusterOnboarding.json @@ -13,14 +13,7 @@ "metadata": { "description": "Location of the Azure Arc Connected Cluster Resource e.g. \"eastus\"" } - }, - "proxyEndpointUrl": { - "type": "string", - "defaultValue": "", - "metadata": { - "description": "If the cluster behind forward proxy, then specify Proxy Endpoint URL in this format: http(s)://:@:" - } - }, + }, "workspaceResourceId": { "type": "string", "metadata": { @@ -114,8 +107,7 @@ }, "configurationProtectedSettings": { "omsagent.secret.wsid": "[reference(parameters('workspaceResourceId'), '2015-03-20').customerId]", - "omsagent.secret.key": "[listKeys(parameters('workspaceResourceId'), '2015-03-20').primarySharedKey]" , - "omsagent.proxy": "[if(equals(parameters('proxyEndpointUrl'), ''), '', parameters('proxyEndpointUrl'))]" + "omsagent.secret.key": "[listKeys(parameters('workspaceResourceId'), '2015-03-20').primarySharedKey]" }, "autoUpgradeMinorVersion": true, "releaseTrain": "Stable", diff --git a/scripts/onboarding/templates/arc-k8s-extension/existingClusterParam.json b/scripts/onboarding/templates/arc-k8s-extension/existingClusterParam.json index b74b5ac95..6829d3d05 100644 --- a/scripts/onboarding/templates/arc-k8s-extension/existingClusterParam.json +++ b/scripts/onboarding/templates/arc-k8s-extension/existingClusterParam.json @@ -8,9 +8,6 @@ "clusterRegion": { "value": "" }, - "proxyEndpointUrl": { - "value": "" - }, "workspaceResourceId": { "value": "/subscriptions//resourcegroups//providers/microsoft.operationalinsights/workspaces/" }, diff --git a/source/plugins/ruby/arc_k8s_cluster_identity.rb b/source/plugins/ruby/arc_k8s_cluster_identity.rb index 7824f3d4e..552dafb1f 100644 --- a/source/plugins/ruby/arc_k8s_cluster_identity.rb +++ b/source/plugins/ruby/arc_k8s_cluster_identity.rb @@ -26,6 +26,7 @@ def initialize @log.info "initialize start @ #{Time.now.utc.iso8601}" @token_expiry_time = Time.now @cached_access_token = String.new + @isLastTokenRenewalUpdatePending = false @token_file_path = "/var/run/secrets/kubernetes.io/serviceaccount/token" @cert_file_path = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" @kube_api_server_url = KubernetesApiClient.getKubeAPIServerUrl @@ -41,14 +42,20 @@ def initialize def get_cluster_identity_token() begin - # get the cluster msi identity token either if its empty or near expirty. Token is valid 24 hrs. + # get the cluster msi identity token either if its empty or near expiry. Token is valid 24 hrs. if @cached_access_token.to_s.empty? || (Time.now + 60 * 60 > @token_expiry_time) # Refresh token 1 hr from expiration # renew the token if its near expiry if !@cached_access_token.to_s.empty? 
&& (Time.now + 60 * 60 > @token_expiry_time) - @log.info "renewing the token since its near expiry @ #{Time.now.utc.iso8601}" - renew_near_expiry_token - # sleep 60 seconds to get the renewed token available - sleep 60 + if !@isLastTokenRenewalUpdatePending + @log.info "token expiry - @ #{@token_expiry_time}" + @log.info "renewing the token since token has near expiry @ #{Time.now.utc.iso8601}" + renew_near_expiry_token + # sleep 60 seconds to get the renewed token available + sleep 60 + @isLastTokenRenewalUpdatePending = true + else + @log.warn "last token renewal update still pending @ #{Time.now.utc.iso8601}" + end end @log.info "get token reference from crd @ #{Time.now.utc.iso8601}" tokenReference = get_token_reference_from_crd @@ -61,6 +68,7 @@ def get_cluster_identity_token() token = get_token_from_secret(token_secret_name, token_secret_data_name) if !token.nil? @cached_access_token = token + @isLastTokenRenewalUpdatePending = false else @log.warn "got token nil from secret: #{@token_secret_name}" end @@ -123,7 +131,17 @@ def get_token_reference_from_crd() tokenReference["expirationTime"] = status["expirationTime"] tokenReference["secretName"] = status["tokenReference"]["secretName"] tokenReference["dataName"] = status["tokenReference"]["dataName"] - end + elsif get_response.code.to_i == 404 # this might happen if the crd resource deleted by user accidently + @log.info "since crd resource doesnt exist hence creating crd resource : #{@@cluster_identity_resource_name} @ #{Time.now.utc.iso8601}" + crd_request_body = get_crd_request_body + crd_request_body_json = crd_request_body.to_json + create_request = Net::HTTP::Post.new(crd_request_uri) + create_request["Content-Type"] = "application/json" + create_request["Authorization"] = "Bearer #{@service_account_token}" + create_request.body = crd_request_body_json + create_response = @http_client.request(create_request) + @log.info "Got response of #{create_response.code} for POST #{crd_request_uri} @ #{Time.now.utc.iso8601}" + end rescue => err @log.warn "get_token_reference_from_crd call failed: #{err}" ApplicationInsightsUtility.sendExceptionTelemetry(err, { "FeatureArea" => "MDM" }) @@ -141,20 +159,23 @@ def renew_near_expiry_token() cluster_identity_resource_namespace: @@cluster_identity_resource_namespace, cluster_identity_resource_name: @@cluster_identity_resource_name, } - crd_request_body = get_crd_request_body - crd_request_body_json = crd_request_body.to_json - update_request = Net::HTTP::Patch.new(crd_request_uri) + update_crd_request_body = { 'status': {'expirationTime': ''} } + update_crd_request_body_json = update_crd_request_body.to_json + update_crd_request_uri = crd_request_uri + "/status" + update_request = Net::HTTP::Patch.new(update_crd_request_uri) update_request["Content-Type"] = "application/merge-patch+json" update_request["Authorization"] = "Bearer #{@service_account_token}" - update_request.body = crd_request_body_json + update_request.body = update_crd_request_body_json update_response = @http_client.request(update_request) - @log.info "Got response of #{update_response.code} for PATCH #{crd_request_uri} @ #{Time.now.utc.iso8601}" + @log.info "Got response of #{update_response.code} for PATCH #{update_crd_request_uri} @ #{Time.now.utc.iso8601}" if update_response.code.to_i == 404 @log.info "since crd resource doesnt exist hence creating crd resource : #{@@cluster_identity_resource_name} @ #{Time.now.utc.iso8601}" create_request = Net::HTTP::Post.new(crd_request_uri) create_request["Content-Type"] = 
"application/json" create_request["Authorization"] = "Bearer #{@service_account_token}" - create_request.body = crd_request_body_json + create_crd_request_body = get_crd_request_body + create_crd_request_body_json = create_crd_request_body.to_json + create_request.body = create_crd_request_body_json create_response = @http_client.request(create_request) @log.info "Got response of #{create_response.code} for POST #{crd_request_uri} @ #{Time.now.utc.iso8601}" end From 6b48b6a846184070cdd26cc733b650be4f4fb5ae Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 22 Mar 2021 16:06:47 -0700 Subject: [PATCH 078/194] doc updates for microsoft charts repo release (#512) * doc updates for microsoft charts repo release * wip --- ReleaseProcess.md | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/ReleaseProcess.md b/ReleaseProcess.md index 2a3e6001a..c6f51bb65 100644 --- a/ReleaseProcess.md +++ b/ReleaseProcess.md @@ -43,13 +43,47 @@ This needs to be co-ordinated with Red hat and ARO-RP team for the release and Make PR against [AKS-Engine](https://github.com/Azure/aks-engine). Refer PR https://github.com/Azure/aks-engine/pull/2318 -## ARO v4, On-prem K8s, Azure Arc K8s and OpenShift v4 clusters +## ARO v4, Azure Arc K8s and OpenShift v4 clusters Make sure azuremonitor-containers chart yamls updates with all changes going with the release and also make sure to bump the chart version, imagetag and docker provider version etc. Similar to agent container image, build pipeline automatically push the chart to container insights prod acr for canary and prod repos accordingly. Both the agent and helm chart will be replicated to `mcr.microsoft.com`. The way, customers will be onboard the monitoring to these clusters using onboarding scripts under `onboarding\managed` directory so please bump chart version for prod release. Once we move to Arc K8s Monitoring extension Public preview, these will be taken care so at that point of time no manual changes like this required. +## Microsoft Charts Repo release for On-prem K8s + +Since HELM charts repo being deprecated, Microsoft charts repo being used for HELM chart release of on-prem K8s clusters. +To make chart release PR, fork [Microsoft-charts-repo]([https://github.com/microsoft/charts/tree/gh-pages) and make the PR against `gh-pages` branch of the upstream repo. + +Refer PR - https://github.com/microsoft/charts/pull/23 for example. +Once the PR merged, latest version of HELM chart should be available in couple of mins in https://microsoft.github.io/charts/repo and https://artifacthub.io/. + +Instructions to create PR +``` +# 1. create helm package for the release candidate + git clone git@github.com:microsoft/Docker-Provider.git + git checkout ci_prod + cd ~/Docker-Provider/charts/azuremonitor-containers # this path based on where you have cloned the repo + helm package . + +# 2. clone your fork repo and checkout gh_pages branch # gh_pages branch used as release branch + cd ~ + git clone + cd ~/charts # assumed the root dir of the clone is charts + git checkout gh_pages + +# 3. copy release candidate helm package + cd ~/charts/repo/azuremonitor-containers + # update chart version value with the version of chart being released + cp ~/Docker-Provider/charts/azuremonitor-containers/azuremonitor-containers-.tgz . + cd ~/charts/repo + # update repo index file + helm repo index . + +# 4. Review the changes and make PR. 
Please note, you may need to revert unrelated changes automatically added by `helm repo index .` command + +``` + # 4. Monitor agent roll-out status In Container Insights Agent (AKS) telemetry dashboard, update the agent roll status by region chart with released agent image and track rollout status. If you see any issues with agent rollout, reach out AKS on-call team for the help on investigation and understanding whats going on. From d93c680db71c7a562054faaa0854addb53c023a2 Mon Sep 17 00:00:00 2001 From: seenu433 Date: Mon, 22 Mar 2021 20:23:11 -0400 Subject: [PATCH 079/194] Update enable-monitoring.sh (#514) Line 314 and 343 seems to have trailing spaces for some subscriptions which is exiting the script even for valid scenarios Co-authored-by: Ganga Mahesh Siddem --- scripts/onboarding/managed/enable-monitoring.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/onboarding/managed/enable-monitoring.sh b/scripts/onboarding/managed/enable-monitoring.sh index 1162ba0d3..a9560b5c5 100644 --- a/scripts/onboarding/managed/enable-monitoring.sh +++ b/scripts/onboarding/managed/enable-monitoring.sh @@ -311,7 +311,7 @@ parse_args() { validate_and_configure_supported_cloud() { echo "get active azure cloud name configured to azure cli" - azureCloudName=$(az cloud show --query name -o tsv | tr "[:upper:]" "[:lower:]") + azureCloudName=$(az cloud show --query name -o tsv | tr "[:upper:]" "[:lower:]" | tr -d "[:space:]") echo "active azure cloud name configured to azure cli: ${azureCloudName}" if [ "$isArcK8sCluster" = true ]; then if [ "$azureCloudName" != "azurecloud" -a "$azureCloudName" != "azureusgovernment" ]; then @@ -340,7 +340,7 @@ validate_cluster_identity() { local clusterName="$(echo ${2})" local identitytype=$(az resource show -g ${rgName} -n ${clusterName} --resource-type $resourceProvider --query identity.type -o json) - identitytype=$(echo $identitytype | tr "[:upper:]" "[:lower:]" | tr -d '"') + identitytype=$(echo $identitytype | tr "[:upper:]" "[:lower:]" | tr -d '"' | tr -d "[:space:]") echo "cluster identity type:" $identitytype if [[ "$identitytype" != "systemassigned" ]]; then From 4d386ce2b8c2b6acd56156150a623b8aabd1878c Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Thu, 25 Mar 2021 16:01:54 -0700 Subject: [PATCH 080/194] Prometheus scraping from sidecar and OSM changes (#515) --- README.md | 1 + .../scripts/tomlparser-prom-customconfig.rb | 423 ++++++++++++++++++ .../installer/conf/prometheus-side-car.conf | 4 + .../conf/td-agent-bit-prom-side-car.conf | 28 ++ .../conf/telegraf-prom-side-car.conf | 162 +++++++ build/linux/installer/conf/telegraf-rs.conf | 23 +- .../installer/datafiles/base_container.data | 28 +- .../linux/installer/scripts/livenessprobe.sh | 27 +- .../scripts/tomlparser-osm-config.rb | 168 +++++++ .../scripts/tomlparser-prom-customconfig.rb | 267 ----------- build/windows/installer/conf/fluent-bit.conf | 9 + build/windows/installer/conf/telegraf.conf | 162 +++++++ .../templates/omsagent-daemonset-windows.yaml | 6 + .../templates/omsagent-deployment.yaml | 13 +- kubernetes/container-azm-ms-agentconfig.yaml | 11 + kubernetes/container-azm-ms-osmconfig.yaml | 17 + kubernetes/linux/Dockerfile | 2 +- kubernetes/linux/defaultpromenvvariables-rs | 19 +- .../linux/defaultpromenvvariables-sidecar | 9 + kubernetes/linux/main.sh | 209 ++++++--- kubernetes/linux/setup.sh | 8 +- kubernetes/omsagent.yaml | 72 +++ kubernetes/windows/Dockerfile | 4 + kubernetes/windows/main.ps1 | 113 +++-- .../setdefaulttelegrafenvvariables.ps1 | 17 + 
kubernetes/windows/setup.ps1 | 16 + .../windows/install-build-pre-requisites.ps1 | 6 +- source/plugins/go/src/oms.go | 2 +- source/plugins/go/src/telemetry.go | 157 +++++-- source/plugins/ruby/in_kube_nodes.rb | 6 + 30 files changed, 1548 insertions(+), 441 deletions(-) create mode 100644 build/common/installer/scripts/tomlparser-prom-customconfig.rb create mode 100644 build/linux/installer/conf/prometheus-side-car.conf create mode 100644 build/linux/installer/conf/td-agent-bit-prom-side-car.conf create mode 100644 build/linux/installer/conf/telegraf-prom-side-car.conf create mode 100644 build/linux/installer/scripts/tomlparser-osm-config.rb delete mode 100644 build/linux/installer/scripts/tomlparser-prom-customconfig.rb create mode 100644 build/windows/installer/conf/telegraf.conf create mode 100644 kubernetes/container-azm-ms-osmconfig.yaml create mode 100644 kubernetes/linux/defaultpromenvvariables-sidecar create mode 100644 kubernetes/windows/setdefaulttelegrafenvvariables.ps1 diff --git a/README.md b/README.md index 3564345ee..555234c61 100644 --- a/README.md +++ b/README.md @@ -67,6 +67,7 @@ The general directory structure is: │ │ ├── acrworkflows/ - acr work flows for the Linux Agent container image │ │ ├── defaultpromenvvariables - default environment variables for Prometheus scraping │ │ ├── defaultpromenvvariables-rs - cluster level default environment variables for Prometheus scraping +│ │ ├── defaultpromenvvariables-sidecar - cluster level default environment variables for Prometheus scraping in sidecar │ ├── windows/ - scripts to build the Docker image for Windows Agent │ │ ├── dockerbuild - script to build the code and docker imag, and publish docker image │ │ ├── acrworkflows/ - acr work flows for the Windows Agent container image diff --git a/build/common/installer/scripts/tomlparser-prom-customconfig.rb b/build/common/installer/scripts/tomlparser-prom-customconfig.rb new file mode 100644 index 000000000..819c1956f --- /dev/null +++ b/build/common/installer/scripts/tomlparser-prom-customconfig.rb @@ -0,0 +1,423 @@ +#!/usr/local/bin/ruby + +#this should be require relative in Linux and require in windows, since it is a gem install on windows +@os_type = ENV["OS_TYPE"] +if !@os_type.nil? && !@os_type.empty? 
&& @os_type.strip.casecmp("windows") == 0 + require "tomlrb" +else + require_relative "tomlrb" +end +# require_relative "tomlrb" +require_relative "ConfigParseErrorLogger" +require "fileutils" + +@promConfigMapMountPath = "/etc/config/settings/prometheus-data-collection-settings" +@replicaset = "replicaset" +@daemonset = "daemonset" +@promSideCar = "prometheussidecar" +@windows = "windows" +@configSchemaVersion = "" +@defaultDsInterval = "1m" +@defaultDsPromUrls = [] +@defaultDsFieldPass = [] +@defaultDsFieldDrop = [] +@defaultRsInterval = "1m" +@defaultRsPromUrls = [] +@defaultRsFieldPass = [] +@defaultRsFieldDrop = [] +@defaultRsK8sServices = [] +# @defaultRsMonitorPods = false +@defaultCustomPrometheusInterval = "1m" +@defaultCustomPrometheusFieldPass = [] +@defaultCustomPrometheusFieldDrop = [] +@defaultCustomPrometheusMonitorPods = false +@defaultCustomPrometheusLabelSelectors = "" +@defaultCustomPrometheusFieldSelectors = "" + +#Configurations to be used for the auto-generated input prometheus plugins for namespace filtering +@metricVersion = 2 +@monitorKubernetesPodsVersion = 2 +@urlTag = "scrapeUrl" +@bearerToken = "/var/run/secrets/kubernetes.io/serviceaccount/token" +@responseTimeout = "15s" +@tlsCa = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" +@insecureSkipVerify = true + +# Checking to see if this is the daemonset or replicaset to parse config accordingly +@controller = ENV["CONTROLLER_TYPE"] +@containerType = ENV["CONTAINER_TYPE"] +@sidecarScrapingEnabled = ENV["SIDECAR_SCRAPING_ENABLED"] + +# Use parser to parse the configmap toml file to a ruby structure +def parseConfigMap + begin + # Check to see if config map is created + if (File.file?(@promConfigMapMountPath)) + puts "config::configmap container-azm-ms-agentconfig for settings mounted, parsing values for prometheus config map" + parsedConfig = Tomlrb.load_file(@promConfigMapMountPath, symbolize_keys: true) + puts "config::Successfully parsed mounted prometheus config map" + return parsedConfig + else + puts "config::configmap container-azm-ms-agentconfig for settings not mounted, using defaults for prometheus scraping" + return nil + end + rescue => errorStr + ConfigParseErrorLogger.logError("Exception while parsing config map for prometheus config: #{errorStr}, using defaults, please check config map for errors") + return nil + end +end + +def checkForTypeArray(arrayValue, arrayType) + if (arrayValue.nil? || (arrayValue.kind_of?(Array) && ((arrayValue.length == 0) || (arrayValue.length > 0 && arrayValue[0].kind_of?(arrayType))))) + return true + else + return false + end +end + +def checkForType(variable, varType) + if variable.nil? || variable.kind_of?(varType) + return true + else + return false + end +end + +def replaceDefaultMonitorPodSettings(new_contents, monitorKubernetesPods, kubernetesLabelSelectors, kubernetesFieldSelectors) + begin + puts "config::Starting to substitute the placeholders in telegraf conf copy file with no namespace filters" + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_MONITOR_PODS", ("monitor_kubernetes_pods = #{monitorKubernetesPods}")) + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_SCRAPE_SCOPE", ("pod_scrape_scope = \"#{(@controller.casecmp(@replicaset) == 0) ? 
"cluster" : "node"}\"")) + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_PLUGINS_WITH_NAMESPACE_FILTER", "") + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_LABEL_SELECTOR", ("kubernetes_label_selector = \"#{kubernetesLabelSelectors}\"")) + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_FIELD_SELECTOR", ("kubernetes_field_selector = \"#{kubernetesFieldSelectors}\"")) + rescue => errorStr + puts "Exception while replacing default pod monitor settings for custom prometheus scraping: #{errorStr}" + end + return new_contents +end + +def createPrometheusPluginsWithNamespaceSetting(monitorKubernetesPods, monitorKubernetesPodsNamespaces, new_contents, interval, fieldPassSetting, fieldDropSetting, kubernetesLabelSelectors, kubernetesFieldSelectors) + begin + puts "config::Starting to substitute the placeholders in telegraf conf copy file with namespace filters" + + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_MONITOR_PODS", "# Commenting this out since new plugins will be created per namespace\n # $AZMON_TELEGRAF_CUSTOM_PROM_MONITOR_PODS") + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_LABEL_SELECTOR", "# Commenting this out since new plugins will be created per namespace\n # $AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_LABEL_SELECTOR") + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_FIELD_SELECTOR", "# Commenting this out since new plugins will be created per namespace\n # $AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_FIELD_SELECTOR") + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_SCRAPE_SCOPE", "# Commenting this out since new plugins will be created per namespace\n # $AZMON_TELEGRAF_CUSTOM_PROM_SCRAPE_SCOPE") + + pluginConfigsWithNamespaces = "" + monitorKubernetesPodsNamespaces.each do |namespace| + if !namespace.nil? + #Stripping namespaces to remove leading and trailing whitespaces + namespace.strip! + if namespace.length > 0 + pluginConfigsWithNamespaces += "\n[[inputs.prometheus]] + interval = \"#{interval}\" + monitor_kubernetes_pods = true + pod_scrape_scope = \"#{(@controller.casecmp(@replicaset) == 0) ? "cluster" : "node"}\" + monitor_kubernetes_pods_namespace = \"#{namespace}\" + kubernetes_label_selector = \"#{kubernetesLabelSelectors}\" + kubernetes_field_selector = \"#{kubernetesFieldSelectors}\" + fieldpass = #{fieldPassSetting} + fielddrop = #{fieldDropSetting} + metric_version = #{@metricVersion} + url_tag = \"#{@urlTag}\" + bearer_token = \"#{@bearerToken}\" + response_timeout = \"#{@responseTimeout}\" + tls_ca = \"#{@tlsCa}\" + insecure_skip_verify = #{@insecureSkipVerify}\n" + end + end + end + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_PLUGINS_WITH_NAMESPACE_FILTER", pluginConfigsWithNamespaces) + return new_contents + rescue => errorStr + puts "Exception while creating prometheus input plugins to filter namespaces for custom prometheus: #{errorStr}, using defaults" + replaceDefaultMonitorPodSettings(new_contents, monitorKubernetesPods, kubernetesLabelSelectors, kubernetesFieldSelectors) + end +end + +# Use the ruby structure created after config parsing to set the right values to be used as environment variables +def populateSettingValuesFromConfigMap(parsedConfig) + if !@controller.nil? + if !parsedConfig.nil? && !parsedConfig[:prometheus_data_collection_settings].nil? + if @controller.casecmp(@replicaset) == 0 && !parsedConfig[:prometheus_data_collection_settings][:cluster].nil? 
+ #Get prometheus replicaset custom config settings + begin + interval = parsedConfig[:prometheus_data_collection_settings][:cluster][:interval] + fieldPass = parsedConfig[:prometheus_data_collection_settings][:cluster][:fieldpass] + fieldDrop = parsedConfig[:prometheus_data_collection_settings][:cluster][:fielddrop] + urls = parsedConfig[:prometheus_data_collection_settings][:cluster][:urls] + kubernetesServices = parsedConfig[:prometheus_data_collection_settings][:cluster][:kubernetes_services] + + # Remove below 4 lines after phased rollout + monitorKubernetesPods = parsedConfig[:prometheus_data_collection_settings][:cluster][:monitor_kubernetes_pods] + monitorKubernetesPodsNamespaces = parsedConfig[:prometheus_data_collection_settings][:cluster][:monitor_kubernetes_pods_namespaces] + kubernetesLabelSelectors = parsedConfig[:prometheus_data_collection_settings][:cluster][:kubernetes_label_selector] + kubernetesFieldSelectors = parsedConfig[:prometheus_data_collection_settings][:cluster][:kubernetes_field_selector] + + # Check for the right datatypes to enforce right setting values + if checkForType(interval, String) && + checkForTypeArray(fieldPass, String) && + checkForTypeArray(fieldDrop, String) && + checkForTypeArray(kubernetesServices, String) && + checkForTypeArray(urls, String) && + # Remove below check after phased rollout + checkForType(kubernetesLabelSelectors, String) && + checkForType(kubernetesFieldSelectors, String) && + (monitorKubernetesPods.nil? || (!monitorKubernetesPods.nil? && (!!monitorKubernetesPods == monitorKubernetesPods))) # Checking for Boolean type, since 'Boolean' is not defined as a type in ruby + puts "config::Successfully passed typecheck for config settings for replicaset" + #if setting is nil assign default values + interval = (interval.nil?) ? @defaultRsInterval : interval + fieldPass = (fieldPass.nil?) ? @defaultRsFieldPass : fieldPass + fieldDrop = (fieldDrop.nil?) ? @defaultRsFieldDrop : fieldDrop + kubernetesServices = (kubernetesServices.nil?) ? @defaultRsK8sServices : kubernetesServices + urls = (urls.nil?) ? @defaultRsPromUrls : urls + # Remove below lines after phased rollout + monitorKubernetesPods = (monitorKubernetesPods.nil?) ? @defaultRsMonitorPods : monitorKubernetesPods + kubernetesLabelSelectors = (kubernetesLabelSelectors.nil?) ? @defaultCustomPrometheusLabelSelectors : kubernetesLabelSelectors + kubernetesFieldSelectors = (kubernetesFieldSelectors.nil?) ? @defaultCustomPrometheusFieldSelectors : kubernetesFieldSelectors + + file_name = "/opt/telegraf-test-rs.conf" + # Copy the telegraf config file to a temp file to run telegraf in test mode with this config + FileUtils.cp("/etc/opt/microsoft/docker-cimprov/telegraf-rs.conf", file_name) + + puts "config::Starting to substitute the placeholders in telegraf conf copy file for replicaset" + #Replace the placeholder config values with values from custom config + text = File.read(file_name) + new_contents = text.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_INTERVAL", interval) + fieldPassSetting = (fieldPass.length > 0) ? ("[\"" + fieldPass.join("\",\"") + "\"]") : "[]" + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_FIELDPASS", fieldPassSetting) + fieldDropSetting = (fieldDrop.length > 0) ? ("[\"" + fieldDrop.join("\",\"") + "\"]") : "[]" + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_FIELDDROP", fieldDropSetting) + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_URLS", ((urls.length > 0) ? 
("[\"" + urls.join("\",\"") + "\"]") : "[]")) + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_K8S_SERVICES", ((kubernetesServices.length > 0) ? ("[\"" + kubernetesServices.join("\",\"") + "\"]") : "[]")) + + # Check to see if monitor_kubernetes_pods is set to true with a valid setting for monitor_kubernetes_namespaces to enable scraping for specific namespaces + # Adding nil check here as well since checkForTypeArray returns true even if setting is nil to accommodate for other settings to be able - + # - to use defaults in case of nil settings + # Remove below block after phased rollout + if (@sidecarScrapingEnabled.nil? || (!@sidecarScrapingEnabled.nil? && (@sidecarScrapingEnabled.casecmp("false") == 0))) + monitorKubernetesPodsNSConfig = [] + if monitorKubernetesPods && !monitorKubernetesPodsNamespaces.nil? && checkForTypeArray(monitorKubernetesPodsNamespaces, String) + # Adding a check to see if an empty array is passed for kubernetes namespaces + if (monitorKubernetesPodsNamespaces.length > 0) + new_contents = createPrometheusPluginsWithNamespaceSetting(monitorKubernetesPods, monitorKubernetesPodsNamespaces, new_contents, interval, fieldPassSetting, fieldDropSetting, kubernetesLabelSelectors, kubernetesFieldSelectors) + monitorKubernetesPodsNamespacesLength = monitorKubernetesPodsNamespaces.length + monitorKubernetesPodsNSConfig = monitorKubernetesPodsNamespaces + else + new_contents = replaceDefaultMonitorPodSettings(new_contents, monitorKubernetesPods, kubernetesLabelSelectors, kubernetesFieldSelectors) + monitorKubernetesPodsNamespacesLength = 0 + end + else + new_contents = replaceDefaultMonitorPodSettings(new_contents, monitorKubernetesPods, kubernetesLabelSelectors, kubernetesFieldSelectors) + monitorKubernetesPodsNamespacesLength = 0 + end + # Label and field selectors are passed as strings. For field selectors, split by commas to get the number of key-value pairs. + # Label selectors can be formatted as "app in (app1, app2, app3)", so split by commas only outside parentheses to get the number of key-value pairs. + kubernetesLabelSelectorsLength = kubernetesLabelSelectors.split(/,\s*(?=[^()]*(?:\(|$))/).length + kubernetesFieldSelectorsLength = kubernetesFieldSelectors.split(",").length + end + + File.open(file_name, "w") { |file| file.puts new_contents } + puts "config::Successfully substituted the placeholders in telegraf conf file for replicaset" + #Set environment variables for telemetry + file = File.open("telemetry_prom_config_env_var", "w") + if !file.nil? + file.write("export TELEMETRY_RS_PROM_INTERVAL=\"#{interval}\"\n") + #Setting array lengths as environment variables for telemetry purposes + file.write("export TELEMETRY_RS_PROM_FIELDPASS_LENGTH=\"#{fieldPass.length}\"\n") + file.write("export TELEMETRY_RS_PROM_FIELDDROP_LENGTH=\"#{fieldDrop.length}\"\n") + file.write("export TELEMETRY_RS_PROM_K8S_SERVICES_LENGTH=#{kubernetesServices.length}\n") + file.write("export TELEMETRY_RS_PROM_URLS_LENGTH=#{urls.length}\n") + # Remove below block after phased rollout + if (@sidecarScrapingEnabled.nil? || (!@sidecarScrapingEnabled.nil?
&& (@sidecarScrapingEnabled.casecmp("false") == 0))) + file.write("export TELEMETRY_RS_PROM_MONITOR_PODS=\"#{monitorKubernetesPods}\"\n") + file.write("export TELEMETRY_RS_PROM_MONITOR_PODS_NS_LENGTH=\"#{monitorKubernetesPodsNamespacesLength}\"\n") + file.write("export TELEMETRY_RS_PROM_LABEL_SELECTOR_LENGTH=\"#{kubernetesLabelSelectorsLength}\"\n") + file.write("export TELEMETRY_RS_PROM_FIELD_SELECTOR_LENGTH=\"#{kubernetesFieldSelectorsLength}\"\n") + end + + # Close file after writing all environment variables + file.close + puts "config::Successfully created telemetry file for replicaset" + end + else + ConfigParseErrorLogger.logError("Typecheck failed for prometheus config settings for replicaset, using defaults, please use right types for all settings") + end # end of type check condition + rescue => errorStr + ConfigParseErrorLogger.logError("Exception while parsing config file for prometheus config for replicaset: #{errorStr}, using defaults") + puts "****************End Prometheus Config Processing********************" + end + elsif @controller.casecmp(@daemonset) == 0 && + ((!@containerType.nil? && @containerType.casecmp(@promSideCar) == 0) || + (!@os_type.nil? && !@os_type.empty? && @os_type.strip.casecmp("windows") == 0) && !@sidecarScrapingEnabled.nil? && @sidecarScrapingEnabled.strip.casecmp("true") == 0) && + !parsedConfig[:prometheus_data_collection_settings][:cluster].nil? + #Get prometheus custom config settings for monitor kubernetes pods + begin + interval = parsedConfig[:prometheus_data_collection_settings][:cluster][:interval] + fieldPass = parsedConfig[:prometheus_data_collection_settings][:cluster][:fieldpass] + fieldDrop = parsedConfig[:prometheus_data_collection_settings][:cluster][:fielddrop] + monitorKubernetesPods = parsedConfig[:prometheus_data_collection_settings][:cluster][:monitor_kubernetes_pods] + monitorKubernetesPodsNamespaces = parsedConfig[:prometheus_data_collection_settings][:cluster][:monitor_kubernetes_pods_namespaces] + kubernetesLabelSelectors = parsedConfig[:prometheus_data_collection_settings][:cluster][:kubernetes_label_selector] + kubernetesFieldSelectors = parsedConfig[:prometheus_data_collection_settings][:cluster][:kubernetes_field_selector] + + # Check for the right datatypes to enforce right setting values + if checkForType(interval, String) && + checkForType(kubernetesLabelSelectors, String) && + checkForType(kubernetesFieldSelectors, String) && + checkForTypeArray(fieldPass, String) && + checkForTypeArray(fieldDrop, String) && + (monitorKubernetesPods.nil? || (!monitorKubernetesPods.nil? && (!!monitorKubernetesPods == monitorKubernetesPods))) #Checking for Boolean type, since 'Boolean' is not defined as a type in ruby + puts "config::Successfully passed typecheck for config settings for custom prometheus scraping" + #if setting is nil assign default values + interval = (interval.nil?) ? @defaultCustomPrometheusInterval : interval + fieldPass = (fieldPass.nil?) ? @defaultCustomPrometheusFieldPass : fieldPass + fieldDrop = (fieldDrop.nil?) ? @defaultCustomPrometheusFieldDrop : fieldDrop + monitorKubernetesPods = (monitorKubernetesPods.nil?) ? @defaultCustomPrometheusMonitorPods : monitorKubernetesPods + kubernetesLabelSelectors = (kubernetesLabelSelectors.nil?) ? @defaultCustomPrometheusLabelSelectors : kubernetesLabelSelectors + kubernetesFieldSelectors = (kubernetesFieldSelectors.nil?) ? @defaultCustomPrometheusFieldSelectors : kubernetesFieldSelectors + + if !@os_type.nil? && !@os_type.empty?
&& @os_type.strip.casecmp("windows") == 0 + file_name = "/etc/telegraf/telegraf.conf" + else + file_name = "/opt/telegraf-test-prom-side-car.conf" + # Copy the telegraf config file to a temp file to run telegraf in test mode with this config + FileUtils.cp("/etc/opt/microsoft/docker-cimprov/telegraf-prom-side-car.conf", file_name) + end + puts "config::Starting to substitute the placeholders in telegraf conf copy file for Linux or conf file for Windows for custom prometheus scraping" + #Replace the placeholder config values with values from custom config + text = File.read(file_name) + new_contents = text.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_INTERVAL", interval) + fieldPassSetting = (fieldPass.length > 0) ? ("[\"" + fieldPass.join("\",\"") + "\"]") : "[]" + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_FIELDPASS", fieldPassSetting) + fieldDropSetting = (fieldDrop.length > 0) ? ("[\"" + fieldDrop.join("\",\"") + "\"]") : "[]" + new_contents = new_contents.gsub("$AZMON_TELEGRAF_CUSTOM_PROM_FIELDDROP", fieldDropSetting) + + # Check to see if monitor_kubernetes_pods is set to true with a valid setting for monitor_kubernetes_namespaces to enable scraping for specific namespaces + # Adding nil check here as well since checkForTypeArray returns true even if setting is nil to accommodate for other settings to be able - + # - to use defaults in case of nil settings + monitorKubernetesPodsNSConfig = [] + if monitorKubernetesPods && !monitorKubernetesPodsNamespaces.nil? && checkForTypeArray(monitorKubernetesPodsNamespaces, String) + # Adding a check to see if an empty array is passed for kubernetes namespaces + if (monitorKubernetesPodsNamespaces.length > 0) + new_contents = createPrometheusPluginsWithNamespaceSetting(monitorKubernetesPods, monitorKubernetesPodsNamespaces, new_contents, interval, fieldPassSetting, fieldDropSetting, kubernetesLabelSelectors, kubernetesFieldSelectors) + monitorKubernetesPodsNamespacesLength = monitorKubernetesPodsNamespaces.length + monitorKubernetesPodsNSConfig = monitorKubernetesPodsNamespaces + else + new_contents = replaceDefaultMonitorPodSettings(new_contents, monitorKubernetesPods, kubernetesLabelSelectors, kubernetesFieldSelectors) + monitorKubernetesPodsNamespacesLength = 0 + end + else + new_contents = replaceDefaultMonitorPodSettings(new_contents, monitorKubernetesPods, kubernetesLabelSelectors, kubernetesFieldSelectors) + monitorKubernetesPodsNamespacesLength = 0 + end + + # Label and field selectors are passed as strings. For field selectors, split by commas to get the number of key-value pairs. + # Label selectors can be formatted as "app in (app1, app2, app3)", so split by commas only outside parentheses to get the number of key-value pairs. + kubernetesLabelSelectorsLength = kubernetesLabelSelectors.split(/,\s*(?=[^()]*(?:\(|$))/).length + kubernetesFieldSelectorsLength = kubernetesFieldSelectors.split(",").length + + File.open(file_name, "w") { |file| file.puts new_contents } + puts "config::Successfully substituted the placeholders in telegraf conf file for custom prometheus scraping" + #Set environment variables for telemetry in the sidecar container + if (!@containerType.nil? && @containerType.casecmp(@promSideCar) == 0) + file = File.open("telemetry_prom_config_env_var", "w") + if !file.nil?
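+              # The writes below emit a small, sourceable shell snippet; the agent's
+              # startup scripts presumably source this file so the values surface as
+              # environment variables for telemetry. A sketch of the resulting file,
+              # assuming two scraped namespaces and empty label/field selectors:
+              #   export TELEMETRY_CUSTOM_PROM_MONITOR_PODS="true"
+              #   export TELEMETRY_CUSTOM_PROM_MONITOR_PODS_NS_LENGTH="2"
+              #   export TELEMETRY_CUSTOM_PROM_LABEL_SELECTOR_LENGTH="0"
+              #   export TELEMETRY_CUSTOM_PROM_FIELD_SELECTOR_LENGTH="0"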
+ #Setting array lengths as environment variables for telemetry purposes + file.write("export TELEMETRY_CUSTOM_PROM_MONITOR_PODS=\"#{monitorKubernetesPods}\"\n") + file.write("export TELEMETRY_CUSTOM_PROM_MONITOR_PODS_NS_LENGTH=\"#{monitorKubernetesPodsNamespacesLength}\"\n") + file.write("export TELEMETRY_CUSTOM_PROM_LABEL_SELECTOR_LENGTH=\"#{kubernetesLabelSelectorsLength}\"\n") + file.write("export TELEMETRY_CUSTOM_PROM_FIELD_SELECTOR_LENGTH=\"#{kubernetesFieldSelectorsLength}\"\n") + + # Close file after writing all environment variables + file.close + puts "config::Successfully created telemetry file for prometheus sidecar" + end + end + else + ConfigParseErrorLogger.logError("Typecheck failed for prometheus config settings for prometheus sidecar, using defaults, please use right types for all settings") + end # end of type check condition + rescue => errorStr + ConfigParseErrorLogger.logError("Exception while parsing config file for prometheus config for prometheus sidecar: #{errorStr}, using defaults") + puts "****************End Prometheus Config Processing********************" + end + elsif @controller.casecmp(@daemonset) == 0 && !parsedConfig[:prometheus_data_collection_settings][:node].nil? + #Get prometheus daemonset custom config settings + begin + interval = parsedConfig[:prometheus_data_collection_settings][:node][:interval] + fieldPass = parsedConfig[:prometheus_data_collection_settings][:node][:fieldpass] + fieldDrop = parsedConfig[:prometheus_data_collection_settings][:node][:fielddrop] + urls = parsedConfig[:prometheus_data_collection_settings][:node][:urls] + + # Check for the right datatypes to enforce right setting values + if checkForType(interval, String) && + checkForTypeArray(fieldPass, String) && + checkForTypeArray(fieldDrop, String) && + checkForTypeArray(urls, String) + puts "config::Successfully passed typecheck for config settings for daemonset" + + #if setting is nil assign default values + interval = (interval.nil?) ? @defaultDsInterval : interval + fieldPass = (fieldPass.nil?) ? @defaultDsFieldPass : fieldPass + fieldDrop = (fieldDrop.nil?) ? @defaultDsFieldDrop : fieldDrop + urls = (urls.nil?) ? @defaultDsPromUrls : urls + + file_name = "/opt/telegraf-test.conf" + # Copy the telegraf config file to a temp file to run telegraf in test mode with this config + FileUtils.cp("/etc/opt/microsoft/docker-cimprov/telegraf.conf", file_name) + + puts "config::Starting to substitute the placeholders in telegraf conf copy file for daemonset" + #Replace the placeholder config values with values from custom config + text = File.read(file_name) + new_contents = text.gsub("$AZMON_DS_PROM_INTERVAL", interval) + new_contents = new_contents.gsub("$AZMON_DS_PROM_FIELDPASS", ((fieldPass.length > 0) ? ("[\"" + fieldPass.join("\",\"") + "\"]") : "[]")) + new_contents = new_contents.gsub("$AZMON_DS_PROM_FIELDDROP", ((fieldDrop.length > 0) ? ("[\"" + fieldDrop.join("\",\"") + "\"]") : "[]")) + new_contents = new_contents.gsub("$AZMON_DS_PROM_URLS", ((urls.length > 0) ? ("[\"" + urls.join("\",\"") + "\"]") : "[]")) + File.open(file_name, "w") { |file| file.puts new_contents } + puts "config::Successfully substituted the placeholders in telegraf conf file for daemonset" + + #Set environment variables for telemetry + file = File.open("telemetry_prom_config_env_var", "w") + if !file.nil?
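+            # Same pattern as the branches above: write a sourceable shell snippet for
+            # telemetry. With the daemonset defaults (interval "1m", empty
+            # fieldpass/fielddrop/urls), the resulting file would read, for example:
+            #   export TELEMETRY_DS_PROM_INTERVAL="1m"
+            #   export TELEMETRY_DS_PROM_FIELDPASS_LENGTH="0"
+            #   export TELEMETRY_DS_PROM_FIELDDROP_LENGTH="0"
+            #   export TELEMETRY_DS_PROM_URLS_LENGTH=0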
+ file.write("export TELEMETRY_DS_PROM_INTERVAL=\"#{interval}\"\n") + #Setting array lengths as environment variables for telemetry purposes + file.write("export TELEMETRY_DS_PROM_FIELDPASS_LENGTH=\"#{fieldPass.length}\"\n") + file.write("export TELEMETRY_DS_PROM_FIELDDROP_LENGTH=\"#{fieldDrop.length}\"\n") + file.write("export TELEMETRY_DS_PROM_URLS_LENGTH=#{urls.length}\n") + # Close file after writing all environment variables + file.close + puts "config::Successfully created telemetry file for daemonset" + end + else + ConfigParseErrorLogger.logError("Typecheck failed for prometheus config settings for daemonset, using defaults, please use right types for all settings") + end # end of type check condition + rescue => errorStr + ConfigParseErrorLogger.logError("Exception while parsing config file for prometheus config for daemonset: #{errorStr}, using defaults, please check correctness of configmap") + puts "****************End Prometheus Config Processing********************" + end + end # end of controller type check + end + else + ConfigParseErrorLogger.logError("Controller undefined while processing prometheus config, using defaults") + end +end + +@configSchemaVersion = ENV["AZMON_AGENT_CFG_SCHEMA_VERSION"] +puts "****************Start Prometheus Config Processing********************" +if !@configSchemaVersion.nil? && !@configSchemaVersion.empty? && @configSchemaVersion.strip.casecmp("v1") == 0 #note v1 is the only supported schema version, so hardcoding it + configMapSettings = parseConfigMap + if !configMapSettings.nil? + populateSettingValuesFromConfigMap(configMapSettings) + end +else + if (File.file?(@promConfigMapMountPath)) + ConfigParseErrorLogger.logError("config::unsupported/missing config schema version - '#{@configSchemaVersion}', using defaults, please use supported version") + else + puts "config::No configmap mounted for prometheus custom config, using defaults" + end +end +puts "****************End Prometheus Config Processing********************" diff --git a/build/linux/installer/conf/prometheus-side-car.conf b/build/linux/installer/conf/prometheus-side-car.conf new file mode 100644 index 000000000..fd40910d9 --- /dev/null +++ b/build/linux/installer/conf/prometheus-side-car.conf @@ -0,0 +1,4 @@ + + + + diff --git a/build/linux/installer/conf/td-agent-bit-prom-side-car.conf b/build/linux/installer/conf/td-agent-bit-prom-side-car.conf new file mode 100644 index 000000000..720f54820 --- /dev/null +++ b/build/linux/installer/conf/td-agent-bit-prom-side-car.conf @@ -0,0 +1,28 @@ +[SERVICE] + #Default service flush interval is 15 seconds + Flush 15 + HTTP_Server Off + Daemon Off + storage.path /var/opt/microsoft/docker-cimprov/state/flbstore/ + storage.sync normal + storage.checksum off + storage.backlog.mem_limit 10M + Log_Level info + Parsers_File /etc/opt/microsoft/docker-cimprov/azm-containers-parser.conf + Log_File /var/opt/microsoft/docker-cimprov/log/fluent-bit.log + +[INPUT] + Name tcp + Tag oms.container.perf.telegraf.* + Listen 0.0.0.0 + Port 25229 + Chunk_Size 1m + Buffer_Size 1m + Mem_Buf_Limit 20m + +[OUTPUT] + Name oms + EnableTelemetry true + Retry_Limit 10 + TelemetryPushIntervalSeconds 300 + Match oms.container.* \ No newline at end of file diff --git a/build/linux/installer/conf/telegraf-prom-side-car.conf b/build/linux/installer/conf/telegraf-prom-side-car.conf new file mode 100644 index 000000000..b3b4ba1d3 --- /dev/null +++ b/build/linux/installer/conf/telegraf-prom-side-car.conf @@ -0,0 +1,162 @@ +# Telegraf Configuration +# +# Telegraf is
entirely plugin driven. All metrics are gathered from the +# declared inputs, and sent to the declared outputs. +# +# Plugins must be declared in here to be active. +# To deactivate a plugin, comment out the name and any variables. +# +# Use 'telegraf -config telegraf.conf -test' to see what metrics a config +# file would generate. +# +# Environment variables can be used anywhere in this config file, simply prepend +# them with $. For strings the variable must be within quotes (ie, "$STR_VAR"), +# for numbers and booleans they should be plain (ie, $INT_VAR, $BOOL_VAR) + + +# Global tags can be specified here in key="value" format. +[global_tags] + hostName = "placeholder_hostname" + +# Configuration for telegraf agent +[agent] + ## Default data collection interval for all inputs + interval = "60s" + ## Rounds collection interval to 'interval' + ## ie, if interval="10s" then always collect on :00, :10, :20, etc. + round_interval = true + + ## Telegraf will send metrics to outputs in batches of at most + ## metric_batch_size metrics. + ## This controls the size of writes that Telegraf sends to output plugins. + metric_batch_size = 3000 + + ## For failed writes, telegraf will cache metric_buffer_limit metrics for each + ## output, and will flush this buffer on a successful write. Oldest metrics + ## are dropped first when this buffer fills. + ## This buffer only fills when writes fail to output plugin(s). + metric_buffer_limit = 60000 + + ## Collection jitter is used to jitter the collection by a random amount. + ## Each plugin will sleep for a random time within jitter before collecting. + ## This can be used to avoid many plugins querying things like sysfs at the + ## same time, which can have a measurable effect on the system. + collection_jitter = "0s" + + ## Default flushing interval for all outputs. You shouldn't set this below + ## interval. Maximum flush_interval will be flush_interval + flush_jitter + flush_interval = "15s" + ## Jitter the flush interval by a random amount. This is primarily to avoid + ## large write spikes for users running a large number of telegraf instances. + ## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s + flush_jitter = "0s" + + ## By default or when set to "0s", precision will be set to the same + ## timestamp order as the collection interval, with the maximum being 1s. + ## ie, when interval = "10s", precision will be "1s" + ## when interval = "250ms", precision will be "1ms" + ## Precision will NOT be used for service inputs. It is up to each individual + ## service input to set the timestamp at the appropriate precision. + ## Valid time units are "ns", "us" (or "µs"), "ms", "s". + precision = "" + + ## Logging configuration: + ## Run telegraf with debug log messages. + debug = false + ## Run telegraf in quiet mode (error log messages only). + quiet = true + ## Specify the log file name. The empty string means to log to stderr. + logfile = "" + ## Override default hostname, if empty use os.Hostname() + #hostname = "placeholder_hostname" + ## If set to true, do not set the "host" tag in the telegraf agent. + omit_hostname = true + + +############################################################################### +# OUTPUT PLUGINS # +############################################################################### + +# Generic socket writer capable of handling multiple socket types.
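+# Note on data flow (see td-agent-bit-prom-side-car.conf in this change): the
+# prometheus inputs below are serialized as JSON by this socket writer to
+# tcp://0.0.0.0:25229, where the td-agent-bit TCP input (Port 25229, Tag
+# oms.container.perf.telegraf.*) picks them up and forwards them through the
+# oms output. The $AZMON_TELEGRAF_CUSTOM_PROM_* placeholders below are filled
+# in by tomlparser-prom-customconfig.rb from the container-azm-ms-agentconfig
+# config map.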
+[[outputs.socket_writer]] + ## URL to connect to + address = "tcp://0.0.0.0:25229" + # address = "tcp://example.com:http" + # address = "tcp4://127.0.0.1:8094" + # address = "tcp6://127.0.0.1:8094" + # address = "tcp6://[2001:db8::1]:8094" + # address = "udp://127.0.0.1:8094" + # address = "udp4://127.0.0.1:8094" + # address = "udp6://127.0.0.1:8094" + # address = "unix:///tmp/telegraf.sock" + # address = "unixgram:///tmp/telegraf.sock" + + ## Optional TLS Config + # tls_ca = "/etc/telegraf/ca.pem" + # tls_cert = "/etc/telegraf/cert.pem" + # tls_key = "/etc/telegraf/key.pem" + ## Use TLS but skip chain & host verification + # insecure_skip_verify = false + + ## Period between keep alive probes. + ## Only applies to TCP sockets. + ## 0 disables keep alive probes. + ## Defaults to the OS configuration. + # keep_alive_period = "5m" + + ## Data format to generate. + ## Each data format has its own unique set of configuration options, read + ## more about them here: + ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md + data_format = "json" + namedrop = ["agent_telemetry"] + +############################################################################### +# PROCESSOR PLUGINS # +############################################################################### + +[[processors.converter]] + [processors.converter.fields] + float = ["*"] + +#Prometheus Custom Metrics +[[inputs.prometheus]] + interval = "$AZMON_TELEGRAF_CUSTOM_PROM_INTERVAL" + + ## Scrape Kubernetes pods for the following prometheus annotations: + ## - prometheus.io/scrape: Enable scraping for this pod + ## - prometheus.io/scheme: If the metrics endpoint is secured then you will need to + ## set this to `https` & most likely set the tls config. + ## - prometheus.io/path: If the metrics path is not /metrics, define it with this annotation. + ## - prometheus.io/port: If port is not 9102 use this annotation + $AZMON_TELEGRAF_CUSTOM_PROM_MONITOR_PODS + $AZMON_TELEGRAF_CUSTOM_PROM_SCRAPE_SCOPE + + $AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_LABEL_SELECTOR + $AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_FIELD_SELECTOR + + fieldpass = $AZMON_TELEGRAF_CUSTOM_PROM_FIELDPASS + fielddrop = $AZMON_TELEGRAF_CUSTOM_PROM_FIELDDROP + + metric_version = 2 + url_tag = "scrapeUrl" + ## Kubernetes config file to create client from. + # kube_config = "/path/to/kubernetes.config" + + ## Use bearer token for authorization. ('bearer_token' takes priority) + bearer_token = "/var/run/secrets/kubernetes.io/serviceaccount/token" + ## OR + # bearer_token_string = "abc_123" + + ## Specify timeout duration for slower prometheus clients (default is 3s) + response_timeout = "15s" + + ## Optional TLS Config + tls_ca = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + ## Use TLS but skip chain & host verification + insecure_skip_verify = true + +$AZMON_TELEGRAF_CUSTOM_PROM_PLUGINS_WITH_NAMESPACE_FILTER + +## OSM Prometheus configuration +$AZMON_TELEGRAF_OSM_PROM_PLUGINS diff --git a/build/linux/installer/conf/telegraf-rs.conf b/build/linux/installer/conf/telegraf-rs.conf index d81196330..ee1cf8819 100644 --- a/build/linux/installer/conf/telegraf-rs.conf +++ b/build/linux/installer/conf/telegraf-rs.conf @@ -540,13 +540,13 @@ #Prometheus Custom Metrics [[inputs.prometheus]] - interval = "$AZMON_RS_PROM_INTERVAL" + interval = "$AZMON_TELEGRAF_CUSTOM_PROM_INTERVAL" ## An array of urls to scrape metrics from. - urls = $AZMON_RS_PROM_URLS + urls = $AZMON_TELEGRAF_CUSTOM_PROM_URLS ## An array of Kubernetes services to scrape metrics from. 
- kubernetes_services = $AZMON_RS_PROM_K8S_SERVICES + kubernetes_services = $AZMON_TELEGRAF_CUSTOM_PROM_K8S_SERVICES ## Scrape Kubernetes pods for the following prometheus annotations: ## - prometheus.io/scrape: Enable scraping for this pod @@ -554,10 +554,15 @@ ## set this to `https` & most likely set the tls config. ## - prometheus.io/path: If the metrics path is not /metrics, define it with this annotation. ## - prometheus.io/port: If port is not 9102 use this annotation - $AZMON_RS_PROM_MONITOR_PODS + $AZMON_TELEGRAF_CUSTOM_PROM_MONITOR_PODS - fieldpass = $AZMON_RS_PROM_FIELDPASS - fielddrop = $AZMON_RS_PROM_FIELDDROP + $AZMON_TELEGRAF_CUSTOM_PROM_SCRAPE_SCOPE + + $AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_LABEL_SELECTOR + $AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_FIELD_SELECTOR + + fieldpass = $AZMON_TELEGRAF_CUSTOM_PROM_FIELDPASS + fielddrop = $AZMON_TELEGRAF_CUSTOM_PROM_FIELDDROP metric_version = 2 url_tag = "scrapeUrl" @@ -581,7 +586,11 @@ insecure_skip_verify = true #tagexclude = ["AgentVersion","AKS_RESOURCE_ID","ACS_RESOURCE_NAME", "Region", "ClusterName", "ClusterType", "Computer", "ControllerType"] -$AZMON_RS_PROM_PLUGINS_WITH_NAMESPACE_FILTER +$AZMON_TELEGRAF_CUSTOM_PROM_PLUGINS_WITH_NAMESPACE_FILTER + +## OSM Prometheus configuration +$AZMON_TELEGRAF_OSM_PROM_PLUGINS + # [[inputs.exec]] # ## Commands array # interval = "15m" diff --git a/build/linux/installer/datafiles/base_container.data b/build/linux/installer/datafiles/base_container.data index c680f0eea..df8fbc3da 100644 --- a/build/linux/installer/datafiles/base_container.data +++ b/build/linux/installer/datafiles/base_container.data @@ -110,24 +110,28 @@ MAINTAINER: 'Microsoft Corporation' /opt/tomlrb/string_utils.rb; source/toml-parser/tomlrb/string_utils.rb; 644; root; root /opt/tomlrb/version.rb; source/toml-parser/tomlrb/version.rb; 644; root; root -/opt/td-agent-bit/bin/out_oms.so; intermediate/${{BUILD_CONFIGURATION}}/out_oms.so; 755; root; root -/etc/opt/microsoft/docker-cimprov/td-agent-bit.conf; build/linux/installer/conf/td-agent-bit.conf; 644; root; root -/etc/opt/microsoft/docker-cimprov/td-agent-bit-rs.conf; build/linux/installer/conf/td-agent-bit-rs.conf; 644; root; root -/etc/opt/microsoft/docker-cimprov/azm-containers-parser.conf; build/linux/installer/conf/azm-containers-parser.conf; 644; root; root -/etc/opt/microsoft/docker-cimprov/out_oms.conf; build/linux/installer/conf/out_oms.conf; 644; root; root -/etc/opt/microsoft/docker-cimprov/telegraf.conf; build/linux/installer/conf/telegraf.conf; 644; root; root -/etc/opt/microsoft/docker-cimprov/telegraf-rs.conf; build/linux/installer/conf/telegraf-rs.conf; 644; root; root -/opt/microsoft/docker-cimprov/bin/TelegrafTCPErrorTelemetry.sh; build/linux/installer/scripts/TelegrafTCPErrorTelemetry.sh; 755; root; root -/opt/livenessprobe.sh; build/linux/installer/scripts/livenessprobe.sh; 755; root; root -/opt/tomlparser-prom-customconfig.rb; build/linux/installer/scripts/tomlparser-prom-customconfig.rb; 755; root; root -/opt/tomlparser-mdm-metrics-config.rb; build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb; 755; root; root -/opt/tomlparser-metric-collection-config.rb; build/linux/installer/scripts/tomlparser-metric-collection-config.rb; 755; root; root +/opt/td-agent-bit/bin/out_oms.so; intermediate/${{BUILD_CONFIGURATION}}/out_oms.so; 755; root; root +/etc/opt/microsoft/docker-cimprov/prometheus-side-car.conf; build/linux/installer/conf/prometheus-side-car.conf; 644; root; root +/etc/opt/microsoft/docker-cimprov/td-agent-bit.conf; 
build/linux/installer/conf/td-agent-bit.conf; 644; root; root +/etc/opt/microsoft/docker-cimprov/td-agent-bit-prom-side-car.conf; build/linux/installer/conf/td-agent-bit-prom-side-car.conf; 644; root; root +/etc/opt/microsoft/docker-cimprov/td-agent-bit-rs.conf; build/linux/installer/conf/td-agent-bit-rs.conf; 644; root; root +/etc/opt/microsoft/docker-cimprov/azm-containers-parser.conf; build/linux/installer/conf/azm-containers-parser.conf; 644; root; root +/etc/opt/microsoft/docker-cimprov/out_oms.conf; build/linux/installer/conf/out_oms.conf; 644; root; root +/etc/opt/microsoft/docker-cimprov/telegraf.conf; build/linux/installer/conf/telegraf.conf; 644; root; root +/etc/opt/microsoft/docker-cimprov/telegraf-prom-side-car.conf; build/linux/installer/conf/telegraf-prom-side-car.conf; 644; root; root +/etc/opt/microsoft/docker-cimprov/telegraf-rs.conf; build/linux/installer/conf/telegraf-rs.conf; 644; root; root +/opt/microsoft/docker-cimprov/bin/TelegrafTCPErrorTelemetry.sh; build/linux/installer/scripts/TelegrafTCPErrorTelemetry.sh; 755; root; root +/opt/livenessprobe.sh; build/linux/installer/scripts/livenessprobe.sh; 755; root; root +/opt/tomlparser-prom-customconfig.rb; build/common/installer/scripts/tomlparser-prom-customconfig.rb; 755; root; root +/opt/tomlparser-mdm-metrics-config.rb; build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb; 755; root; root +/opt/tomlparser-metric-collection-config.rb; build/linux/installer/scripts/tomlparser-metric-collection-config.rb; 755; root; root /opt/tomlparser-agent-config.rb; build/linux/installer/scripts/tomlparser-agent-config.rb; 755; root; root /opt/tomlparser.rb; build/common/installer/scripts/tomlparser.rb; 755; root; root /opt/td-agent-bit-conf-customizer.rb; build/common/installer/scripts/td-agent-bit-conf-customizer.rb; 755; root; root /opt/ConfigParseErrorLogger.rb; build/common/installer/scripts/ConfigParseErrorLogger.rb; 755; root; root /opt/tomlparser-npm-config.rb; build/linux/installer/scripts/tomlparser-npm-config.rb; 755; root; root +/opt/tomlparser-osm-config.rb; build/linux/installer/scripts/tomlparser-osm-config.rb; 755; root; root /opt/microsoft/omsagent/plugin/filter_cadvisor_health_container.rb; source/plugins/ruby/filter_cadvisor_health_container.rb; 644; root; root diff --git a/build/linux/installer/scripts/livenessprobe.sh b/build/linux/installer/scripts/livenessprobe.sh index e3f9fb475..a82fa28eb 100644 --- a/build/linux/installer/scripts/livenessprobe.sh +++ b/build/linux/installer/scripts/livenessprobe.sh @@ -26,15 +26,22 @@ then exit 1 fi -if [ ! -s "inotifyoutput.txt" ] +if [ -s "inotifyoutput.txt" ] then - # inotifyoutput file is empty and the grep commands for omsagent and td-agent-bit succeeded - exit 0 -else - if [ -s "inotifyoutput.txt" ] - then - # inotifyoutput file has data(config map was applied) - echo "inotifyoutput.txt has been updated - config changed" > /dev/termination-log - exit 1 - fi + # inotifyoutput file has data (config map was applied) + echo "inotifyoutput.txt has been updated - config changed" > /dev/termination-log + exit 1 fi + +# Perform the following check only for prometheus sidecar that does OSM scraping or for replicaset when sidecar scraping is disabled +if [[ ( ( ! -e "/etc/config/kube.conf" ) && ( "${CONTAINER_TYPE}" == "PrometheusSidecar" ) ) || + ( ( -e "/etc/config/kube.conf" ) && ( ( !
-z "${SIDECAR_SCRAPING_ENABLED}" ) && ( "${SIDECAR_SCRAPING_ENABLED}" == "false" ) ) ) ]]; then + if [ -s "inotifyoutput-osm.txt" ] + then + # inotifyoutput-osm file has data (config map was applied) + echo "inotifyoutput-osm.txt has been updated - config changed" > /dev/termination-log + exit 1 + fi +fi + +exit 0 diff --git a/build/linux/installer/scripts/tomlparser-osm-config.rb b/build/linux/installer/scripts/tomlparser-osm-config.rb new file mode 100644 index 000000000..096064db8 --- /dev/null +++ b/build/linux/installer/scripts/tomlparser-osm-config.rb @@ -0,0 +1,168 @@ +#!/usr/local/bin/ruby + +require_relative "tomlrb" +require "fileutils" +require_relative "ConfigParseErrorLogger" + +@controllerType = ENV["CONTROLLER_TYPE"] +@containerType = ENV["CONTAINER_TYPE"] +@sidecarScrapingEnabled = ENV["SIDECAR_SCRAPING_ENABLED"] + +@replicaset = "replicaset" +@prometheusSidecar = "prometheussidecar" + +if !@controllerType.nil? && !@controllerType.empty? && @controllerType.strip.casecmp(@replicaset) == 0 && + (@sidecarScrapingEnabled.nil? || (!@sidecarScrapingEnabled.nil? && !@sidecarScrapingEnabled.empty? && @sidecarScrapingEnabled.strip.casecmp("false") == 0)) + @tgfConfigFile = "/etc/opt/microsoft/docker-cimprov/telegraf-rs.conf" + @tgfTestConfigFile = "/opt/telegraf-test-rs.conf" +elsif !@containerType.nil? && !@containerType.empty? && @containerType.strip.casecmp(@prometheusSidecar) == 0 + @tgfConfigFile = "/etc/opt/microsoft/docker-cimprov/telegraf-prom-side-car.conf" + @tgfTestConfigFile = "/opt/telegraf-test-prom-side-car.conf" +end + +@configMapMountPath = "/etc/config/osm-settings/osm-metric-collection-configuration" +@configSchemaVersion = "" +@osmMetricNamespaces = [] + +#Configurations to be used for the auto-generated input prometheus plugins for namespace filtering +@metricVersion = 2 +@monitorKubernetesPodsVersion = 2 +@fieldPassSetting = "[\"envoy_cluster_upstream_cx_total\", \"envoy_cluster_upstream_cx_connect_fail\", \"envoy_cluster_upstream_rq\", \"envoy_cluster_upstream_rq_xx\", \"envoy_cluster_upstream_rq_total\", \"envoy_cluster_upstream_rq_time_bucket\", \"envoy_cluster_upstream_cx_rx_bytes_total\", \"envoy_cluster_upstream_cx_tx_bytes_total\", \"envoy_cluster_upstream_cx_active\"]" +@scrapeInterval = "1m" +@urlTag = "scrapeUrl" +@bearerToken = "/var/run/secrets/kubernetes.io/serviceaccount/token" +@responseTimeout = "15s" +@tlsCa = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" +@insecureSkipVerify = true + +# Use parser to parse the configmap toml file to a ruby structure +def parseConfigMap + begin + # Check to see if config map is created + if (File.file?(@configMapMountPath)) + puts "config::configmap container-azm-ms-osmconfig for osm metrics found, parsing values" + parsedConfig = Tomlrb.load_file(@configMapMountPath, symbolize_keys: true) + puts "config::Successfully parsed mounted config map for osm metrics" + return parsedConfig + else + puts "config::configmap container-azm-ms-osmconfig for osm metrics not mounted, using defaults" + return nil + end + rescue => errorStr + ConfigParseErrorLogger.logError("Exception while parsing config map for osm metrics: #{errorStr}, using defaults, please check config map for errors") + return nil + end +end + +def checkForTypeArray(arrayValue, arrayType) + if
(arrayValue.nil? || (arrayValue.kind_of?(Array) && ((arrayValue.length == 0) || (arrayValue.length > 0 && arrayValue[0].kind_of?(arrayType))))) + return true + else + return false + end +end + +# Use the ruby structure created after config parsing to set the right values to be used as environment variables +def populateSettingValuesFromConfigMap(parsedConfig) + begin + if !parsedConfig.nil? && + !parsedConfig[:osm_metric_collection_configuration].nil? && + !parsedConfig[:osm_metric_collection_configuration][:settings].nil? + osmPromMetricNamespaces = parsedConfig[:osm_metric_collection_configuration][:settings][:monitor_namespaces] + puts "config::osm::got:osm_metric_collection_configuration.settings.monitor_namespaces='#{osmPromMetricNamespaces}'" + + # Check to see if osm_metric_collection_configuration.settings has a valid setting for monitor_namespaces to enable scraping for specific namespaces + # Adding nil check here as well since checkForTypeArray returns true even if setting is nil to accommodate for other settings to be able - + # - to use defaults in case of nil settings + if !osmPromMetricNamespaces.nil? && checkForTypeArray(osmPromMetricNamespaces, String) + # Adding a check to see if an empty array is passed for kubernetes namespaces + if (osmPromMetricNamespaces.length > 0) + @osmMetricNamespaces = osmPromMetricNamespaces + end + end + end + rescue => errorStr + puts "config::osm::error:Exception while reading osm configuration settings - #{errorStr}, using defaults" + @osmMetricNamespaces = [] + end +end + +def replaceOsmTelegrafConfigPlaceHolders + begin + #Replace placeholders in configuration file + tgfConfig = File.read(@tgfTestConfigFile) #read returns only after closing the file + + if @osmMetricNamespaces.length > 0 + osmPluginConfigsWithNamespaces = "" + @osmMetricNamespaces.each do |namespace| + if !namespace.nil? + #Stripping namespaces to remove leading and trailing whitespaces + namespace.strip! + if namespace.length > 0 + osmPluginConfigsWithNamespaces += "\n[[inputs.prometheus]] + name_prefix=\"container.azm.ms.osm/\" + interval = \"#{@scrapeInterval}\" + monitor_kubernetes_pods = true + pod_scrape_scope = \"#{(@controllerType.casecmp(@replicaset) == 0) ? "cluster" : "node"}\" + monitor_kubernetes_pods_namespace = \"#{namespace}\" + fieldpass = #{@fieldPassSetting} + metric_version = #{@metricVersion} + url_tag = \"#{@urlTag}\" + bearer_token = \"#{@bearerToken}\" + response_timeout = \"#{@responseTimeout}\" + tls_ca = \"#{@tlsCa}\" + insecure_skip_verify = #{@insecureSkipVerify}\n" + end + end + end + tgfConfig = tgfConfig.gsub("$AZMON_TELEGRAF_OSM_PROM_PLUGINS", osmPluginConfigsWithNamespaces) + else + puts "Using defaults for OSM configuration since there was an error in OSM config map or no namespaces were set" + tgfConfig = tgfConfig.gsub("$AZMON_TELEGRAF_OSM_PROM_PLUGINS", "") + end + File.open(@tgfTestConfigFile, "w") { |file| file.puts tgfConfig } # 'file' will be closed here after it goes out of scope + puts "config::osm::Successfully substituted the OSM placeholders in #{@tgfTestConfigFile} file in sidecar container" + rescue => errorStr + # TODO: test this scenario out + puts "config::osm::error:Exception while replacing telegraf configuration settings for osm - #{errorStr}, using defaults" + end +end + +@osmConfigSchemaVersion = ENV["AZMON_OSM_CFG_SCHEMA_VERSION"] +puts "****************Start OSM Config Processing********************" +if !@osmConfigSchemaVersion.nil? && !@osmConfigSchemaVersion.empty?
&& @osmConfigSchemaVersion.strip.casecmp("v1") == 0 #note v1 is the only supported schema version, so hardcoding it + configMapSettings = parseConfigMap + if !configMapSettings.nil? + populateSettingValuesFromConfigMap(configMapSettings) + # Check to see if the prometheus custom config parser has created a test config file so that we can replace the settings in the test file and run it. If not, create + # a test config file by copying contents of the actual telegraf config file. + if (!File.exist?(@tgfTestConfigFile)) + # Copy the telegraf config file to a temp file to run telegraf in test mode with this config + puts "test telegraf config file #{@tgfTestConfigFile} does not exist, creating new one" + FileUtils.cp(@tgfConfigFile, @tgfTestConfigFile) + end + + replaceOsmTelegrafConfigPlaceHolders() + + # Write the telemetry to file, so that they can be set as environment variables + telemetryFile = File.open("integration_osm_config_env_var", "w") + + if !telemetryFile.nil? + telemetryFile.write("export TELEMETRY_OSM_CONFIGURATION_NAMESPACES_COUNT=#{@osmMetricNamespaces.length}\n") + # Close file after writing all environment variables + telemetryFile.close + else + puts "config::osm::Exception while opening file for writing OSM telemetry environment variables" + end + end +else + if (File.file?(@configMapMountPath)) + ConfigParseErrorLogger.logError("config::osm::unsupported/missing config schema version - '#{@osmConfigSchemaVersion}', using defaults, please use supported schema version") + else + puts "config::No configmap mounted for OSM config, using defaults" + end +end +puts "****************End OSM Config Processing********************" diff --git a/build/linux/installer/scripts/tomlparser-prom-customconfig.rb b/build/linux/installer/scripts/tomlparser-prom-customconfig.rb deleted file mode 100644 index 7aad580ee..000000000 --- a/build/linux/installer/scripts/tomlparser-prom-customconfig.rb +++ /dev/null @@ -1,267 +0,0 @@ -#!/usr/local/bin/ruby - -require_relative "tomlrb" -require_relative "ConfigParseErrorLogger" -require "fileutils" - -@promConfigMapMountPath = "/etc/config/settings/prometheus-data-collection-settings" -@replicaset = "replicaset" -@daemonset = "daemonset" -@configSchemaVersion = "" -@defaultDsInterval = "1m" -@defaultDsPromUrls = [] -@defaultDsFieldPass = [] -@defaultDsFieldDrop = [] -@defaultRsInterval = "1m" -@defaultRsPromUrls = [] -@defaultRsFieldPass = [] -@defaultRsFieldDrop = [] -@defaultRsK8sServices = [] -@defaultRsMonitorPods = false - -#Configurations to be used for the auto-generated input prometheus plugins for namespace filtering -@metricVersion = 2 -@urlTag = "scrapeUrl" -@bearerToken = "/var/run/secrets/kubernetes.io/serviceaccount/token" -@responseTimeout = "15s" -@tlsCa = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" -@insecureSkipVerify = true - -# Use parser to parse the configmap toml file to a ruby structure -def parseConfigMap - begin - # Check to see if config map is created - if (File.file?(@promConfigMapMountPath)) - puts "config::configmap container-azm-ms-agentconfig for settings mounted, parsing values for prometheus config map" - parsedConfig = Tomlrb.load_file(@promConfigMapMountPath, symbolize_keys: true) - puts "config::Successfully parsed mounted prometheus config map" - return parsedConfig - else - puts "config::configmap container-azm-ms-agentconfig for settings not mounted, using defaults for prometheus scraping" - return nil - end - rescue => errorStr - ConfigParseErrorLogger.logError("Exception while parsing config
map for prometheus config: #{errorStr}, using defaults, please check config map for errors") - return nil - end -end - -def checkForTypeArray(arrayValue, arrayType) - if (arrayValue.nil? || (arrayValue.kind_of?(Array) && ((arrayValue.length == 0) || (arrayValue.length > 0 && arrayValue[0].kind_of?(arrayType))))) - return true - else - return false - end -end - -def checkForType(variable, varType) - if variable.nil? || variable.kind_of?(varType) - return true - else - return false - end -end - -def replaceDefaultMonitorPodSettings(new_contents, monitorKubernetesPods) - begin - new_contents = new_contents.gsub("$AZMON_RS_PROM_MONITOR_PODS", ("monitor_kubernetes_pods = #{monitorKubernetesPods}")) - new_contents = new_contents.gsub("$AZMON_RS_PROM_PLUGINS_WITH_NAMESPACE_FILTER", "") - rescue => errorStr - puts "Exception while replacing default pod monitor settings: #{errorStr}" - end - return new_contents -end - -def createPrometheusPluginsWithNamespaceSetting(monitorKubernetesPods, monitorKubernetesPodsNamespaces, new_contents, interval, fieldPassSetting, fieldDropSetting) - begin - new_contents = new_contents.gsub("$AZMON_RS_PROM_MONITOR_PODS", "# Commenting this out since new plugins will be created per namespace\n # $AZMON_RS_PROM_MONITOR_PODS") - pluginConfigsWithNamespaces = "" - monitorKubernetesPodsNamespaces.each do |namespace| - if !namespace.nil? - #Stripping namespaces to remove leading and trailing whitespaces - namespace.strip! - if namespace.length > 0 - pluginConfigsWithNamespaces += "\n[[inputs.prometheus]] - interval = \"#{interval}\" - monitor_kubernetes_pods = true - monitor_kubernetes_pods_namespace = \"#{namespace}\" - fieldpass = #{fieldPassSetting} - fielddrop = #{fieldDropSetting} - metric_version = #{@metricVersion} - url_tag = \"#{@urlTag}\" - bearer_token = \"#{@bearerToken}\" - response_timeout = \"#{@responseTimeout}\" - tls_ca = \"#{@tlsCa}\" - insecure_skip_verify = #{@insecureSkipVerify}\n" - end - end - end - new_contents = new_contents.gsub("$AZMON_RS_PROM_PLUGINS_WITH_NAMESPACE_FILTER", pluginConfigsWithNamespaces) - return new_contents - rescue => errorStr - puts "Exception while creating prometheus input plugins to filter namespaces: #{errorStr}, using defaults" - replaceDefaultMonitorPodSettings(new_contents, monitorKubernetesPods) - end -end - -# Use the ruby structure created after config parsing to set the right values to be used as environment variables -def populateSettingValuesFromConfigMap(parsedConfig) - # Checking to see if this is the daemonset or replicaset to parse config accordingly - controller = ENV["CONTROLLER_TYPE"] - if !controller.nil? - if !parsedConfig.nil? && !parsedConfig[:prometheus_data_collection_settings].nil? - if controller.casecmp(@replicaset) == 0 && !parsedConfig[:prometheus_data_collection_settings][:cluster].nil? 
- #Get prometheus replicaset custom config settings - begin - interval = parsedConfig[:prometheus_data_collection_settings][:cluster][:interval] - fieldPass = parsedConfig[:prometheus_data_collection_settings][:cluster][:fieldpass] - fieldDrop = parsedConfig[:prometheus_data_collection_settings][:cluster][:fielddrop] - urls = parsedConfig[:prometheus_data_collection_settings][:cluster][:urls] - kubernetesServices = parsedConfig[:prometheus_data_collection_settings][:cluster][:kubernetes_services] - monitorKubernetesPods = parsedConfig[:prometheus_data_collection_settings][:cluster][:monitor_kubernetes_pods] - monitorKubernetesPodsNamespaces = parsedConfig[:prometheus_data_collection_settings][:cluster][:monitor_kubernetes_pods_namespaces] - - # Check for the right datattypes to enforce right setting values - if checkForType(interval, String) && - checkForTypeArray(fieldPass, String) && - checkForTypeArray(fieldDrop, String) && - checkForTypeArray(kubernetesServices, String) && - checkForTypeArray(urls, String) && - (monitorKubernetesPods.nil? || (!monitorKubernetesPods.nil? && (!!monitorKubernetesPods == monitorKubernetesPods))) #Checking for Boolean type, since 'Boolean' is not defined as a type in ruby - puts "config::Successfully passed typecheck for config settings for replicaset" - #if setting is nil assign default values - interval = (interval.nil?) ? @defaultRsInterval : interval - fieldPass = (fieldPass.nil?) ? @defaultRsFieldPass : fieldPass - fieldDrop = (fieldDrop.nil?) ? @defaultRsFieldDrop : fieldDrop - kubernetesServices = (kubernetesServices.nil?) ? @defaultRsK8sServices : kubernetesServices - urls = (urls.nil?) ? @defaultRsPromUrls : urls - monitorKubernetesPods = (monitorKubernetesPods.nil?) ? @defaultRsMonitorPods : monitorKubernetesPods - - file_name = "/opt/telegraf-test-rs.conf" - # Copy the telegraf config file to a temp file to run telegraf in test mode with this config - FileUtils.cp("/etc/opt/microsoft/docker-cimprov/telegraf-rs.conf", file_name) - - puts "config::Starting to substitute the placeholders in telegraf conf copy file for replicaset" - #Replace the placeholder config values with values from custom config - text = File.read(file_name) - new_contents = text.gsub("$AZMON_RS_PROM_INTERVAL", interval) - fieldPassSetting = (fieldPass.length > 0) ? ("[\"" + fieldPass.join("\",\"") + "\"]") : "[]" - new_contents = new_contents.gsub("$AZMON_RS_PROM_FIELDPASS", fieldPassSetting) - fieldDropSetting = (fieldDrop.length > 0) ? ("[\"" + fieldDrop.join("\",\"") + "\"]") : "[]" - new_contents = new_contents.gsub("$AZMON_RS_PROM_FIELDDROP", fieldDropSetting) - new_contents = new_contents.gsub("$AZMON_RS_PROM_URLS", ((urls.length > 0) ? ("[\"" + urls.join("\",\"") + "\"]") : "[]")) - new_contents = new_contents.gsub("$AZMON_RS_PROM_K8S_SERVICES", ((kubernetesServices.length > 0) ? ("[\"" + kubernetesServices.join("\",\"") + "\"]") : "[]")) - - # Check to see if monitor_kubernetes_pods is set to true with a valid setting for monitor_kubernetes_namespaces to enable scraping for specific namespaces - # Adding nil check here as well since checkForTypeArray returns true even if setting is nil to accomodate for other settings to be able - - # - to use defaults in case of nil settings - if monitorKubernetesPods && !monitorKubernetesPodsNamespaces.nil? 
&& checkForTypeArray(monitorKubernetesPodsNamespaces, String) - new_contents = createPrometheusPluginsWithNamespaceSetting(monitorKubernetesPods, monitorKubernetesPodsNamespaces, new_contents, interval, fieldPassSetting, fieldDropSetting) - monitorKubernetesPodsNamespacesLength = monitorKubernetesPodsNamespaces.length - else - new_contents = replaceDefaultMonitorPodSettings(new_contents, monitorKubernetesPods) - monitorKubernetesPodsNamespacesLength = 0 - end - - File.open(file_name, "w") { |file| file.puts new_contents } - puts "config::Successfully substituted the placeholders in telegraf conf file for replicaset" - #Set environment variables for telemetry - file = File.open("telemetry_prom_config_env_var", "w") - if !file.nil? - file.write("export TELEMETRY_RS_PROM_INTERVAL=\"#{interval}\"\n") - #Setting array lengths as environment variables for telemetry purposes - file.write("export TELEMETRY_RS_PROM_FIELDPASS_LENGTH=\"#{fieldPass.length}\"\n") - file.write("export TELEMETRY_RS_PROM_FIELDDROP_LENGTH=\"#{fieldDrop.length}\"\n") - file.write("export TELEMETRY_RS_PROM_K8S_SERVICES_LENGTH=#{kubernetesServices.length}\n") - file.write("export TELEMETRY_RS_PROM_URLS_LENGTH=#{urls.length}\n") - file.write("export TELEMETRY_RS_PROM_MONITOR_PODS=\"#{monitorKubernetesPods}\"\n") - file.write("export TELEMETRY_RS_PROM_MONITOR_PODS_NS_LENGTH=\"#{monitorKubernetesPodsNamespacesLength}\"\n") - - # Close file after writing all environment variables - file.close - puts "config::Successfully created telemetry file for replicaset" - end - else - ConfigParseErrorLogger.logError("Typecheck failed for prometheus config settings for replicaset, using defaults, please use right types for all settings") - end # end of type check condition - rescue => errorStr - ConfigParseErrorLogger.logError("Exception while parsing config file for prometheus config for replicaset: #{errorStr}, using defaults") - setRsPromDefaults - puts "****************End Prometheus Config Processing********************" - end - elsif controller.casecmp(@daemonset) == 0 && !parsedConfig[:prometheus_data_collection_settings][:node].nil? - #Get prometheus daemonset custom config settings - begin - interval = parsedConfig[:prometheus_data_collection_settings][:node][:interval] - fieldPass = parsedConfig[:prometheus_data_collection_settings][:node][:fieldpass] - fieldDrop = parsedConfig[:prometheus_data_collection_settings][:node][:fielddrop] - urls = parsedConfig[:prometheus_data_collection_settings][:node][:urls] - - # Check for the right datattypes to enforce right setting values - if checkForType(interval, String) && - checkForTypeArray(fieldPass, String) && - checkForTypeArray(fieldDrop, String) && - checkForTypeArray(urls, String) - puts "config::Successfully passed typecheck for config settings for daemonset" - - #if setting is nil assign default values - interval = (interval.nil?) ? @defaultDsInterval : interval - fieldPass = (fieldPass.nil?) ? @defaultDsFieldPass : fieldPass - fieldDrop = (fieldDrop.nil?) ? @defaultDsFieldDrop : fieldDrop - urls = (urls.nil?) ? 
@defaultDsPromUrls : urls - - file_name = "/opt/telegraf-test.conf" - # Copy the telegraf config file to a temp file to run telegraf in test mode with this config - FileUtils.cp("/etc/opt/microsoft/docker-cimprov/telegraf.conf", file_name) - - puts "config::Starting to substitute the placeholders in telegraf conf copy file for daemonset" - #Replace the placeholder config values with values from custom config - text = File.read(file_name) - new_contents = text.gsub("$AZMON_DS_PROM_INTERVAL", interval) - new_contents = new_contents.gsub("$AZMON_DS_PROM_FIELDPASS", ((fieldPass.length > 0) ? ("[\"" + fieldPass.join("\",\"") + "\"]") : "[]")) - new_contents = new_contents.gsub("$AZMON_DS_PROM_FIELDDROP", ((fieldDrop.length > 0) ? ("[\"" + fieldDrop.join("\",\"") + "\"]") : "[]")) - new_contents = new_contents.gsub("$AZMON_DS_PROM_URLS", ((urls.length > 0) ? ("[\"" + urls.join("\",\"") + "\"]") : "[]")) - File.open(file_name, "w") { |file| file.puts new_contents } - puts "config::Successfully substituted the placeholders in telegraf conf file for daemonset" - - #Set environment variables for telemetry - file = File.open("telemetry_prom_config_env_var", "w") - if !file.nil? - file.write("export TELEMETRY_DS_PROM_INTERVAL=\"#{interval}\"\n") - #Setting array lengths as environment variables for telemetry purposes - file.write("export TELEMETRY_DS_PROM_FIELDPASS_LENGTH=\"#{fieldPass.length}\"\n") - file.write("export TELEMETRY_DS_PROM_FIELDDROP_LENGTH=\"#{fieldDrop.length}\"\n") - file.write("export TELEMETRY_DS_PROM_URLS_LENGTH=#{urls.length}\n") - # Close file after writing all environment variables - file.close - puts "config::Successfully created telemetry file for daemonset" - end - else - ConfigParseErrorLogger.logError("Typecheck failed for prometheus config settings for daemonset, using defaults, please use right types for all settings") - end # end of type check condition - rescue => errorStr - ConfigParseErrorLogger.logError("Exception while parsing config file for prometheus config for daemonset: #{errorStr}, using defaults, please check correctness of configmap") - puts "****************End Prometheus Config Processing********************" - end - end # end of controller type check - end - else - ConfigParseErrorLogger.logError("Controller undefined while processing prometheus config, using defaults") - end -end - -@configSchemaVersion = ENV["AZMON_AGENT_CFG_SCHEMA_VERSION"] -puts "****************Start Prometheus Config Processing********************" -if !@configSchemaVersion.nil? && !@configSchemaVersion.empty? && @configSchemaVersion.strip.casecmp("v1") == 0 #note v1 is the only supported schema version , so hardcoding it - configMapSettings = parseConfigMap - if !configMapSettings.nil? 
- populateSettingValuesFromConfigMap(configMapSettings) - end -else - if (File.file?(@promConfigMapMountPath)) - ConfigParseErrorLogger.logError("config::unsupported/missing config schema version - '#{@configSchemaVersion}' , using defaults, please use supported version") - else - puts "config::No configmap mounted for prometheus custom config, using defaults" - end -end -puts "****************End Prometheus Config Processing********************" diff --git a/build/windows/installer/conf/fluent-bit.conf b/build/windows/installer/conf/fluent-bit.conf index 879ee4810..1eebe5fd6 100644 --- a/build/windows/installer/conf/fluent-bit.conf +++ b/build/windows/installer/conf/fluent-bit.conf @@ -12,6 +12,15 @@ Chunk_Size 32 Buffer_Size 64 +[INPUT] + Name tcp + Tag oms.container.perf.telegraf.* + Listen 0.0.0.0 + Port 25229 + Chunk_Size 32 + Buffer_Size 64 + Mem_Buf_Limit 5m + [OUTPUT] Name oms EnableTelemetry true diff --git a/build/windows/installer/conf/telegraf.conf b/build/windows/installer/conf/telegraf.conf new file mode 100644 index 000000000..5f4d2364e --- /dev/null +++ b/build/windows/installer/conf/telegraf.conf @@ -0,0 +1,162 @@ +# Telegraf Configuration +# +# Telegraf is entirely plugin driven. All metrics are gathered from the +# declared inputs, and sent to the declared outputs. +# +# Plugins must be declared in here to be active. +# To deactivate a plugin, comment out the name and any variables. +# +# Use 'telegraf -config telegraf.conf -test' to see what metrics a config +# file would generate. +# +# Environment variables can be used anywhere in this config file, simply prepend +# them with $. For strings the variable must be within quotes (ie, "$STR_VAR"), +# for numbers and booleans they should be plain (ie, $INT_VAR, $BOOL_VAR) + + +# Global tags can be specified here in key="value" format. +[global_tags] + hostName = "placeholder_hostname" + + +# Configuration for telegraf agent +[agent] + ## Default data collection interval for all inputs + interval = "60s" + ## Rounds collection interval to 'interval' + ## ie, if interval="10s" then always collect on :00, :10, :20, etc. + round_interval = true + + ## Telegraf will send metrics to outputs in batches of at most + ## metric_batch_size metrics. + ## This controls the size of writes that Telegraf sends to output plugins. + metric_batch_size = 1000 + + ## For failed writes, telegraf will cache metric_buffer_limit metrics for each + ## output, and will flush this buffer on a successful write. Oldest metrics + ## are dropped first when this buffer fills. + ## This buffer only fills when writes fail to output plugin(s). + metric_buffer_limit = 10000 + + ## Collection jitter is used to jitter the collection by a random amount. + ## Each plugin will sleep for a random time within jitter before collecting. + ## This can be used to avoid many plugins querying things like sysfs at the + ## same time, which can have a measurable effect on the system. + collection_jitter = "0s" + + ## Default flushing interval for all outputs. You shouldn't set this below + ## interval. Maximum flush_interval will be flush_interval + flush_jitter + flush_interval = "15s" + ## Jitter the flush interval by a random amount. This is primarily to avoid + ## large write spikes for users running a large number of telegraf instances. 
+ ## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s + flush_jitter = "0s" + + ## By default or when set to "0s", precision will be set to the same + ## timestamp order as the collection interval, with the maximum being 1s. + ## ie, when interval = "10s", precision will be "1s" + ## when interval = "250ms", precision will be "1ms" + ## Precision will NOT be used for service inputs. It is up to each individual + ## service input to set the timestamp at the appropriate precision. + ## Valid time units are "ns", "us" (or "µs"), "ms", "s". + precision = "" + + ## Logging configuration: + ## Run telegraf with debug log messages. + debug = false + ## Run telegraf in quiet mode (error log messages only). + quiet = true + ## Specify the log file name. The empty string means to log to stderr. + logfile = "" + ## Override default hostname, if empty use os.Hostname() + #hostname = "placeholder_hostname" + ## If set to true, do not set the "host" tag in the telegraf agent. + omit_hostname = true + + +############################################################################### +# OUTPUT PLUGINS # +############################################################################### + +# Generic socket writer capable of handling multiple socket types. +[[outputs.socket_writer]] + ## URL to connect to + address = "tcp://0.0.0.0:25229" + # address = "tcp://example.com:http" + # address = "tcp4://127.0.0.1:8094" + # address = "tcp6://127.0.0.1:8094" + # address = "tcp6://[2001:db8::1]:8094" + # address = "udp://127.0.0.1:8094" + # address = "udp4://127.0.0.1:8094" + # address = "udp6://127.0.0.1:8094" + # address = "unix:///tmp/telegraf.sock" + # address = "unixgram:///tmp/telegraf.sock" + + ## Optional TLS Config + # tls_ca = "/etc/telegraf/ca.pem" + # tls_cert = "/etc/telegraf/cert.pem" + # tls_key = "/etc/telegraf/key.pem" + ## Use TLS but skip chain & host verification + # insecure_skip_verify = false + + ## Period between keep alive probes. + ## Only applies to TCP sockets. + ## 0 disables keep alive probes. + ## Defaults to the OS configuration. + # keep_alive_period = "5m" + + ## Data format to generate. + ## Each data format has its own unique set of configuration options, read + ## more about them here: + ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md + data_format = "json" + namedrop = ["agent_telemetry"] + #tagdrop = ["AgentVersion","AKS_RESOURCE_ID", "ACS_RESOURCE_NAME", "Region","ClusterName","ClusterType", "Computer", "ControllerType"] + +############################################################################### +# PROCESSOR PLUGINS # +############################################################################### + +[[processors.converter]] + [processors.converter.fields] + float = ["*"] + +#Prometheus Custom Metrics +[[inputs.prometheus]] + interval = "$AZMON_TELEGRAF_CUSTOM_PROM_INTERVAL" + + ## Scrape Kubernetes pods for the following prometheus annotations: + ## - prometheus.io/scrape: Enable scraping for this pod + ## - prometheus.io/scheme: If the metrics endpoint is secured then you will need to + ## set this to `https` & most likely set the tls config. + ## - prometheus.io/path: If the metrics path is not /metrics, define it with this annotation.
+ ## - prometheus.io/port: If port is not 9102 use this annotation + $AZMON_TELEGRAF_CUSTOM_PROM_MONITOR_PODS + $AZMON_TELEGRAF_CUSTOM_PROM_SCRAPE_SCOPE + $AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_LABEL_SELECTOR + $AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_FIELD_SELECTOR + + fieldpass = $AZMON_TELEGRAF_CUSTOM_PROM_FIELDPASS + fielddrop = $AZMON_TELEGRAF_CUSTOM_PROM_FIELDDROP + + metric_version = 2 + url_tag = "scrapeUrl" + ## Kubernetes config file to create client from. + # kube_config = "/path/to/kubernetes.config" + + ## Use bearer token for authorization. ('bearer_token' takes priority) + bearer_token = "/var/run/secrets/kubernetes.io/serviceaccount/token" + ## OR + # bearer_token_string = "abc_123" + + ## Specify timeout duration for slower prometheus clients (default is 3s) + response_timeout = "15s" + + ## Optional TLS Config + tls_ca = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + #tls_cert = /path/to/certfile + # tls_key = /path/to/keyfile + ## Use TLS but skip chain & host verification + insecure_skip_verify = true + +$AZMON_TELEGRAF_CUSTOM_PROM_PLUGINS_WITH_NAMESPACE_FILTER diff --git a/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml b/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml index 82d210f3d..8868b86bb 100644 --- a/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml @@ -81,6 +81,12 @@ spec: valueFrom: fieldRef: fieldPath: status.hostIP + - name: PODNAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SIDECAR_SCRAPING_ENABLED + value: "false" volumeMounts: - mountPath: C:\ProgramData\docker\containers name: docker-windows-containers diff --git a/charts/azuremonitor-containers/templates/omsagent-deployment.yaml b/charts/azuremonitor-containers/templates/omsagent-deployment.yaml index 37b8faacc..9b6656e9c 100644 --- a/charts/azuremonitor-containers/templates/omsagent-deployment.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-deployment.yaml @@ -72,7 +72,9 @@ spec: value: {{ .Values.Azure.Extension.Name | quote }} {{- end }} - name: USER_ASSIGNED_IDENTITY_CLIENT_ID - value: "" + value: "" + - name: SIDECAR_SCRAPING_ENABLED + value: "false" - name: ISTEST value: {{ .Values.omsagent.ISTEST | quote }} securityContext: @@ -109,6 +111,9 @@ spec: - mountPath: /etc/config/settings/adx name: omsagent-adx-secret readOnly: true + - mountPath: /etc/config/osm-settings + name: osm-settings-vol-config + readOnly: true livenessProbe: exec: command: @@ -157,5 +162,9 @@ spec: - name: omsagent-adx-secret secret: secretName: omsagent-adx-secret - optional: true + optional: true + - name: osm-settings-vol-config + configMap: + name: container-azm-ms-osmconfig + optional: true {{- end }} diff --git a/kubernetes/container-azm-ms-agentconfig.yaml b/kubernetes/container-azm-ms-agentconfig.yaml index aec1bb456..e38d9b4ab 100644 --- a/kubernetes/container-azm-ms-agentconfig.yaml +++ b/kubernetes/container-azm-ms-agentconfig.yaml @@ -76,6 +76,17 @@ data: ## ex: monitor_kubernetes_pods_namespaces = ["default1", "default2", "default3"] # monitor_kubernetes_pods_namespaces = ["default1"] + ## Label selector to target pods which have the specified label + ## This will take effect when monitor_kubernetes_pods is set to true + ## Reference the docs at https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#label-selectors + # kubernetes_label_selector = "env=dev,app=nginx" + + ## Field selector to target pods 
which have the specified field + ## This will take effect when monitor_kubernetes_pods is set to true + ## Reference the docs at https://kubernetes.io/docs/concepts/overview/working-with-objects/field-selectors/ + ## eg. To scrape pods on a specific node + # kubernetes_field_selector = "spec.nodeName=$HOSTNAME" + [prometheus_data_collection_settings.node] # Node level scrape endpoint(s). These metrics will be scraped from agent's DaemonSet running in every node in the cluster # Any errors related to prometheus scraping can be found in the KubeMonAgentEvents table in the Log Analytics workspace that the cluster is sending data to. diff --git a/kubernetes/container-azm-ms-osmconfig.yaml b/kubernetes/container-azm-ms-osmconfig.yaml new file mode 100644 index 000000000..05b7ac3ed --- /dev/null +++ b/kubernetes/container-azm-ms-osmconfig.yaml @@ -0,0 +1,17 @@ +kind: ConfigMap +apiVersion: v1 +data: + schema-version: + #string.used by agent to parse OSM config. supported versions are {v1}. Configs with other schema versions will be rejected by the agent. + v1 + config-version: + #string.used by OSM addon team to keep track of this config file's version in their source control/repository (max allowed 10 chars, other chars will be truncated) + ver1 + osm-metric-collection-configuration: |- + # OSM metric collection settings + [osm_metric_collection_configuration.settings] + # Namespaces to monitor + # monitor_namespaces = ["namespace1", "namespace2"] +metadata: + name: container-azm-ms-osmconfig + namespace: kube-system diff --git a/kubernetes/linux/Dockerfile b/kubernetes/linux/Dockerfile index bee718a31..bcdc31330 100644 --- a/kubernetes/linux/Dockerfile +++ b/kubernetes/linux/Dockerfile @@ -17,7 +17,7 @@ ENV KUBE_CLIENT_BACKOFF_BASE 1 ENV KUBE_CLIENT_BACKOFF_DURATION 0 ENV RUBY_GC_HEAP_OLDOBJECT_LIMIT_FACTOR 0.9 RUN /usr/bin/apt-get update && /usr/bin/apt-get install -y libc-bin wget openssl curl sudo python-ctypes init-system-helpers net-tools rsyslog cron vim dmidecode apt-transport-https gnupg && rm -rf /var/lib/apt/lists/* -COPY setup.sh main.sh defaultpromenvvariables defaultpromenvvariables-rs mdsd.xml envmdsd $tmpdir/ +COPY setup.sh main.sh defaultpromenvvariables defaultpromenvvariables-rs defaultpromenvvariables-sidecar mdsd.xml envmdsd $tmpdir/ WORKDIR ${tmpdir} # copy docker provider shell bundle to use the agent image diff --git a/kubernetes/linux/defaultpromenvvariables-rs b/kubernetes/linux/defaultpromenvvariables-rs index 1346e62b9..920f4e90e 100644 --- a/kubernetes/linux/defaultpromenvvariables-rs +++ b/kubernetes/linux/defaultpromenvvariables-rs @@ -1,7 +1,12 @@ -export AZMON_RS_PROM_INTERVAL="1m" -export AZMON_RS_PROM_MONITOR_PODS="monitor_kubernetes_pods = false" -export AZMON_RS_PROM_FIELDPASS="[]" -export AZMON_RS_PROM_FIELDDROP="[]" -export AZMON_RS_PROM_URLS="[]" -export AZMON_RS_PROM_K8S_SERVICES="[]" -export AZMON_RS_PROM_PLUGINS_WITH_NAMESPACE_FILTER="" +export AZMON_TELEGRAF_CUSTOM_PROM_INTERVAL="1m" +export AZMON_TELEGRAF_CUSTOM_PROM_MONITOR_PODS="monitor_kubernetes_pods = false" +export AZMON_TELEGRAF_CUSTOM_PROM_SCRAPE_SCOPE="pod_scrape_scope = 'cluster'" +export AZMON_TELEGRAF_CUSTOM_PROM_FIELDPASS="[]" +export AZMON_TELEGRAF_CUSTOM_PROM_FIELDDROP="[]" +export AZMON_TELEGRAF_CUSTOM_PROM_URLS="[]" +export AZMON_TELEGRAF_CUSTOM_PROM_K8S_SERVICES="[]" +export AZMON_TELEGRAF_CUSTOM_PROM_PLUGINS_WITH_NAMESPACE_FILTER="" +export AZMON_TELEGRAF_OSM_PROM_PLUGINS="" +export AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_LABEL_SELECTOR="kubernetes_label_selector = ''" +export 
AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_FIELD_SELECTOR="kubernetes_field_selector = ''" + diff --git a/kubernetes/linux/defaultpromenvvariables-sidecar b/kubernetes/linux/defaultpromenvvariables-sidecar new file mode 100644 index 000000000..3301488d8 --- /dev/null +++ b/kubernetes/linux/defaultpromenvvariables-sidecar @@ -0,0 +1,9 @@ +export AZMON_TELEGRAF_CUSTOM_PROM_INTERVAL="1m" +export AZMON_TELEGRAF_CUSTOM_PROM_MONITOR_PODS="monitor_kubernetes_pods = false" +export AZMON_TELEGRAF_CUSTOM_PROM_SCRAPE_SCOPE="pod_scrape_scope = 'node'" +export AZMON_TELEGRAF_CUSTOM_PROM_FIELDPASS="[]" +export AZMON_TELEGRAF_CUSTOM_PROM_FIELDDROP="[]" +export AZMON_TELEGRAF_CUSTOM_PROM_PLUGINS_WITH_NAMESPACE_FILTER="" +export AZMON_TELEGRAF_OSM_PROM_PLUGINS="" +export AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_LABEL_SELECTOR="kubernetes_label_selector = ''" +export AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_FIELD_SELECTOR="kubernetes_field_selector = ''" diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index c4067f25e..71e46875b 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -2,7 +2,17 @@ if [ -e "/etc/config/kube.conf" ]; then cat /etc/config/kube.conf > /etc/opt/microsoft/omsagent/sysconf/omsagent.d/container.conf +elif [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then + echo "setting omsagent conf file for prometheus sidecar" + cat /etc/opt/microsoft/docker-cimprov/prometheus-side-car.conf > /etc/opt/microsoft/omsagent/sysconf/omsagent.d/container.conf + # omsadmin.sh replaces %MONITOR_AGENT_PORT% and %SYSLOG_PORT% in the monitor.conf and syslog.conf with default ports 25324 and 25224. + # Since we are running 2 omsagents in the same pod, we need to use a different port for the sidecar, + # else we will see an Address already in use - bind(2) error for 0.0.0.0:25324 or 0.0.0.0:25224. + # Look into the omsadmin.sh script's configure_monitor_agent()/configure_syslog() and find_available_port() methods for more info. + sed -i -e 's/port %MONITOR_AGENT_PORT%/port 25326/g' /etc/opt/microsoft/omsagent/sysconf/omsagent.d/monitor.conf + sed -i -e 's/port %SYSLOG_PORT%/port 25226/g' /etc/opt/microsoft/omsagent/sysconf/omsagent.d/syslog.conf else + echo "setting omsagent conf file for daemonset" sed -i -e 's/bind 127.0.0.1/bind 0.0.0.0/g' /etc/opt/microsoft/omsagent/sysconf/omsagent.d/container.conf fi sed -i -e 's/bind 127.0.0.1/bind 0.0.0.0/g' /etc/opt/microsoft/omsagent/sysconf/omsagent.d/syslog.conf @@ -28,6 +38,12 @@ sudo setfacl -m user:omsagent:rwx /var/opt/microsoft/docker-cimprov/log #Run inotify as a daemon to track changes to the mounted configmap. inotifywait /etc/config/settings --daemon --recursive --outfile "/opt/inotifyoutput.txt" --event create,delete --format '%e : %T' --timefmt '+%s' +#Run inotify as a daemon to track changes to the mounted configmap for OSM settings. +if [[ ( ( ! -e "/etc/config/kube.conf" ) && ( "${CONTAINER_TYPE}" == "PrometheusSidecar" ) ) || + ( ( -e "/etc/config/kube.conf" ) && ( "${SIDECAR_SCRAPING_ENABLED}" == "false" ) ) ]]; then + inotifywait /etc/config/osm-settings --daemon --recursive --outfile "/opt/inotifyoutput-osm.txt" --event create,delete --format '%e : %T' --timefmt '+%s' +fi + #resourceid override for loganalytics data. if [ -z $AKS_RESOURCE_ID ]; then echo "not setting customResourceId" @@ -68,6 +84,24 @@ if [ -e "/etc/config/settings/config-version" ] && [ -s "/etc/config/settings/ echo "AZMON_AGENT_CFG_FILE_VERSION:$AZMON_AGENT_CFG_FILE_VERSION" fi +#set OSM config schema version +if [[ ( ( !
-e "/etc/config/kube.conf" ) && ( "${CONTAINER_TYPE}" == "PrometheusSidecar" ) ) || + ( ( -e "/etc/config/kube.conf" ) && ( "${SIDECAR_SCRAPING_ENABLED}" == "false" ) ) ]]; then + if [ -e "/etc/config/osm-settings/schema-version" ] && [ -s "/etc/config/osm-settings/schema-version" ]; then + #trim + osm_config_schema_version="$(cat /etc/config/osm-settings/schema-version | xargs)" + #remove all spaces + osm_config_schema_version="${osm_config_schema_version//[[:space:]]/}" + #take first 10 characters + osm_config_schema_version="$(echo $osm_config_schema_version| cut -c1-10)" + + export AZMON_OSM_CFG_SCHEMA_VERSION=$osm_config_schema_version + echo "export AZMON_OSM_CFG_SCHEMA_VERSION=$osm_config_schema_version" >> ~/.bashrc + source ~/.bashrc + echo "AZMON_OSM_CFG_SCHEMA_VERSION:$AZMON_OSM_CFG_SCHEMA_VERSION" + fi +fi + export PROXY_ENDPOINT="" # Check for internet connectivity or workspace deletion @@ -193,71 +227,58 @@ echo "export TELEMETRY_APPLICATIONINSIGHTS_KEY=$aikey" >> ~/.bashrc source ~/.bashrc +if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then + #Parse the configmap to set the right environment variables. + /opt/microsoft/omsagent/ruby/bin/ruby tomlparser.rb -#Parse the configmap to set the right environment variables. -/opt/microsoft/omsagent/ruby/bin/ruby tomlparser.rb - -cat config_env_var | while read line; do - #echo $line - echo $line >> ~/.bashrc -done -source config_env_var - + cat config_env_var | while read line; do + echo $line >> ~/.bashrc + done + source config_env_var +fi #Parse the configmap to set the right environment variables for agent config. #Note > tomlparser-agent-config.rb has to be parsed first before td-agent-bit-conf-customizer.rb for fbit agent settings -/opt/microsoft/omsagent/ruby/bin/ruby tomlparser-agent-config.rb +if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then + /opt/microsoft/omsagent/ruby/bin/ruby tomlparser-agent-config.rb -cat agent_config_env_var | while read line; do - #echo $line - echo $line >> ~/.bashrc -done -source agent_config_env_var + cat agent_config_env_var | while read line; do + #echo $line + echo $line >> ~/.bashrc + done + source agent_config_env_var -#Parse the configmap to set the right environment variables for network policy manager (npm) integration. -/opt/microsoft/omsagent/ruby/bin/ruby tomlparser-npm-config.rb + #Parse the configmap to set the right environment variables for network policy manager (npm) integration. + /opt/microsoft/omsagent/ruby/bin/ruby tomlparser-npm-config.rb -cat integration_npm_config_env_var | while read line; do - #echo $line - echo $line >> ~/.bashrc -done -source integration_npm_config_env_var + cat integration_npm_config_env_var | while read line; do + #echo $line + echo $line >> ~/.bashrc + done + source integration_npm_config_env_var +fi #Replace the placeholders in td-agent-bit.conf file for fluentbit with custom/default values in daemonset -if [ ! -e "/etc/config/kube.conf" ]; then +if [ ! -e "/etc/config/kube.conf" ] && [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then /opt/microsoft/omsagent/ruby/bin/ruby td-agent-bit-conf-customizer.rb fi #Parse the prometheus configmap to create a file with new custom settings. /opt/microsoft/omsagent/ruby/bin/ruby tomlparser-prom-customconfig.rb -#If config parsing was successful, a copy of the conf file with replaced custom settings file is created -if [ ! 
-e "/etc/config/kube.conf" ]; then - if [ -e "/opt/telegraf-test.conf" ]; then - echo "****************Start Telegraf in Test Mode**************************" - /opt/telegraf --config /opt/telegraf-test.conf -test - if [ $? -eq 0 ]; then - mv "/opt/telegraf-test.conf" "/etc/opt/microsoft/docker-cimprov/telegraf.conf" - fi - echo "****************End Telegraf Run in Test Mode**************************" - fi -else - if [ -e "/opt/telegraf-test-rs.conf" ]; then - echo "****************Start Telegraf in Test Mode**************************" - /opt/telegraf --config /opt/telegraf-test-rs.conf -test - if [ $? -eq 0 ]; then - mv "/opt/telegraf-test-rs.conf" "/etc/opt/microsoft/docker-cimprov/telegraf-rs.conf" - fi - echo "****************End Telegraf Run in Test Mode**************************" - fi -fi - #Setting default environment variables to be used in any case of failure in the above steps if [ ! -e "/etc/config/kube.conf" ]; then - cat defaultpromenvvariables | while read line; do - echo $line >> ~/.bashrc - done - source defaultpromenvvariables + if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then + cat defaultpromenvvariables-sidecar | while read line; do + echo $line >> ~/.bashrc + done + source defaultpromenvvariables-sidecar + else + cat defaultpromenvvariables | while read line; do + echo $line >> ~/.bashrc + done + source defaultpromenvvariables + fi else cat defaultpromenvvariables-rs | while read line; do echo $line >> ~/.bashrc @@ -273,21 +294,37 @@ if [ -e "telemetry_prom_config_env_var" ]; then source telemetry_prom_config_env_var fi + #Parse the configmap to set the right environment variables for MDM metrics configuration for Alerting. -/opt/microsoft/omsagent/ruby/bin/ruby tomlparser-mdm-metrics-config.rb +if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then + /opt/microsoft/omsagent/ruby/bin/ruby tomlparser-mdm-metrics-config.rb -cat config_mdm_metrics_env_var | while read line; do - echo $line >> ~/.bashrc -done -source config_mdm_metrics_env_var + cat config_mdm_metrics_env_var | while read line; do + echo $line >> ~/.bashrc + done + source config_mdm_metrics_env_var -#Parse the configmap to set the right environment variables for metric collection settings -/opt/microsoft/omsagent/ruby/bin/ruby tomlparser-metric-collection-config.rb + #Parse the configmap to set the right environment variables for metric collection settings + /opt/microsoft/omsagent/ruby/bin/ruby tomlparser-metric-collection-config.rb -cat config_metric_collection_env_var | while read line; do - echo $line >> ~/.bashrc -done -source config_metric_collection_env_var + cat config_metric_collection_env_var | while read line; do + echo $line >> ~/.bashrc + done + source config_metric_collection_env_var +fi + +# OSM scraping is done in the replicaset if sidecar scraping is disabled, and always done from the sidecar when it is enabled (it will always be exactly one of the two) +if [[ ( ( !
-e "/etc/config/kube.conf" ) && ( "${CONTAINER_TYPE}" == "PrometheusSidecar" ) ) || + ( ( -e "/etc/config/kube.conf" ) && ( "${SIDECAR_SCRAPING_ENABLED}" == "false" ) ) ]]; then + /opt/microsoft/omsagent/ruby/bin/ruby tomlparser-osm-config.rb + + if [ -e "integration_osm_config_env_var" ]; then + cat integration_osm_config_env_var | while read line; do + echo $line >> ~/.bashrc + done + source integration_osm_config_env_var + fi +fi #Setting environment variable for CAdvisor metrics to use port 10255/10250 based on curl request echo "Making wget request to cadvisor endpoint with port 10250" @@ -511,7 +548,7 @@ fi #start oneagent -if [ ! -e "/etc/config/kube.conf" ]; then +if [ ! -e "/etc/config/kube.conf" ] && [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then if [ ! -z $AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE ]; then echo "container logs configmap route is $AZMON_CONTAINER_LOGS_ROUTE" echo "container logs effective route is $AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE" @@ -552,18 +589,56 @@ if [ ! -e "/etc/config/kube.conf" ]; then fi echo "************end oneagent log routing checks************" +#If config parsing was successful, a copy of the conf file with replaced custom settings file is created +if [ ! -e "/etc/config/kube.conf" ]; then + if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ] && [ -e "/opt/telegraf-test-prom-side-car.conf" ]; then + echo "****************Start Telegraf in Test Mode**************************" + /opt/telegraf --config /opt/telegraf-test-prom-side-car.conf -test + if [ $? -eq 0 ]; then + mv "/opt/telegraf-test-prom-side-car.conf" "/etc/opt/microsoft/docker-cimprov/telegraf-prom-side-car.conf" + fi + echo "****************End Telegraf Run in Test Mode**************************" + else + if [ -e "/opt/telegraf-test.conf" ]; then + echo "****************Start Telegraf in Test Mode**************************" + /opt/telegraf --config /opt/telegraf-test.conf -test + if [ $? -eq 0 ]; then + mv "/opt/telegraf-test.conf" "/etc/opt/microsoft/docker-cimprov/telegraf.conf" + fi + echo "****************End Telegraf Run in Test Mode**************************" + fi + fi +else + if [ -e "/opt/telegraf-test-rs.conf" ]; then + echo "****************Start Telegraf in Test Mode**************************" + /opt/telegraf --config /opt/telegraf-test-rs.conf -test + if [ $? -eq 0 ]; then + mv "/opt/telegraf-test-rs.conf" "/etc/opt/microsoft/docker-cimprov/telegraf-rs.conf" + fi + echo "****************End Telegraf Run in Test Mode**************************" + fi +fi + #telegraf & fluentbit requirements if [ ! 
-e "/etc/config/kube.conf" ]; then - if [ "$CONTAINER_RUNTIME" == "docker" ]; then - /opt/td-agent-bit/bin/td-agent-bit -c /etc/opt/microsoft/docker-cimprov/td-agent-bit.conf -e /opt/td-agent-bit/bin/out_oms.so & - telegrafConfFile="/etc/opt/microsoft/docker-cimprov/telegraf.conf" + if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then + echo "starting fluent-bit and setting telegraf conf file for prometheus sidecar" + /opt/td-agent-bit/bin/td-agent-bit -c /etc/opt/microsoft/docker-cimprov/td-agent-bit-prom-side-car.conf -e /opt/td-agent-bit/bin/out_oms.so & + telegrafConfFile="/etc/opt/microsoft/docker-cimprov/telegraf-prom-side-car.conf" else - echo "since container run time is $CONTAINER_RUNTIME update the container log fluentbit Parser to cri from docker" - sed -i 's/Parser.docker*/Parser cri/' /etc/opt/microsoft/docker-cimprov/td-agent-bit.conf - /opt/td-agent-bit/bin/td-agent-bit -c /etc/opt/microsoft/docker-cimprov/td-agent-bit.conf -e /opt/td-agent-bit/bin/out_oms.so & - telegrafConfFile="/etc/opt/microsoft/docker-cimprov/telegraf.conf" + echo "starting fluent-bit and setting telegraf conf file for daemonset" + if [ "$CONTAINER_RUNTIME" == "docker" ]; then + /opt/td-agent-bit/bin/td-agent-bit -c /etc/opt/microsoft/docker-cimprov/td-agent-bit.conf -e /opt/td-agent-bit/bin/out_oms.so & + telegrafConfFile="/etc/opt/microsoft/docker-cimprov/telegraf.conf" + else + echo "since container run time is $CONTAINER_RUNTIME update the container log fluentbit Parser to cri from docker" + sed -i 's/Parser.docker*/Parser cri/' /etc/opt/microsoft/docker-cimprov/td-agent-bit.conf + /opt/td-agent-bit/bin/td-agent-bit -c /etc/opt/microsoft/docker-cimprov/td-agent-bit.conf -e /opt/td-agent-bit/bin/out_oms.so & + telegrafConfFile="/etc/opt/microsoft/docker-cimprov/telegraf.conf" + fi fi else + echo "starting fluent-bit and setting telegraf conf file for replicaset" /opt/td-agent-bit/bin/td-agent-bit -c /etc/opt/microsoft/docker-cimprov/td-agent-bit-rs.conf -e /opt/td-agent-bit/bin/out_oms.so & telegrafConfFile="/etc/opt/microsoft/docker-cimprov/telegraf-rs.conf" fi diff --git a/kubernetes/linux/setup.sh b/kubernetes/linux/setup.sh index fe6c0565a..218e3c717 100644 --- a/kubernetes/linux/setup.sh +++ b/kubernetes/linux/setup.sh @@ -60,7 +60,13 @@ sudo apt-get install libcap2-bin -y #service telegraf stop -wget https://github.com/microsoft/Docker-Provider/releases/download/5.0.0.0/telegraf +#wget https://github.com/microsoft/Docker-Provider/releases/download/5.0.0.0/telegraf + +#1.18 pre-release +wget https://dl.influxdata.com/telegraf/releases/telegraf-1.18.0_linux_amd64.tar.gz +tar -zxvf telegraf-1.18.0_linux_amd64.tar.gz + +mv /opt/telegraf-1.18.0/usr/bin/telegraf /opt/telegraf chmod 777 /opt/telegraf diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index ebf0257af..c25b9bfd4 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -443,6 +443,59 @@ spec: - /opt/livenessprobe.sh initialDelaySeconds: 60 periodSeconds: 60 +#Only in sidecar scraping mode + - name: omsagent-prometheus + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod11092020" + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: 500m + memory: 400Mi + requests: + cpu: 75m + memory: 225Mi + env: + # azure devops pipeline uses AKS_RESOURCE_ID and AKS_REGION hence ensure to uncomment these + - name: AKS_RESOURCE_ID + value: "VALUE_AKS_RESOURCE_ID_VALUE" + - name: AKS_REGION + value: "VALUE_AKS_RESOURCE_REGION_VALUE" + #Uncomment below two lines for ACS clusters and set the 
cluster names manually. Also comment out the above two lines for ACS clusters + #- name: ACS_RESOURCE_NAME + # value: "my_acs_cluster_name" + - name: CONTAINER_TYPE + value: "PrometheusSidecar" + - name: CONTROLLER_TYPE + value: "DaemonSet" + - name: NODE_IP + valueFrom: + fieldRef: + fieldPath: status.hostIP + # Update this with the user assigned msi client id for omsagent + - name: USER_ASSIGNED_IDENTITY_CLIENT_ID + value: "" + securityContext: + privileged: true + volumeMounts: + - mountPath: /etc/kubernetes/host + name: azure-json-path + - mountPath: /etc/omsagent-secret + name: omsagent-secret + readOnly: true + - mountPath: /etc/config/settings + name: settings-vol-config + readOnly: true + - mountPath: /etc/config/osm-settings + name: osm-settings-vol-config + readOnly: true + livenessProbe: + exec: + command: + - /bin/bash + - -c + - /opt/livenessprobe.sh + initialDelaySeconds: 60 + periodSeconds: 60 affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -502,6 +555,10 @@ spec: secret: secretName: omsagent-adx-secret optional: true + - name: osm-settings-vol-config + configMap: + name: container-azm-ms-osmconfig + optional: true --- apiVersion: apps/v1 kind: Deployment @@ -559,6 +616,9 @@ spec: # Update this with the user assigned msi client id for omsagent - name: USER_ASSIGNED_IDENTITY_CLIENT_ID value: "" + # Add the below environment variable to true only in sidecar enabled regions, else set it to false + - name: SIDECAR_SCRAPING_ENABLED + value: "true" securityContext: privileged: true ports: @@ -586,6 +646,8 @@ spec: readOnly: true - mountPath: /etc/config/settings/adx name: omsagent-adx-secret + - mountPath: /etc/config/osm-settings + name: osm-settings-vol-config readOnly: true livenessProbe: exec: @@ -658,6 +720,10 @@ spec: secret: secretName: omsagent-adx-secret optional: true + - name: osm-settings-vol-config + configMap: + name: container-azm-ms-osmconfig + optional: true --- apiVersion: apps/v1 kind: DaemonSet @@ -711,10 +777,16 @@ spec: valueFrom: fieldRef: fieldPath: spec.nodeName + - name: PODNAME + valueFrom: + fieldRef: + fieldPath: metadata.name - name: NODE_IP valueFrom: fieldRef: fieldPath: status.hostIP + - name: SIDECAR_SCRAPING_ENABLED + value: "true" volumeMounts: - mountPath: C:\ProgramData\docker\containers name: docker-windows-containers diff --git a/kubernetes/windows/Dockerfile b/kubernetes/windows/Dockerfile index d4f118449..c0bebcc93 100644 --- a/kubernetes/windows/Dockerfile +++ b/kubernetes/windows/Dockerfile @@ -47,6 +47,7 @@ RUN ./setup.ps1 COPY main.ps1 /opt/omsagentwindows/scripts/powershell COPY ./omsagentwindows/installer/scripts/filesystemwatcher.ps1 /opt/omsagentwindows/scripts/powershell COPY ./omsagentwindows/installer/scripts/livenessprobe.cmd /opt/omsagentwindows/scripts/cmd/ +COPY setdefaulttelegrafenvvariables.ps1 /opt/omsagentwindows/scripts/powershell # copy ruby scripts to /opt folder COPY ./omsagentwindows/installer/scripts/*.rb /opt/omsagentwindows/scripts/ruby/ @@ -62,6 +63,9 @@ COPY ./omsagentwindows/installer/conf/fluent-docker-parser.conf /etc/fluent/ COPY ./omsagentwindows/installer/conf/fluent-bit.conf /etc/fluent-bit COPY ./omsagentwindows/installer/conf/out_oms.conf /etc/omsagentwindows +# copy telegraf conf file +COPY ./omsagentwindows/installer/conf/telegraf.conf /etc/telegraf/ + # copy keepcert alive ruby scripts COPY ./omsagentwindows/installer/scripts/rubyKeepCertificateAlive/*.rb /etc/fluent/plugin/ diff --git a/kubernetes/windows/main.ps1 b/kubernetes/windows/main.ps1 index 722392157..95cba2579 
100644 --- a/kubernetes/windows/main.ps1 +++ b/kubernetes/windows/main.ps1 @@ -273,9 +273,9 @@ function Get-ContainerRuntime { return $containerRuntime } -function Start-Fluent { +function Start-Fluent-Telegraf { - # Run fluent-bit service first so that we do not miss any logs being forwarded by the fluentd service. + # Run fluent-bit service first so that we do not miss any logs being forwarded by the fluentd service and telegraf service. # Run fluent-bit as a background job. Switch this to a windows service once fluent-bit supports natively running as a windows service Start-Job -ScriptBlock { Start-Process -NoNewWindow -FilePath "C:\opt\fluent-bit\bin\fluent-bit.exe" -ArgumentList @("-c", "C:\etc\fluent-bit\fluent-bit.conf", "-e", "C:\opt\omsagentwindows\out_oms.so") } @@ -289,35 +289,99 @@ function Start-Fluent { (Get-Content -Path C:/etc/fluent/fluent.conf -Raw) -replace 'fluent-docker-parser.conf','fluent-cri-parser.conf' | Set-Content C:/etc/fluent/fluent.conf } + # Start telegraf only in sidecar scraping mode + $sidecarScrapingEnabled = [System.Environment]::GetEnvironmentVariable('SIDECAR_SCRAPING_ENABLED') + if (![string]::IsNullOrEmpty($sidecarScrapingEnabled) -and $sidecarScrapingEnabled.ToLower() -eq 'true') + { + Write-Host "Starting telegraf..." + Start-Telegraf + } + fluentd --reg-winsvc i --reg-winsvc-auto-start --winsvc-name fluentdwinaks --reg-winsvc-fluentdopt '-c C:/etc/fluent/fluent.conf -o C:/etc/fluent/fluent.log' Notepad.exe | Out-Null } -function Generate-Certificates { - Write-Host "Generating Certificates" - C:\\opt\\omsagentwindows\\certgenerator\\certificategenerator.exe -} +function Start-Telegraf { + # Set default telegraf environment variables for prometheus scraping + Write-Host "**********Setting default environment variables for telegraf prometheus plugin..." + .\setdefaulttelegrafenvvariables.ps1 + + # run prometheus custom config parser + Write-Host "**********Running config parser for custom prometheus scraping**********" + ruby /opt/omsagentwindows/scripts/ruby/tomlparser-prom-customconfig.rb + Write-Host "**********End running config parser for custom prometheus scraping**********" + + + # Set required environment variable for telegraf prometheus plugin to run properly + Write-Host "Setting required environment variables for telegraf prometheus input plugin to run properly..." + $kubernetesServiceHost = [System.Environment]::GetEnvironmentVariable("KUBERNETES_SERVICE_HOST", "process") + if (![string]::IsNullOrEmpty($kubernetesServiceHost)) { + [System.Environment]::SetEnvironmentVariable("KUBERNETES_SERVICE_HOST", $kubernetesServiceHost, "machine") + Write-Host "Successfully set environment variable KUBERNETES_SERVICE_HOST - $($kubernetesServiceHost) for target 'machine'..." + } + else { + Write-Host "Failed to set environment variable KUBERNETES_SERVICE_HOST for target 'machine' since it is either null or empty" + } + + $kubernetesServicePort = [System.Environment]::GetEnvironmentVariable("KUBERNETES_SERVICE_PORT", "process") + if (![string]::IsNullOrEmpty($kubernetesServicePort)) { + [System.Environment]::SetEnvironmentVariable("KUBERNETES_SERVICE_PORT", $kubernetesServicePort, "machine") + Write-Host "Successfully set environment variable KUBERNETES_SERVICE_PORT - $($kubernetesServicePort) for target 'machine'..." 
+ } + else { + Write-Host "Failed to set environment variable KUBERNETES_SERVICE_PORT for target 'machine' since it is either null or empty" + } + + $nodeIp = [System.Environment]::GetEnvironmentVariable("NODE_IP", "process") + if (![string]::IsNullOrEmpty($nodeIp)) { + [System.Environment]::SetEnvironmentVariable("NODE_IP", $nodeIp, "machine") + Write-Host "Successfully set environment variable NODE_IP - $($nodeIp) for target 'machine'..." + } + else { + Write-Host "Failed to set environment variable NODE_IP for target 'machine' since it is either null or empty" + } -function Bootstrap-CACertificates { + Write-Host "Installing telegraf service" + C:\opt\telegraf\telegraf.exe --service install --config "C:\etc\telegraf\telegraf.conf" + + # Setting delay auto start for telegraf since there have been known issues with windows server and telegraf - + # https://github.com/influxdata/telegraf/issues/4081 + # https://github.com/influxdata/telegraf/issues/3601 try { - # This is required when the root CA certs are different for some clouds. - $caCerts=Invoke-WebRequest 'http://168.63.129.16/machine?comp=acmspackage&type=cacertificates&ext=json' -UseBasicParsing | ConvertFrom-Json - if (![string]::IsNullOrEmpty($caCerts)) { - $certificates = $caCerts.Certificates - for ($index = 0; $index -lt $certificates.Length ; $index++) { - $name=$certificates[$index].Name - $certificates[$index].CertBody > $name - Write-Host "name: $($name)" - Import-Certificate -FilePath .\$name -CertStoreLocation 'Cert:\LocalMachine\Root' -Verbose - } + $serverName = [System.Environment]::GetEnvironmentVariable("PODNAME", "process") + if (![string]::IsNullOrEmpty($serverName)) { + sc.exe \\$serverName config telegraf start= delayed-auto + Write-Host "Successfully set delayed start for telegraf" + + } else { + Write-Host "Failed to get environment variable PODNAME to set delayed telegraf start" + } } catch { - $e = $_.Exception - Write-Host $e - Write-Host "exception occured in Bootstrap-CACertificates..." + $e = $_.Exception + Write-Host $e + Write-Host "exception occurred in delayed telegraf start... continuing without exiting" } + Write-Host "Running telegraf service in test mode" + C:\opt\telegraf\telegraf.exe --config "C:\etc\telegraf\telegraf.conf" --test + Write-Host "Starting telegraf service" + C:\opt\telegraf\telegraf.exe --service start + + # Trying to start telegraf again if it did not start due to fluent bit not being ready at startup + Get-Service telegraf | findstr Running + if ($? -eq $false) + { + Write-Host "trying to start telegraf again in 30 seconds, since fluentbit might not have been ready..." + Start-Sleep -s 30 + C:\opt\telegraf\telegraf.exe --service start + Get-Service telegraf + } +} + +function Generate-Certificates { + Write-Host "Generating Certificates" + C:\\opt\\omsagentwindows\\certgenerator\\certificategenerator.exe } function Test-CertificatePath { @@ -346,16 +410,9 @@ Remove-WindowsServiceIfItExists "fluentdwinaks" Set-EnvironmentVariables Start-FileSystemWatcher -#Bootstrapping CA certs for non public clouds and AKS clusters -$aksResourceId = [System.Environment]::GetEnvironmentVariable("AKS_RESOURCE_ID") -if (![string]::IsNullOrEmpty($aksResourceId) -and $aksResourceId.ToLower().Contains("/microsoft.containerservice/managedclusters/")) -{ - Bootstrap-CACertificates -} - Generate-Certificates Test-CertificatePath -Start-Fluent +Start-Fluent-Telegraf # List all powershell processes running.
This should have main.ps1 and filesystemwatcher.ps1 Get-WmiObject Win32_process | Where-Object { $_.Name -match 'powershell' } | Format-Table -Property Name, CommandLine, ProcessId diff --git a/kubernetes/windows/setdefaulttelegrafenvvariables.ps1 b/kubernetes/windows/setdefaulttelegrafenvvariables.ps1 new file mode 100644 index 000000000..269894139 --- /dev/null +++ b/kubernetes/windows/setdefaulttelegrafenvvariables.ps1 @@ -0,0 +1,17 @@ +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_INTERVAL", "1m", "process") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_INTERVAL", "1m", "machine") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_MONITOR_PODS", "monitor_kubernetes_pods = false", "process") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_MONITOR_PODS", "monitor_kubernetes_pods = false", "machine") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_SCRAPE_SCOPE", "pod_scrape_scope = 'node'", "process") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_SCRAPE_SCOPE", "pod_scrape_scope = 'node'", "machine") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_FIELDPASS", "[]", "process") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_FIELDPASS", "[]", "machine") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_FIELDDROP", "[]", "process") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_FIELDDROP", "[]", "machine") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_PLUGINS_WITH_NAMESPACE_FILTER", " ", "process") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_PLUGINS_WITH_NAMESPACE_FILTER", " ", "machine") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_LABEL_SELECTOR", "kubernetes_label_selector = ''", "process") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_LABEL_SELECTOR", "kubernetes_label_selector = ''", "machine") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_FIELD_SELECTOR", "kubernetes_field_selector = ''", "process") +[System.Environment]::SetEnvironmentVariable("AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_FIELD_SELECTOR", "kubernetes_field_selector = ''", "machine") + diff --git a/kubernetes/windows/setup.ps1 b/kubernetes/windows/setup.ps1 index dd6d52a11..25aad5e16 100644 --- a/kubernetes/windows/setup.ps1 +++ b/kubernetes/windows/setup.ps1 @@ -8,10 +8,12 @@ Write-Host ('Creating folder structure') New-Item -Type Directory -Path /opt/fluent-bit New-Item -Type Directory -Path /opt/scripts/ruby + New-Item -Type Directory -Path /opt/telegraf New-Item -Type Directory -Path /etc/fluent-bit New-Item -Type Directory -Path /etc/fluent New-Item -Type Directory -Path /etc/omsagentwindows + New-Item -Type Directory -Path /etc/telegraf New-Item -Type Directory -Path /etc/config/settings/ New-Item -Type Directory -Path /etc/config/adx/ @@ -32,6 +34,20 @@ Write-Host ('Installing Fluent Bit'); } Write-Host ('Finished Installing Fluentbit') +Write-Host ('Installing Telegraf'); +try { + $telegrafUri='https://dl.influxdata.com/telegraf/releases/telegraf-1.18.0_windows_amd64.zip' + Invoke-WebRequest -Uri $telegrafUri -OutFile /installation/telegraf.zip + Expand-Archive -Path /installation/telegraf.zip -Destination /installation/telegraf + Move-Item -Path /installation/telegraf/*/* -Destination 
/opt/telegraf/ -ErrorAction SilentlyContinue +} +catch { + $ex = $_.Exception + Write-Host "exception while downloading telegraf for windows" + Write-Host $ex + exit 1 +} +Write-Host ('Finished downloading Telegraf') Write-Host ('Installing Visual C++ Redistributable Package') $vcRedistLocation = 'https://aka.ms/vs/16/release/vc_redist.x64.exe' diff --git a/scripts/build/windows/install-build-pre-requisites.ps1 b/scripts/build/windows/install-build-pre-requisites.ps1 index b5e6e2d18..3bb56ac2a 100755 --- a/scripts/build/windows/install-build-pre-requisites.ps1 +++ b/scripts/build/windows/install-build-pre-requisites.ps1 @@ -21,7 +21,7 @@ function Install-Go { # install go lang Write-Host("installing go ...") - Start-Process msiexec.exe -Wait -ArgumentList '/I ' + $output + '/quiet' + Start-Process msiexec.exe -Wait -ArgumentList '/I ', $output, '/quiet' Write-Host("installing go completed") Write-Host "updating PATH variable" @@ -102,7 +102,7 @@ function Install-DotNetCoreSDK() { # install dotNet core sdk Write-Host("installing .net core sdk 3.1 ...") - Start-Process msiexec.exe -Wait -ArgumentList '/I ' + $output + '/quiet' + Start-Process msiexec.exe -Wait -ArgumentList '/I ', $output, '/quiet' Write-Host("installing .net core sdk 3.1 completed") } @@ -129,7 +129,7 @@ function Install-Docker() { # install docker Write-Host("installing docker for desktop ...") - Start-Process msiexec.exe -Wait -ArgumentList '/I ' + $output + '/quiet' + Start-Process msiexec.exe -Wait -ArgumentList '/I ', $output, '/quiet' Write-Host("installing docker for desktop completed") } diff --git a/source/plugins/go/src/oms.go b/source/plugins/go/src/oms.go index 0bd983297..d35acad3d 100644 --- a/source/plugins/go/src/oms.go +++ b/source/plugins/go/src/oms.go @@ -1491,4 +1491,4 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { Log("Running in replicaset. 
Disabling container enrichment caching & updates \n") } -} +} \ No newline at end of file diff --git a/source/plugins/go/src/telemetry.go b/source/plugins/go/src/telemetry.go index 3d30ac5aa..48f82a9ab 100644 --- a/source/plugins/go/src/telemetry.go +++ b/source/plugins/go/src/telemetry.go @@ -10,9 +10,9 @@ import ( "strings" "time" + "github.com/fluent/fluent-bit-go/output" "github.com/microsoft/ApplicationInsights-Go/appinsights" "github.com/microsoft/ApplicationInsights-Go/appinsights/contracts" - "github.com/fluent/fluent-bit-go/output" ) var ( @@ -44,33 +44,45 @@ var ( ContainerLogsMDSDClientCreateErrors float64 //Tracks the number of write/send errors to ADX for containerlogs (uses ContainerLogTelemetryTicker) ContainerLogsSendErrorsToADXFromFluent float64 - //Tracks the number of ADX client create errors for containerlogs (uses ContainerLogTelemetryTicker) + //Tracks the number of ADX client create errors for containerlogs (uses ContainerLogTelemetryTicker) ContainerLogsADXClientCreateErrors float64 + //Tracks the number of OSM namespaces and sent only from prometheus sidecar (uses ContainerLogTelemetryTicker) + OSMNamespaceCount int + //Tracks whether monitor kubernetes pods is set to true and sent only from prometheus sidecar (uses ContainerLogTelemetryTicker) + PromMonitorPods string + //Tracks the number of monitor kubernetes pods namespaces and sent only from prometheus sidecar (uses ContainerLogTelemetryTicker) + PromMonitorPodsNamespaceLength int + //Tracks the number of monitor kubernetes pods label selectors and sent only from prometheus sidecar (uses ContainerLogTelemetryTicker) + PromMonitorPodsLabelSelectorLength int + //Tracks the number of monitor kubernetes pods field selectors and sent only from prometheus sidecar (uses ContainerLogTelemetryTicker) + PromMonitorPodsFieldSelectorLength int ) const ( - clusterTypeACS = "ACS" - clusterTypeAKS = "AKS" - envAKSResourceID = "AKS_RESOURCE_ID" - envACSResourceName = "ACS_RESOURCE_NAME" - envAppInsightsAuth = "APPLICATIONINSIGHTS_AUTH" - envAppInsightsEndpoint = "APPLICATIONINSIGHTS_ENDPOINT" - metricNameAvgFlushRate = "ContainerLogAvgRecordsFlushedPerSec" - metricNameAvgLogGenerationRate = "ContainerLogsGeneratedPerSec" - metricNameLogSize = "ContainerLogsSize" - metricNameAgentLogProcessingMaxLatencyMs = "ContainerLogsAgentSideLatencyMs" - metricNameNumberofTelegrafMetricsSentSuccessfully = "TelegrafMetricsSentCount" - metricNameNumberofSendErrorsTelegrafMetrics = "TelegrafMetricsSendErrorCount" - metricNameNumberofSend429ErrorsTelegrafMetrics = "TelegrafMetricsSend429ErrorCount" - metricNameErrorCountContainerLogsSendErrorsToMDSDFromFluent = "ContainerLogs2MdsdSendErrorCount" - metricNameErrorCountContainerLogsMDSDClientCreateError = "ContainerLogsMdsdClientCreateErrorCount" - metricNameErrorCountContainerLogsSendErrorsToADXFromFluent = "ContainerLogs2ADXSendErrorCount" - metricNameErrorCountContainerLogsADXClientCreateError = "ContainerLogsADXClientCreateErrorCount" + clusterTypeACS = "ACS" + clusterTypeAKS = "AKS" + envAKSResourceID = "AKS_RESOURCE_ID" + envACSResourceName = "ACS_RESOURCE_NAME" + envAppInsightsAuth = "APPLICATIONINSIGHTS_AUTH" + envAppInsightsEndpoint = "APPLICATIONINSIGHTS_ENDPOINT" + metricNameAvgFlushRate = "ContainerLogAvgRecordsFlushedPerSec" + metricNameAvgLogGenerationRate = "ContainerLogsGeneratedPerSec" + metricNameLogSize = "ContainerLogsSize" + metricNameAgentLogProcessingMaxLatencyMs = "ContainerLogsAgentSideLatencyMs" + metricNameNumberofTelegrafMetricsSentSuccessfully = 
"TelegrafMetricsSentCount" + metricNameNumberofSendErrorsTelegrafMetrics = "TelegrafMetricsSendErrorCount" + metricNameNumberofSend429ErrorsTelegrafMetrics = "TelegrafMetricsSend429ErrorCount" + metricNameErrorCountContainerLogsSendErrorsToMDSDFromFluent = "ContainerLogs2MdsdSendErrorCount" + metricNameErrorCountContainerLogsMDSDClientCreateError = "ContainerLogsMdsdClientCreateErrorCount" + metricNameErrorCountContainerLogsSendErrorsToADXFromFluent = "ContainerLogs2ADXSendErrorCount" + metricNameErrorCountContainerLogsADXClientCreateError = "ContainerLogsADXClientCreateErrorCount" defaultTelemetryPushIntervalSeconds = 300 - eventNameContainerLogInit = "ContainerLogPluginInitialized" - eventNameDaemonSetHeartbeat = "ContainerLogDaemonSetHeartbeatEvent" + eventNameContainerLogInit = "ContainerLogPluginInitialized" + eventNameDaemonSetHeartbeat = "ContainerLogDaemonSetHeartbeatEvent" + eventNameCustomPrometheusSidecarHeartbeat = "CustomPrometheusSidecarHeartbeatEvent" + eventNameWindowsFluentBitHeartbeat = "WindowsFluentBitHeartbeatEvent" ) // SendContainerLogPluginMetrics is a go-routine that flushes the data periodically (every 5 mins to App Insights) @@ -100,6 +112,11 @@ func SendContainerLogPluginMetrics(telemetryPushIntervalProperty string) { containerLogsMDSDClientCreateErrors := ContainerLogsMDSDClientCreateErrors containerLogsSendErrorsToADXFromFluent := ContainerLogsSendErrorsToADXFromFluent containerLogsADXClientCreateErrors := ContainerLogsADXClientCreateErrors + osmNamespaceCount := OSMNamespaceCount + promMonitorPods := PromMonitorPods + promMonitorPodsNamespaceLength := PromMonitorPodsNamespaceLength + promMonitorPodsLabelSelectorLength := PromMonitorPodsLabelSelectorLength + promMonitorPodsFieldSelectorLength := PromMonitorPodsFieldSelectorLength TelegrafMetricsSentCount = 0.0 TelegrafMetricsSendErrorCount = 0.0 @@ -118,17 +135,39 @@ func SendContainerLogPluginMetrics(telemetryPushIntervalProperty string) { ContainerLogTelemetryMutex.Unlock() if strings.Compare(strings.ToLower(os.Getenv("CONTROLLER_TYPE")), "daemonset") == 0 { - SendEvent(eventNameDaemonSetHeartbeat, make(map[string]string)) - flushRateMetric := appinsights.NewMetricTelemetry(metricNameAvgFlushRate, flushRate) - TelemetryClient.Track(flushRateMetric) - logRateMetric := appinsights.NewMetricTelemetry(metricNameAvgLogGenerationRate, logRate) - logSizeMetric := appinsights.NewMetricTelemetry(metricNameLogSize, logSizeRate) - TelemetryClient.Track(logRateMetric) - Log("Log Size Rate: %f\n", logSizeRate) - TelemetryClient.Track(logSizeMetric) - logLatencyMetric := appinsights.NewMetricTelemetry(metricNameAgentLogProcessingMaxLatencyMs, logLatencyMs) - logLatencyMetric.Properties["Container"] = logLatencyMsContainer - TelemetryClient.Track(logLatencyMetric) + if strings.Compare(strings.ToLower(os.Getenv("CONTAINER_TYPE")), "prometheussidecar") == 0 { + telemetryDimensions := make(map[string]string) + telemetryDimensions["CustomPromMonitorPods"] = promMonitorPods + if promMonitorPodsNamespaceLength > 0 { + telemetryDimensions["CustomPromMonitorPodsNamespaceLength"] = strconv.Itoa(promMonitorPodsNamespaceLength) + } + if promMonitorPodsLabelSelectorLength > 0 { + telemetryDimensions["CustomPromMonitorPodsLabelSelectorLength"] = strconv.Itoa(promMonitorPodsLabelSelectorLength) + } + if promMonitorPodsFieldSelectorLength > 0 { + telemetryDimensions["CustomPromMonitorPodsFieldSelectorLength"] = strconv.Itoa(promMonitorPodsFieldSelectorLength) + } + if osmNamespaceCount > 0 { + telemetryDimensions["OsmNamespaceCount"] = 
strconv.Itoa(osmNamespaceCount) + } + + SendEvent(eventNameCustomPrometheusSidecarHeartbeat, telemetryDimensions) + + } else if strings.Compare(strings.ToLower(os.Getenv("OS_TYPE")), "windows") == 0 { + SendEvent(eventNameWindowsFluentBitHeartbeat, make(map[string]string)) + } else { + SendEvent(eventNameDaemonSetHeartbeat, make(map[string]string)) + flushRateMetric := appinsights.NewMetricTelemetry(metricNameAvgFlushRate, flushRate) + TelemetryClient.Track(flushRateMetric) + logRateMetric := appinsights.NewMetricTelemetry(metricNameAvgLogGenerationRate, logRate) + logSizeMetric := appinsights.NewMetricTelemetry(metricNameLogSize, logSizeRate) + TelemetryClient.Track(logRateMetric) + Log("Log Size Rate: %f\n", logSizeRate) + TelemetryClient.Track(logSizeMetric) + logLatencyMetric := appinsights.NewMetricTelemetry(metricNameAgentLogProcessingMaxLatencyMs, logLatencyMs) + logLatencyMetric.Properties["Container"] = logLatencyMsContainer + TelemetryClient.Track(logLatencyMetric) + } } TelemetryClient.Track(appinsights.NewMetricTelemetry(metricNameNumberofTelegrafMetricsSentSuccessfully, telegrafMetricsSentCount)) if telegrafMetricsSendErrorCount > 0.0 { @@ -255,12 +294,60 @@ func InitializeTelemetryClient(agentVersion string) (int, error) { } if isProxyConfigured == true { - CommonProperties["IsProxyConfigured"] = "true" + CommonProperties["IsProxyConfigured"] = "true" } else { - CommonProperties["IsProxyConfigured"] = "false" - } + CommonProperties["IsProxyConfigured"] = "false" + } + + // Adding container type to telemetry + if strings.Compare(strings.ToLower(os.Getenv("CONTROLLER_TYPE")), "daemonset") == 0 { + if strings.Compare(strings.ToLower(os.Getenv("CONTAINER_TYPE")), "prometheussidecar") == 0 { + CommonProperties["ContainerType"] = "prometheussidecar" + } + } TelemetryClient.Context().CommonProperties = CommonProperties + + // Getting the OSM namespace count and the monitor kubernetes pods values once at start, because they won't change unless the configmap is applied and the container is restarted + + OSMNamespaceCount = 0 + osmNsCount := os.Getenv("TELEMETRY_OSM_CONFIGURATION_NAMESPACES_COUNT") + if osmNsCount != "" { + OSMNamespaceCount, err = strconv.Atoi(osmNsCount) + if err != nil { + Log("OSM namespace count string to int conversion error %s", err.Error()) + } + } + + PromMonitorPods = os.Getenv("TELEMETRY_CUSTOM_PROM_MONITOR_PODS") + + PromMonitorPodsNamespaceLength = 0 + promMonPodsNamespaceLength := os.Getenv("TELEMETRY_CUSTOM_PROM_MONITOR_PODS_NS_LENGTH") + if promMonPodsNamespaceLength != "" { + PromMonitorPodsNamespaceLength, err = strconv.Atoi(promMonPodsNamespaceLength) + if err != nil { + Log("Custom prometheus monitor kubernetes pods namespace count string to int conversion error %s", err.Error()) + } + } + + PromMonitorPodsLabelSelectorLength = 0 + promLabelSelectorLength := os.Getenv("TELEMETRY_CUSTOM_PROM_LABEL_SELECTOR_LENGTH") + if promLabelSelectorLength != "" { + PromMonitorPodsLabelSelectorLength, err = strconv.Atoi(promLabelSelectorLength) + if err != nil { + Log("Custom prometheus label selector count string to int conversion error %s", err.Error()) + } + } + + PromMonitorPodsFieldSelectorLength = 0 + promFieldSelectorLength := os.Getenv("TELEMETRY_CUSTOM_PROM_FIELD_SELECTOR_LENGTH") + if promFieldSelectorLength != "" { + PromMonitorPodsFieldSelectorLength, err = strconv.Atoi(promFieldSelectorLength) + if err != nil { + Log("Custom prometheus field selector count string to int conversion error %s", err.Error()) + } + } + + return 0, nil } diff --git
diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb
index c803c0fa2..c057f7c2c 100644
--- a/source/plugins/ruby/in_kube_nodes.rb
+++ b/source/plugins/ruby/in_kube_nodes.rb
@@ -19,7 +19,10 @@ class Kube_nodeInventory_Input < Input
     @@rsPromUrlCount = ENV["TELEMETRY_RS_PROM_URLS_LENGTH"]
     @@rsPromMonitorPods = ENV["TELEMETRY_RS_PROM_MONITOR_PODS"]
     @@rsPromMonitorPodsNamespaceLength = ENV["TELEMETRY_RS_PROM_MONITOR_PODS_NS_LENGTH"]
+    @@rsPromMonitorPodsLabelSelectorLength = ENV["TELEMETRY_RS_PROM_LABEL_SELECTOR_LENGTH"]
+    @@rsPromMonitorPodsFieldSelectorLength = ENV["TELEMETRY_RS_PROM_FIELD_SELECTOR_LENGTH"]
     @@collectAllKubeEvents = ENV["AZMON_CLUSTER_COLLECT_ALL_KUBE_EVENTS"]
+    @@osmNamespaceCount = ENV["TELEMETRY_OSM_CONFIGURATION_NAMESPACES_COUNT"]

     def initialize
       super
@@ -296,6 +299,9 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601)
               properties["rsPromUrl"] = @@rsPromUrlCount
               properties["rsPromMonPods"] = @@rsPromMonitorPods
               properties["rsPromMonPodsNs"] = @@rsPromMonitorPodsNamespaceLength
+              properties["rsPromMonPodsLabelSelectorLength"] = @@rsPromMonitorPodsLabelSelectorLength
+              properties["rsPromMonPodsFieldSelectorLength"] = @@rsPromMonitorPodsFieldSelectorLength
+              properties["osmNamespaceCount"] = @@osmNamespaceCount
             end
             ApplicationInsightsUtility.sendMetricTelemetry("NodeCoreCapacity", capacityInfo["cpu"], properties)
             telemetrySent = true
From 16936aa90a3950c878a9f5f9182d3d9db46c28a8 Mon Sep 17 00:00:00 2001
From: Vishwanath
Date: Fri, 26 Mar 2021 10:54:01 -0700
Subject: [PATCH 081/194] add liveness timeout for exec (#518)

---
 .../azuremonitor-containers/templates/omsagent-daemonset.yaml  | 1 +
 .../azuremonitor-containers/templates/omsagent-deployment.yaml | 1 +
 kubernetes/omsagent.yaml                                       | 3 +++
 3 files changed, 5 insertions(+)

diff --git a/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml b/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml
index 615cd0485..7201ee6ae 100644
--- a/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml
+++ b/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml
@@ -131,6 +131,7 @@ spec:
             - "/opt/livenessprobe.sh"
           initialDelaySeconds: 60
           periodSeconds: 60
+          timeoutSeconds: 15
       {{- with .Values.omsagent.daemonset.affinity }}
       affinity: {{- toYaml . | nindent 8 }}
       {{- end }}
diff --git a/charts/azuremonitor-containers/templates/omsagent-deployment.yaml b/charts/azuremonitor-containers/templates/omsagent-deployment.yaml
index 9b6656e9c..fdc520cba 100644
--- a/charts/azuremonitor-containers/templates/omsagent-deployment.yaml
+++ b/charts/azuremonitor-containers/templates/omsagent-deployment.yaml
@@ -122,6 +122,7 @@ spec:
             - "/opt/livenessprobe.sh"
           initialDelaySeconds: 60
           periodSeconds: 60
+          timeoutSeconds: 15
       {{- with .Values.omsagent.deployment.affinity }}
       affinity: {{- toYaml . | nindent 8 }}
       {{- end }}
diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml
index c25b9bfd4..4044c90e2 100644
--- a/kubernetes/omsagent.yaml
+++ b/kubernetes/omsagent.yaml
@@ -443,6 +443,7 @@ spec:
             - /opt/livenessprobe.sh
           initialDelaySeconds: 60
           periodSeconds: 60
+          timeoutSeconds: 15
         #Only in sidecar scraping mode
         - name: omsagent-prometheus
          image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod11092020"
@@ -496,6 +497,7 @@ spec:
             - /opt/livenessprobe.sh
           initialDelaySeconds: 60
           periodSeconds: 60
+          timeoutSeconds: 15
       affinity:
         nodeAffinity:
           requiredDuringSchedulingIgnoredDuringExecution:
@@ -657,6 +659,7 @@ spec:
             - /opt/livenessprobe.sh
           initialDelaySeconds: 60
           periodSeconds: 60
+          timeoutSeconds: 15
       affinity:
         nodeAffinity:
           # affinity to schedule on to ephemeral os node if its available
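Before this patch the exec probes above carried no timeoutSeconds, so a wedged /opt/livenessprobe.sh could stall the probe loop instead of failing fast; with timeoutSeconds: 15, a slow run is treated as a probe failure (actual enforcement of exec-probe timeouts depends on the kubelet version). The sketch below is illustrative only, not agent code: it reproduces the same bound with a Go context deadline, using `sleep 60` as a stand-in for a hung probe script.

package main

import (
	"context"
	"fmt"
	"os/exec"
	"time"
)

func main() {
	// Bound the probe attempt the way timeoutSeconds: 15 bounds the kubelet's exec.
	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
	defer cancel()

	// Stand-in for /opt/livenessprobe.sh; "sleep 60" simulates a hung probe.
	cmd := exec.CommandContext(ctx, "sleep", "60")
	if err := cmd.Run(); err != nil {
		// The deadline kills the process, so the attempt fails after 15s
		// instead of blocking the liveness loop indefinitely.
		fmt.Println("probe failed:", err)
	}
}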
From 12964be1ebf5e108f5861d82a5ce87634ec59912 Mon Sep 17 00:00:00 2001
From: rashmichandrashekar
Date: Fri, 26 Mar 2021 13:01:28 -0700
Subject: [PATCH 082/194] chart and other updates (#519)

---
 ReleaseNotes.md                                 | 17 +++++++++++++++++
 build/version                                   |  4 ++--
 charts/azuremonitor-containers/Chart.yaml       |  2 +-
 charts/azuremonitor-containers/values.yaml      |  6 +++---
 kubernetes/linux/Dockerfile                     |  2 +-
 kubernetes/omsagent.yaml                        | 14 +++++++-------
 kubernetes/windows/Dockerfile                   |  2 +-
 .../onboarding/managed/enable-monitoring.ps1    |  2 +-
 scripts/onboarding/managed/enable-monitoring.sh |  2 +-
 .../onboarding/managed/upgrade-monitoring.sh    |  2 +-
 10 files changed, 35 insertions(+), 18 deletions(-)

diff --git a/ReleaseNotes.md b/ReleaseNotes.md
index 80d6f188d..04bd7c6e5 100644
--- a/ReleaseNotes.md
+++ b/ReleaseNotes.md
@@ -10,6 +10,23 @@ additional questions or comments.

 ## Release History

 Note : The agent version(s) below has dates (ciprod), which indicate the agent build dates (not release dates)
+
+### 03/26/2021 -
+##### Version microsoft/oms:ciprod03262021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod03262021 (linux)
+##### Version microsoft/oms:win-ciprod03262021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod03262021 (windows)
+##### Code change log
+- Started collecting new metric - kubelet running pods count
+- Onboarding script fixes to add explicit json output
+- Proxy and token updates for ARC
+- Doc updates for Microsoft charts repo release
+- Bug fixes for trailing whitespaces in enable-monitoring.sh script
+- Support for higher volume of prometheus metrics by scraping metrics from sidecar
+- Update to get new version of telegraf - 1.18
+- Add label and field selectors for prometheus scraping using annotations
+- Support for OSM integration
+- Removed wireserver calls to get CA certs since access is removed
+- Added liveness timeout for exec for linux containers
+
 ### 02/23/2021 -
 ##### Version microsoft/oms:ciprod02232021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod02232021 (linux)
 ##### Version microsoft/oms:win-ciprod02232021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod02232021 (windows)
diff --git a/build/version b/build/version
index 2da3efa39..83a0a174b 100644
--- a/build/version
+++ b/build/version
@@ -2,11 +2,11 @@

 # Build Version Information

-CONTAINER_BUILDVERSION_MAJOR=13
+CONTAINER_BUILDVERSION_MAJOR=14
 CONTAINER_BUILDVERSION_MINOR=0
 CONTAINER_BUILDVERSION_PATCH=0
 CONTAINER_BUILDVERSION_BUILDNR=0
-CONTAINER_BUILDVERSION_DATE=20210223
+CONTAINER_BUILDVERSION_DATE=20210326
 CONTAINER_BUILDVERSION_STATUS=Developer_Build

 #-------------------------------- End of File -----------------------------------
diff --git a/charts/azuremonitor-containers/Chart.yaml b/charts/azuremonitor-containers/Chart.yaml
index ce64fd1ce..9c8014ed0 100644
--- a/charts/azuremonitor-containers/Chart.yaml
+++ b/charts/azuremonitor-containers/Chart.yaml
@@ -2,7 +2,7 @@ apiVersion: v1
 appVersion: 7.0.0-1
 description: Helm chart for deploying Azure Monitor container monitoring agent in Kubernetes
 name: azuremonitor-containers
-version: 2.8.1
+version: 2.8.2
 kubeVersion: "^1.10.0-0"
 keywords:
   - monitoring
diff --git a/charts/azuremonitor-containers/values.yaml b/charts/azuremonitor-containers/values.yaml
index caf0217c3..4b539546b 100644
--- a/charts/azuremonitor-containers/values.yaml
+++ b/charts/azuremonitor-containers/values.yaml
@@ -21,10 +21,10 @@ Azure:
 omsagent:
   image:
     repo: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod"
-    tag: "ciprod02232021"
-    tagWindows: "win-ciprod02232021"
+    tag: "ciprod03262021"
+    tagWindows: "win-ciprod03262021"
     pullPolicy: IfNotPresent
-  dockerProviderVersion: "13.0.0-0"
+  dockerProviderVersion: "14.0.0-0"
   agentVersion: "1.10.0.1"

 # The priority used by the omsagent priority class for the daemonset pods
diff --git a/kubernetes/linux/Dockerfile b/kubernetes/linux/Dockerfile
index bcdc31330..76b8622b4 100644
--- a/kubernetes/linux/Dockerfile
+++ b/kubernetes/linux/Dockerfile
@@ -2,7 +2,7 @@ FROM ubuntu:18.04
 MAINTAINER OMSContainers@microsoft.com
 LABEL vendor=Microsoft\ Corp \
     com.microsoft.product="Azure Monitor for containers"
-ARG IMAGE_TAG=ciprod02232021
+ARG IMAGE_TAG=ciprod03262021
 ENV AGENT_VERSION ${IMAGE_TAG}
 ENV tmpdir /opt
 ENV APPLICATIONINSIGHTS_AUTH NzAwZGM5OGYtYTdhZC00NThkLWI5NWMtMjA3ZjM3NmM3YmRi
diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml
index 4044c90e2..206d9a8f0 100644
--- a/kubernetes/omsagent.yaml
+++ b/kubernetes/omsagent.yaml
@@ -358,7 +358,7 @@ spec:
         tier: node
       annotations:
         agentVersion: "1.10.0.1"
-        dockerProviderVersion: "13.0.0-0"
+        dockerProviderVersion: "14.0.0-0"
         schema-versions: "v1"
     spec:
       serviceAccountName: omsagent
@@ -368,7 +368,7 @@ spec:
           value: "3"
       containers:
         - name: omsagent
-          image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod02232021"
+          image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod03262021"
           imagePullPolicy: IfNotPresent
           resources:
             limits:
@@ -446,7 +446,7 @@ spec:
           timeoutSeconds: 15
         #Only in sidecar scraping mode
         - name: omsagent-prometheus
-          image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod11092020"
+          image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod03262021"
           imagePullPolicy: IfNotPresent
           resources:
             limits:
@@ -583,13 +583,13 @@ spec:
         rsName: "omsagent-rs"
       annotations:
         agentVersion: "1.10.0.1"
-        dockerProviderVersion: "13.0.0-0"
+        dockerProviderVersion: "14.0.0-0"
         schema-versions: "v1"
     spec:
       serviceAccountName: omsagent
       containers:
         - name: omsagent
-          image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod02232021"
+          image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod03262021"
           imagePullPolicy: IfNotPresent
           resources:
             limits:
@@ -750,7 +750,7 @@ spec:
         tier: node-win
       annotations:
         agentVersion: "1.10.0.1"
-        dockerProviderVersion: "13.0.0-0"
+        dockerProviderVersion: "14.0.0-0"
         schema-versions: "v1"
     spec:
       serviceAccountName: omsagent
@@ -760,7 +760,7 @@ spec:
           value: "3"
       containers:
         - name: omsagent-win
-          image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod02232021"
+          image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod03262021"
           imagePullPolicy: IfNotPresent
           resources:
             limits:
diff --git a/kubernetes/windows/Dockerfile b/kubernetes/windows/Dockerfile
index c0bebcc93..e4ace417a 100644
--- a/kubernetes/windows/Dockerfile
+++ b/kubernetes/windows/Dockerfile
@@ -3,7 +3,7 @@
 MAINTAINER OMSContainers@microsoft.com
 LABEL vendor=Microsoft\ Corp \
     com.microsoft.product="Azure Monitor for containers"
-ARG IMAGE_TAG=win-ciprod02232021
+ARG IMAGE_TAG=win-ciprod03262021

 # Do not split this into multiple RUN!
 # Docker creates a layer for every RUN-Statement
diff --git a/scripts/onboarding/managed/enable-monitoring.ps1 b/scripts/onboarding/managed/enable-monitoring.ps1
index db035b13d..baf547497 100644
--- a/scripts/onboarding/managed/enable-monitoring.ps1
+++ b/scripts/onboarding/managed/enable-monitoring.ps1
@@ -64,7 +64,7 @@ $isUsingServicePrincipal = $false

 # released chart version in mcr
 $mcr = "mcr.microsoft.com"
-$mcrChartVersion = "2.8.1"
+$mcrChartVersion = "2.8.2"
 $mcrChartRepoPath = "azuremonitor/containerinsights/preview/azuremonitor-containers"
 $helmLocalRepoName = "."
 $omsAgentDomainName="opinsights.azure.com"
diff --git a/scripts/onboarding/managed/enable-monitoring.sh b/scripts/onboarding/managed/enable-monitoring.sh
index a9560b5c5..9747d932d 100644
--- a/scripts/onboarding/managed/enable-monitoring.sh
+++ b/scripts/onboarding/managed/enable-monitoring.sh
@@ -44,7 +44,7 @@ defaultAzureCloud="AzureCloud"
 omsAgentDomainName="opinsights.azure.com"

 # released chart version in mcr
-mcrChartVersion="2.8.1"
+mcrChartVersion="2.8.2"
 mcr="mcr.microsoft.com"
 mcrChartRepoPath="azuremonitor/containerinsights/preview/azuremonitor-containers"
 helmLocalRepoName="."
diff --git a/scripts/onboarding/managed/upgrade-monitoring.sh b/scripts/onboarding/managed/upgrade-monitoring.sh
index e54822f74..1cf7b5c97 100644
--- a/scripts/onboarding/managed/upgrade-monitoring.sh
+++ b/scripts/onboarding/managed/upgrade-monitoring.sh
@@ -20,7 +20,7 @@ set -e
 set -o pipefail

 # released chart version for Azure Arc enabled Kubernetes public preview
-mcrChartVersion="2.8.1"
+mcrChartVersion="2.8.2"
 mcr="mcr.microsoft.com"
 mcrChartRepoPath="azuremonitor/containerinsights/preview/azuremonitor-containers"
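Release bumps like PATCH 082 repeat the same tag and version strings across many files, and one missed file ships a stale image (note the sidecar image fixed here, which had been left at ciprod11092020). A hedged helper sketch — not part of this repo, with a hypothetical file list — that scans for files missing the new tag:

package main

import (
	"fmt"
	"os"
	"strings"
)

func main() {
	const tag = "ciprod03262021" // the tag this release bumps to

	// Hypothetical subset of the files such a commit must touch together.
	files := []string{
		"charts/azuremonitor-containers/values.yaml",
		"kubernetes/linux/Dockerfile",
		"kubernetes/omsagent.yaml",
	}
	for _, f := range files {
		data, err := os.ReadFile(f)
		if err != nil {
			fmt.Println("skip:", f, err)
			continue
		}
		if !strings.Contains(string(data), tag) {
			fmt.Println("missing tag", tag, "in", f)
		}
	}
}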
"mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod03262021" imagePullPolicy: IfNotPresent resources: limits: diff --git a/kubernetes/windows/Dockerfile b/kubernetes/windows/Dockerfile index c0bebcc93..e4ace417a 100644 --- a/kubernetes/windows/Dockerfile +++ b/kubernetes/windows/Dockerfile @@ -3,7 +3,7 @@ MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="Azure Monitor for containers" -ARG IMAGE_TAG=win-ciprod02232021 +ARG IMAGE_TAG=win-ciprod03262021 # Do not split this into multiple RUN! # Docker creates a layer for every RUN-Statement diff --git a/scripts/onboarding/managed/enable-monitoring.ps1 b/scripts/onboarding/managed/enable-monitoring.ps1 index db035b13d..baf547497 100644 --- a/scripts/onboarding/managed/enable-monitoring.ps1 +++ b/scripts/onboarding/managed/enable-monitoring.ps1 @@ -64,7 +64,7 @@ $isUsingServicePrincipal = $false # released chart version in mcr $mcr = "mcr.microsoft.com" -$mcrChartVersion = "2.8.1" +$mcrChartVersion = "2.8.2" $mcrChartRepoPath = "azuremonitor/containerinsights/preview/azuremonitor-containers" $helmLocalRepoName = "." $omsAgentDomainName="opinsights.azure.com" diff --git a/scripts/onboarding/managed/enable-monitoring.sh b/scripts/onboarding/managed/enable-monitoring.sh index a9560b5c5..9747d932d 100644 --- a/scripts/onboarding/managed/enable-monitoring.sh +++ b/scripts/onboarding/managed/enable-monitoring.sh @@ -44,7 +44,7 @@ defaultAzureCloud="AzureCloud" omsAgentDomainName="opinsights.azure.com" # released chart version in mcr -mcrChartVersion="2.8.1" +mcrChartVersion="2.8.2" mcr="mcr.microsoft.com" mcrChartRepoPath="azuremonitor/containerinsights/preview/azuremonitor-containers" helmLocalRepoName="." diff --git a/scripts/onboarding/managed/upgrade-monitoring.sh b/scripts/onboarding/managed/upgrade-monitoring.sh index e54822f74..1cf7b5c97 100644 --- a/scripts/onboarding/managed/upgrade-monitoring.sh +++ b/scripts/onboarding/managed/upgrade-monitoring.sh @@ -20,7 +20,7 @@ set -e set -o pipefail # released chart version for Azure Arc enabled Kubernetes public preview -mcrChartVersion="2.8.1" +mcrChartVersion="2.8.2" mcr="mcr.microsoft.com" mcrChartRepoPath="azuremonitor/containerinsights/preview/azuremonitor-containers" From 73548c0053c96a175a70dc2e7ff9e9ef1d0c7f0a Mon Sep 17 00:00:00 2001 From: saaror <31900410+saaror@users.noreply.github.com> Date: Mon, 5 Apr 2021 15:20:14 -0700 Subject: [PATCH 083/194] Saaror osmdoc (#523) * Create ReadMe.md * Update ReadMe.md * Update ReadMe.md * Update ReadMe.md * Update ReadMe.md * Add files via upload * Update ReadMe.md * Update ReadMe.md * Update ReadMe.md * Update ReadMe.md * Update ReadMe.md * Update ReadMe.md --- Documentation/OSMPrivatePreview/Image1.jpg | Bin 0 -> 120932 bytes Documentation/OSMPrivatePreview/ReadMe.md | 68 +++++++++++++++++++++ 2 files changed, 68 insertions(+) create mode 100644 Documentation/OSMPrivatePreview/Image1.jpg create mode 100644 Documentation/OSMPrivatePreview/ReadMe.md diff --git a/Documentation/OSMPrivatePreview/Image1.jpg b/Documentation/OSMPrivatePreview/Image1.jpg new file mode 100644 index 0000000000000000000000000000000000000000..04cd03ab127c94aec072d56c418da612b934604a GIT binary patch literal 120932 zcmeFZXH=8jw=Wt6K`EkC5f!CN6{Luih)Nfc-h{jgNDYx1N+1f-n}Cp)E=@!Tp?4C2 z00JUXBb|hz^n@Bn2siIJdz^dDe~-P-z4ycYu*X?>)=0*9)|1RR*PL_Bx#n->^w;SE z;F5ukz7F8bnKOX*^cUcC8t@2k_RN`ou77I`=NSHJOpJ^S=a`w8ng8kMFR)%Xf1c$$ zGcyZ23kxe7y)j?7$idEb@t@ECe91qL|ML|6Vmr@#{+|*5@7n2Cz~%D{uNmJkoDl+? 
z-~?y^WT=Kj)M5>pb1)XePSAUP)Z%ibY_zm9U9Pd={;ebE7Z@jqQKCc0q*L+Kiq+z= z9F?0`y1`1sS`zJIs#KWS5O(b>d~h0=q@<(l$l`s&Ea|0_uTWBwrS4^3x<#y0X>9$~ zPS11b((urYz3|XE^vKn*_B*hN6gbz_*E7~jPhTa96lXvA-g%=_ckvNTyMoF-hZ3Hz zc`HB2OhMi+A9}9Fg;SBFrx!Qp3%n)#03ew8a>k5~?p&_xzln~X8w3$ev1j05FfFwT z&q%L3x>xjzR1+qRn`s@}bb4ezUc8um_uGY^ABt2IZxrTyrh~UMDI(aAr>O?C+~-&> zDlf^i9c8+3u?v_Ki7m=QAXdjy6tR=#P%V3ffs2mJ&oOX7EFsL9E2Jxvm(bGYGFTAA%}H=va{a zzFuwNgPa3bI^^pGx~OWvZhbhO@GjJVgS zE^$+pA_S|nE5^*}?lRK1`I}g^n|F@I3&{`h=yZ!zbaT8t#qO~~N4u4UE}pja2*LcJ zlMV^|L$~6&8JA8FTrgC-4GxOBhFqcFO7ZnfgV@KNTnk|Z>VOXsr`qaEVhUF>E>{v6 z{QP;HrgPrUsNX_MJVAUsXEDg9I}m8m&pWX`xxjobqzzfVJ{|%hZARw>sYLM&a=|iq ziF+t=L#~ZMzO2+T*?key3cFJKf!eVxXtwid)WbS%xKN@{ug9lg+z^-?IN^e)aZdFF)#)hlqJ?fZ~3MaN99=6UifvG+|BjY(ug!M45S4xCzir9SP46$AiNnox2?AX*_7wjP69Ye0)SFNhRjTgZAz^K`w!bmC& zn8**D-RUJqQQ#y#au!Ku<5~&Dry0U3A1q^AO%&*lX(Yc-_MZ_cE3dChJ`@*f&|tqO zJb0+Os&~Mp`-d)|Y|Zr#U8VEerppMK4nes_EUn<`*dC+K9s6&yL9*4a*NhrQa7$3V z&JUh>cRdy&>SE0mw_lgCFosTL=Ey%tk!hP&m{wDuDq9Vj{j4+SpRp-EvKl`w4Mg!A zjG=zpv7q~9$Z$&B0ycDiBA%kt#XB@k0Hl-Ois-Ku@kPVGNh$E`R>|t0x)1Phb?V)$ z`s9vzH9|c}@j=dK;!hb9OT&YN2%aoKk|c8!fzha=Iz*WWniM83_J2?7i|;axm)ECw z%xK4T3x){2UmR1nI;xNQwgUEDrhF?ks>tZIK{+~$_VYnxTDoVkIGF41%wY!49CIj} zKA?~OxPB}MT!DZ-A})ub+)|f*C?w(WdWBUHufoP}@(76_nSPN=MJU}z&XkL7cK{V-(y;;^b+4ji!Bq8O9$-TtiWTx)I(Im!CKRJ9iivg$(i&8->9_0@S*gQFW% zHBu&!?@|rMfW)4mrTZ~gjZnBOJAQHy%}ZjZl%!&U0o@y-p1N*llAXt$U*q=MN^_I5_ig!!kw-7 zfgoV29Zpba94HryQ<$!*=E`b4xIgIS+AURo&e86M@^-WHXV6Em>_5muS(*f?w@G%b z2B6jq+0iQIv(&r8m_Vk6Y?m@SJ4&1ShsoS9J?X6YnIqGRk3y5rMB-iM-X=NphhzXb z$hJz?L0!|PcGL2i<)_2wP3y&h_>}E!8g>6@6t}0h=CGsR392j$mb_0%%5%Z;hlsi8 z5VR9YDGQn;$eYK*TWDw?lrQ>uZOyQKWfeS5!V;C<^0F$^`HcU138XWFyN=RNQX)5I zHiH-yHpCWn+GbhAUeQvkt@4VKYrLloByf(h1|r5cAAP?5?xDwn{wSJFiD?_?bdb#4 zU=(n>XiCcqPU{eq4;z@ZY&^PON^Y;R(Dz!byfpIU`@%0XsfMM2GY^9&5c~DpF!cbN zz-S|zU6_qGTDm(Z@t_IXOT)T_ZO^+Qaez!~65- z(qeD?NSxG#=N0E z@pGE@8l0NQef%?{-rq=LCy-zi+9VpJNBaa+sPa29nf4f6Jx>I=lucYsxlgoC=ADh$ zz4V~f_ga-Nqr6YKFY|eu4k7cO*%!evfDcX!k|QZ~A%6minj!INdhy`%{TGJ$UO86k zCue6KsRx|i=jvu`t&)<7kBM+H*KzvU)4Tz>XKXOt_5mrI(l3s3cPhQ&F+Pp!wO27P z>Ml)4%%%7!OyrxDE*W(|CZm|6L1(D;CBMf(0vP8)K)VxsVPvmJ<`RXMk;U3_a_y!{ zVnuZsTK2qsfM%t-6r927yyOkRa&k91cZ^itSKaVln}0B6yU$)L*aDzQIHXwk z5|4nMNRWbnPTPK}!g6Gj_-e|KwT&f)xd0bnzQ*va??k*H2rg}*Qa;lZm-QJWHT!&G zcAabk39H%(zMk}Of3_b&1ENm?(tFMC_z5wk&Bg)fKz*G^btYBWa@E%Q6VHa)e_d5l zL~){j^-6!)foeU|p4#L(er)<7QRI1PD)#Bt1%M-{{K$&RK;n*4 zWgq%LmthLW7XAlEbFtXGkRC``Y3mjXWkREA!~>%+hoaY=@JKP-k>1r!#+^QS_^Ba|6@+eaDWT;G>!H(=(cYOqO?WXv2v|`CZBYwlefLgz${}{uUxr*Imy>o zdagWYAXn&=%rz#nds>qD<4jpvmtU%1x*UscrJkaF0)viw!F)imHavv&2Njg(ic361 zFIInY6Vi_^Nb~vrlgeqanI(}GYG%eF*YM9V*HWN)(s3{re3>FZgcFQj*;DLD$~#^A zg3_e0>@4-*xpYG{W|M$D`rc@WEVWfCnMd8|^LK8h+e=Nvtja{`nTC$8EiWXp-Xy7b zqta;8AQ+dKimfziD;1zx&N!{%mRX^g=?)O91!G2bWg-PdiItSfju5f2tE7~$g|cAq5~1U+o>Z~o4=otqKcf%t{83Nl1>TENf#tN9i9F)QnHACR`OcqXX;%ZjrDrJ%ifYJ zG7+8_XJ)#fqV)&g~(n={zL+q z`~V2MRqe7`$id*vrES8a>j?WLEw=&pVfT{WF+~LrI>l@%FT{neKimI88pAgF^O7lGMwv^{7P9o;zux(gMj&ZSgKu|a!ORy7V9jW zgcxi|mMWxtLQA%HjMiLW#C!-a%_?>>z{a1v-1?aJaI4Lhbr{3rtkx6<5NbZ;R+63R z=S;7-6F(@Kl5BpFPI^;cPViB@p2&ENyOr&*_qQrU z6=7AB_NzV}6cByL<|P016hdh@h_{p~w1BdjvE!lSTalO|Ju22Rt_*J|L_d<+@DPiV z-SSe7zQwS1{z=|^+4*-9>IA#+`emwIcm&jV2rYH09a+WPWgurmJdO@|t)|^GDN`)M zrVpZ4esJ6E_ORgtXsT{}zk?xh5LhLxw&Nj8;e#PjYx4 z^POK}^P=`f_U+~@w#Cm*)M|aI1dz4%x4;2*Yji^`daQbuk{XZAT6*DIVFgF>Q$~nc zx)7JySKLj4#KX2+7+7)~ZSw?5f9E~EaUkYfk+ne;9WOL8MLNJaAzrUk6>#WLRBK|~ z>kt-esuD|n82UWgj(!#A#%6`eP0n>w)yogqa9#N27aAe^rR{REqA84{+?<=HI?t#QujB z9r9c3&H`{+f^`6*wy{F-lcoP*W$0f&?C+=l%fI69&i}Q#zbEdmxB2HO`0Hu^Mw$Qf 
zBCKhfQ1aytURh*l`G*6xYrFqedq45dXqBCi5DZFe8e|-D* z+@7Kv))~)oj(FF4rxl{|R;j8z`?!hmWw1>8kJNmUC4RhW z>!n$ijrtZLo>Y&yKh}y5V9}B6X1^j^7d0=G!tm^q4m~6c>6#9CI3bUBcWx8BPV~)a zJUzh+&Mwh{b5?&V*K~e-+O+#R^DjyLibbK|Z$KISmlJCFsm1))RI3DfYq+QhEt-E{hnV$(aM zGkc9?BYR;tKRH}1P?^$L>MZ{4Arx`L+EGrThMou~kDrlHcp5-23OvfP2;eyk-oa#b|j2QBRhr-4|Fcy!2&4Za?LyWmCdJ; z;23r9Pev2uiWaX;ItRp&P~=%GRs$tl{l2I+fQCWKON85mw>tDrCYa} zzRe8`MqMZ1V@Az~_17Dpd&7Eq(}p~-uB8{Os0Kg{+j`1y0z!;dia!4N72HL>JRfqr ze(=_*86Q(&R%GC7DmZ zy2K3wZr$3}g*>+7M7Dk^|KT)I zbBK%4&#v}NE-IW5FrrUEmUI;xI4j1X5MfIJo|tPK^eln;DxN|M?^Lw6mUZOatb*$G z7QZRzpGi1!X~Ws7LAPBu-~yv1H55>TBTDhc_% zZ>tFKnC7>oqW0cCpz2T9nUy8WE1jGS>2~G_dG7H=QM6k0J=;YVC2y~u-Xq@GH6%S% zCl5!JqX-cfpra^v_bd#ta~a-WV{9NEZ|vYf(mDDN(VXJLnrxFP+hW@DaNZ#C?owxpI6Iss8%mx(>^>wEk8@0?LTqjgb{sq6r?+13|MDI4xzEUgi#5cZW2 zA)*Qbdf&jq3)RAWX!b_#OKtf2c7LtM-8@CSMDiVnGcRvIHawlPvS>UpJrZNZLbT&OAp=8)ye>y%@~ zmFrcYFuGqBz-FzYe+HXaAh=P^e4{)?OzWqT_uXV03D@k3`u(Koh5Uw|nxJ7ylcdbz z)YfvRc1>l1yNxBDU^rksooMQZ+wi=ay@>`;`$Oi?u($h!!Wz4Y(~R-V)n4+0xLBSb9&k+|LZ-9+v^Gm%m;m zee5W^<&69K(VO>N%T`o~piP>dpcv)$?n zFDs_7hVAy8H!zN0)>KKzJOMXn|2=d1hy1V6U7DRHQPX0d7aJB&!09=Dgrs}2wJ`qp z*hy@0uaLWxctK!NdzX0IEf`k`Evnjb8Whbdvej41L186t7dqVBWKFsShU;si{Jx-q z1ya<;NuG#0DO%h;T3*#BRZ#*>2Rgi$d48J0HL+3OnN~#mJcYmW1m=*afT`5{y&B7r z6SE^+a3V2kN9J_M4?zeqt|M5fZANGqqeGxMJRq9YL|h$`FM^(*>0E_RQWtk}5keO( zUix}sX^L>8S;f;en$ossK{!H;>ySfY=%kg$S>L4(UE{G7u52J zyWh{wUQdzQPjDU(e_UYrAS!yvpyVm)d2xJvLWTFzQK0zIv}S&Efx4kVKA%m_R`J-b zSG30syKd*cUE~D?>_*^MO2YjizE<4TNQ^@fJh{@g~}3JR72J4*j8fI(vx<9+zk4`MphXupnXBTsMjz*;*So@6xJKi*!kI0($=bG%Tgb%g%c#FHJW4M)@~iCd}oyNnV~z| zb076MLM<=8UwpJ&LPTyjeL>_A068ZG+DVm0uB~=I+10YPi>tiBn25M%Q5BV6t^f#C zVO+JDsQPz7XObkD1xs14w{E&#H}U}u;T|L)S)eNuBbUofbM*4@9nMP0DBE7?< zVUj|)v%)CP2)oL-cF*W0W@1cDgo5YAZWIM+o?tX4Gbbm3h`y7q>90ayElu#1kYMMn z)LPO$|2XH$nn{@1Ez>wz!|{}T=+X-a8fFQva+^@V$I7oQ-6%5gkT#xy%;=$bN0Mr7 zR>(7jQsTW8M1Wm;el|joi`bc)v(hAlN$7YXwOI$Z7pQU%IPDk1Yrs$*$u#-4q8O?! z9HK7FQ>a|C@imL0(`tKt6r(qvc?sdV(-PLPCm^;;4^106^Q*}WLkdm~$ojz3{)S%Y znY~w~iiQtw+rg;#qgJB8j@H#Oj4=Eg}2j0AyXwK?9E zb@@!0`p=u;WN|%tE-oHVAptdvn25|iKp1OU5OF}!_KAy<7@ zI>?vy0Xu(_Lh@Qbs9qpq39_p=ON*&zOS9Hs&o>p8H zY`tPkfoPDfwo9U*Kv&;I{BM{$AM}y6^Y;vkhKw17or`6U_CGOosoW^mj%%C6!P}7R zRQntk+}%}^{^H+0YeV{?wUI)xe2M5}o>MF<%J&!VaW;!j-o6BOB02yJ(4j~2znl0o z+dd%V+O1NkLa91ozIZ8}hqWVf`Z2df9&Ngpu#G!E_f$4>nGqcje%#}K4Lv_Q8P^Fp zgE5|uW1$!TotbIx)^md2-^1a#514n{IpTzKCBJg41k;o@^ z!P1mWOHaBW0ZQb#cOLFrOQTe%%h~Mgsq?}$AGKc(+k2g06)ECQJ1ekg^Yd|U)w6(m z1r}*ZkA3bJ%D=SRzQ> zSK)JV$W3fTeT8oNyYlObH}GZeIqBF)<$U_jbTm3N8voFht{sk`CJDUAb|cQ9Q>2uR zS$3eSg#*x3IMs~25G1>8XSG(16Zlp?hRLj!hbd%ax;d~y&!3R)%PqZqs&`rP4y|B| zDn*>_RPLb~k+|Ps0%$1=$;yJ|^Ea~)9+J^T-!`)9hkJkGr)D3C>}WT+VCqcyI`4l< z;y=s}$2zX;%}%m+%rdJ&kub4R(iw$F--aVm*b``swq=ggrT5o-BONs#G@mT{*356g zz7RJlM}Ta|nx0uVp>h%O{(v>-mNy)mtf%TfzL&_k3HG`{_d{P$0ZZ? 
zi8URfmf9Zn_>3g-7Y18WBWxSsH#|#W07iQf>~@D+v^X9nau{pT{m))LNY3zRP>WCg z>m1@<6ZZOt91GKO%{uV+v|3W7U+Q_)n<0c%^E#??cyJ1r;^fxqjmBlomMJkjnNhm42qG z97zCUn_KG(M8h`idg9nbTnLkvXM}W#bNIb+Vc6~P4`!*um*5Q$IJUkpa zDaqE;3jiEy=aHr~wowv+nmh16#tKFqUd5Yx*UNNzMhJ>g@_mT65Crms-LR^G58QX* zmYW-&d8Tgk{mN0+w^E*Dk$PFXmQzZTeNMt3l5g}~e38GtND8z_XhE`6ZE?Cr6BteB zpEmerTd!t`&!;_nCH1&543-^fF8{g3hxfa%*4^KI-wIoFE@3N{<8~peZ6p<7;^D2b zXOso_a_zW+l;DDKU7S|Nc;Kb$jBovmJf7M~OF}JAFO43=-)(Mx)neE-AJ>B9p&0f9 z&?U6hw0B|J;}6{<(iBXzJ^G{p>=zWjvTmkvpwC%sAo4BaNlv_*_OobaoVOu<8~KYI z)|6^U-uEnhD7ZLx=mS3ny#VeDk{09R>%-gWatq8InP(6>^PVwFBnVa10?9*2o>sjU z2zPML`>j;NYc`(yB-uxc!Jp~dc(3PGoO41STgq8k#Zlyi5XB(9rr4u4#Sl@_0JZ>8 z!N+GnZ%)d8bM{Se_6*_a*fQ7blPWe#t7a+NtB7>R+H`xlTF-=Pipr4!<;bX(0{cQ4 zMqccnN4}9-a!F186znDTm2lgkGPM<_Uledzvue^)HFW&;GMldY84hVgpMje6{?Bkg5cW#r8!iuH!t(qu}0EPH*Cq?^U9~w#ufei5>VIJ)HE_| zTPZ&guQbrpGb8zu~%8Gvl{rEU!A=L0x~P zp?l}3%jMM7Qaa{468CmWC++4rCjg52H(XX{rb}Y-bW{9O;JLqtyQM#6b*c3wh$a;& z3*BffDK8O@e3p6LSX$UlD_5$$L1t}lV$gOTI8*QbFX6fpl@4Sjw(Sp{09A8D)vg4= z-GUp8mN&jMf^OL0!Yvq=ZU3wfuwV+)C0EKZ7T?elPOym)P++S6FW2hk{PYbWsXewO zNLbSwk#8$ETlF=1+$xR_+SHx}jTtO3QV|eU3NiBiWInh#ILlt3j*+{FCW~@Mc937l> z*Vwq6$PHOJk-U(Qy4l}q|IIQj6I+(qdBHg-NL?(^;RyD2B}LGVJV>pQOWUe= zFO3aeie!et@tMq`f*~*|zHzR_8l|qSzykdAi`nY%@UX#jo%1;94atHTK;*7%^Qhm) zwW=oAsI0rW#AhDVLC^dSSeM^oTX{C4?obfPU_5=LxTr$=-R8wU<*&bu0D;#K_N#4a zDARdU%7v=B5NXk=a~;DPRfxAOWPjci^7ydRA3Er}=y;34Z9ghrGM1K9`g}vOm24Vh zGz!&>w_!Im3a>A|Z;+RE_UIMjhZjI!ymK28@a1P2ZE~lo*5hWb*$&tR*WEdc6xiTv z$nQzt%6XVxRr#=T|MtkfiOEiV%v8u#PrrL)uL(;nJ7{Sw#(BX&Zhk*Jcg#01v*OVm zQs?%8r`UMBR?Jxr_TKu~&(0Z==1ph8TonJp|2_}$ycVMXumPJ3a=7zu?{qYAl*hUY zKYFy8`8+{dM7LC^Mi!Wk43-PrCERmH)HNVU zx&=Kq^-7nARJv7~pis@aqzO4a8wKID7m~jqFJ~FLVnZMZe#)|8-I!-pTo`EzkB3K$ z^zxPD8B~qrOgi}T!$H7I3r5FzfKTE3@c>w$daUUx;>D3A<>*pm@p zF0Gi8DHlZwevu|_3m;>ho^373|P$)M@b{=Hj)O4592jR++7)PonbPQR=HVi3RaECa4>muHN0Q3)wAFB8{LoDAXlGu4NwOFWXZ-8k0tapB zb74^jbOvM3<2qJCW3HAcK^Jy?b=mi?2zM!-516{#%V~Dq>$6Ws&>I5WhPj{gZnpBy zkK~DC76AhpfR4)f=){=ncgEPE6Wa#FxQ#%sozQDox-N71c8a|ZVUVGps7dkt#rrfX z=*gjw-L0Pj%1D_b3(oE}rm{Tw(Fv^##i{?8DmYv4rC-l|Ues~fJe&AL2li~A; z&++FJh;hxDKT43!P#YyeX2d#`hd}dbwhGP<(8soI@x~a34Y0?DFz(fx5^<9Mlc`tZ8%92k&t=Q{sm+NDaa7a7yO_X@UiDl`@e?B!^B!oz-e#Kf$i%6 zI_9qYt)N*M$ac?HiaQZ+O)_dn^1E6srJcL8#Q2bKqh>o!fY%M+Mk9HAa$;OJr6`%%2=3);Jxq3hdp_rty%e7&&EoVry`{GK z^zI%B87+9JNhnBD`DlXz4Cc~%{6sX@LYx$@wk6-a>H8+Gf-l>BX^S?~caSNz>K}PF zE}!B|5SgVn=cK5}HwoJ4VF3Bw%k$j0U%VqfX^Ah-S8N9@)og-57-rkV+7Q_Xi6RrS z`NJf`d)$T=Lo*hO)#DSUeX)V!Jvq*N+xJTC0=^k$na7~bqb#r;KaP~^`U45R;o%`7 zl)%;zEk9g%r2Ce%yQ8Kl1KjP3>^ad_rw@wWo%*pXaO_0MC&;`?^?y~>2x{TVJoT06 z(NH4mYSl5tVhwOPV8Vvd!9xtp0+@Yfo>r0gt~cmtZr!nh<8C<>MCKgkEW+bTSwbh- zyp#U6+n3n$XpUHss0)@emy$TmgrBOgn)$rErV&00Q{Ll`-``CPuuWVR*@p!VQEXB0 zN_k6s`lF$!=*E%-X~(p9^GQV%x!TBJu)-xP>cHps>nQOrC;xA}F5;SU-iu##y}bBwfsRS9xm22H1hkEmtlC~43DN=>&@Tl+CW6$mL9F1E(sZ>Y z%t(tbJi@WEzV>D4bBUheq5^TtyY=O}rJR(Et`;!&sBgN-oaIWYL2*rrPtx_vVG7cl zcTRs6zxwipMK_W@L>M6lrJxoJbc61X00maC=gDbgtLcJkBZ6q z4ErYQQKxI|D#McXIKQwiOZ!d1(1B`X18swlYqm7$WE5|Uc6V?6!BG!TK-(DKcwV>5 z+-wwEpV`GIv$1c-r0O{r3=VwcdA-DUE#`{WD4TI@0Ng3qd-d*I=1tQr2FG6v`Z9P) z4U$qvTtw4(V&@wIl8cfupPQaE{uQ1tm7^E5Zy*sNl~-3ACYB7jhUwpo%3qwamW{hW zaVWF_nrdta?%ZT6QX@8&dP7xcvC9iZ>Dh0$75K52R2Qc5JlgpEjBb4JhkQTu-QP_H z&c+{Jd|SN)LZ67N%_TXJ|Da>I^FISp{<-c2;RD~YqH>Syex=2fP_m(zM`=z{es(P@d}uE zfrY93bKWZ1l9JgKvgJFaxU^0yyOzON$VH4VA@C>|)N*L*T5q0PI0H#?$kF#dt$m?H z%rpMds<9Dw>!|Qby6VnG{0E`MJG#J?xEWvG~T4y?wkhCV5=Il zc`SapwZ3s9=P)*sP|Ap~gQ78w#QjLLuDkXpf(zJ;@yYi$ zI3Zzw=-g>QWw}jIq;W!3XaRZ7RZyV3d^9YsNZncVIR6YaUgL>-D5{TCl`zQO`av~P 
 + * Example: `kubectl apply -f container-azm-ms-agentconfig.yaml`
+4. The configuration change can take up to 15 minutes to take effect, and all omsagent pods in the cluster will restart. The restart is a rolling restart for all omsagent pods; they do not all restart at the same time.
+
+
+## Validate the metrics flow
+1. Query the cluster's Log Analytics workspace InsightsMetrics table to verify that metrics are flowing:
+```
+InsightsMetrics
+| where Name contains "envoy"
+| summarize count() by Name
+```
+
+## How to consume the OSM monitoring dashboard?
+1. Access your AKS cluster & Container Insights through this [link.](https://aka.ms/azmon/osmux)
+2. Go to the Reports tab and open the Open Service Mesh (OSM) workbook.
+3. Select the time range & namespace to scope your services. By default, we only show services deployed by customers and exclude internal service communication. If you want to view those, select Show All in the filter. Please note that OSM is a managed service mesh, so we show all internal connections for transparency.
+
+![alt text](https://github.com/microsoft/Docker-Provider/blob/saarorOSMdoc/Documentation/OSMPrivatePreview/Image1.jpg)
+### Requests Tab
+1. This tab provides a summary of all the HTTP requests sent from service to service in OSM.
+2. You can view every service, and all the services it communicates with, by selecting a service in the grid.
+3. You can view total requests, request error rate & P90 latency.
+4. You can drill down to a destination and view trends for HTTP error/success codes, success rate, pod resource utilization, and latencies at different percentiles.
+
+### Connections Tab
+1. This tab provides a summary of all the connections between your services in Open Service Mesh.
+2. Outbound connections: total number of connections between the source and destination services.
+3. Outbound active connections: last count of active connections between the source and destination in the selected time range.
+4. Outbound failed connections: total number of failed connections between the source and destination service (a query sketch for these counters follows below).
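+
+A minimal sketch of pulling these connection counters straight from the workspace with the Azure CLI — the workspace GUID is a placeholder, the `log-analytics` CLI extension is assumed to be installed, and the exact Envoy metric names may vary by OSM version:
+
+```
+# Summarize Envoy upstream-connection metrics from the last hour (hypothetical workspace ID)
+az monitor log-analytics query \
+  --workspace "00000000-0000-0000-0000-000000000000" \
+  --analytics-query 'InsightsMetrics | where TimeGenerated > ago(1h) | where Name contains "envoy_cluster_upstream_cx" | summarize count() by Name'
+```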
+
+### Troubleshooting guidance when Outbound active connections is 0 or the failed connection count is >10k
+1. Please check your connection policy in the OSM configuration.
+2. If the connection policy is fine, please refer to the OSM documentation: https://aka.ms/osm/tsg
+3. From this view as well, you can drill down to a destination and view trends for HTTP error/success codes, success rate, pod resource utilization, and latencies at different percentiles.
+
+
+### Known Issues
+1. The workbook has a scale limit of 50 pods per namespace. If you have more than 50 pods in the mesh, you may run into workbook loading issues.
+2. When the source or destination is osmcontroller, we show no latency; for internal services, we show no resource utilization.
+
+This is a private preview, and our goal is to get feedback. Please feel free to reach out to us at [askcoin@microsoft.com](mailto:askcoin@microsoft.com) with any feedback and questions!

From fea4ffa0a602ddc3428be8796dbdf0321f7c2ae7 Mon Sep 17 00:00:00 2001
From: rashmichandrashekar
Date: Mon, 5 Apr 2021 18:53:42 -0700
Subject: [PATCH 084/194] telemetry bug fix (#527)

---
 source/plugins/go/src/telemetry.go   | 2 --
 source/plugins/ruby/in_kube_nodes.rb | 4 ++++
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/source/plugins/go/src/telemetry.go b/source/plugins/go/src/telemetry.go
index 48f82a9ab..461fdea96 100644
--- a/source/plugins/go/src/telemetry.go
+++ b/source/plugins/go/src/telemetry.go
@@ -153,8 +153,6 @@ func SendContainerLogPluginMetrics(telemetryPushIntervalProperty string) {
 
 		SendEvent(eventNameCustomPrometheusSidecarHeartbeat, telemetryDimensions)
 
-	} else if strings.Compare(strings.ToLower(os.Getenv("OS_TYPE")), "windows") == 0 {
-		SendEvent(eventNameWindowsFluentBitHeartbeat, make(map[string]string))
 	} else {
 		SendEvent(eventNameDaemonSetHeartbeat, make(map[string]string))
 		flushRateMetric := appinsights.NewMetricTelemetry(metricNameAvgFlushRate, flushRate)
diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb
index c057f7c2c..8a94a7245 100644
--- a/source/plugins/ruby/in_kube_nodes.rb
+++ b/source/plugins/ruby/in_kube_nodes.rb
@@ -9,6 +9,7 @@ class Kube_nodeInventory_Input < Input
     @@MDMKubeNodeInventoryTag = "mdm.kubenodeinventory"
     @@configMapMountPath = "/etc/config/settings/log-data-collection-settings"
     @@promConfigMountPath = "/etc/config/settings/prometheus-data-collection-settings"
+    @@osmConfigMountPath = "/etc/config/osm-settings"
     @@AzStackCloudFileName = "/etc/kubernetes/host/azurestackcloud.json"
     @@kubeperfTag = "oms.api.KubePerf"
 
@@ -301,6 +302,9 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601)
           properties["rsPromMonPodsNs"] = @@rsPromMonitorPodsNamespaceLength
           properties["rsPromMonPodsLabelSelectorLength"] = @@rsPromMonitorPodsLabelSelectorLength
           properties["rsPromMonPodsFieldSelectorLength"] = @@rsPromMonitorPodsFieldSelectorLength
+        end
+        # telemetry about osm metric settings for replicaset
+        if (File.file?(@@osmConfigMountPath))
           properties["osmNamespaceCount"] = @@osmNamespaceCount
         end
         ApplicationInsightsUtility.sendMetricTelemetry("NodeCoreCapacity", capacityInfo["cpu"], properties)
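A quick way to eyeball the mount that the `File.file?` guard above depends on — a minimal sketch only, assuming the replicaset agent pod runs in kube-system and substituting a real pod name for the placeholder:

```
# List the osm-settings mount inside the omsagent replicaset pod (pod name is hypothetical)
kubectl -n kube-system exec omsagent-rs-<pod-suffix> -- ls -l /etc/config/osm-settings
```

Since Ruby's `File.file?` returns false for a directory, pointing the constant at the mounted directory means the guard never passes; this is likely why PATCH 086 below repoints `@@osmConfigMountPath` at the mounted file itself.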
From e31cc8715c0e6fe49abccf0238a973d7d3a24ed8 Mon Sep 17 00:00:00 2001
From: Grace Wehner
Date: Tue, 6 Apr 2021 09:37:20 -0700
Subject: [PATCH 085/194] Fix conflicting logrotate settings (#526)

The node and the omsagent container both have a cron.daily file to
rotate certain logs daily. These settings are the same for some files
in /var/log (mounted from the node with read/write access), causing the
rotation to fail when both try to rotate at the same time. As a result,
the /var/log/*.1 file is written to indefinitely. Since these files are
always written to and never rotated, this causes high memory usage on
the node after a while. This fix removes the container logrotate
settings for /var/log, which the container does not write to.
---
 kubernetes/linux/setup.sh | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/kubernetes/linux/setup.sh b/kubernetes/linux/setup.sh
index 218e3c717..ee3756964 100644
--- a/kubernetes/linux/setup.sh
+++ b/kubernetes/linux/setup.sh
@@ -85,3 +85,7 @@ rm -f $TMPDIR/docker-cimprov*.sh
 rm -f $TMPDIR/azure-mdsd*.deb
 rm -f $TMPDIR/mdsd.xml
 rm -f $TMPDIR/envmdsd
+
+# Remove settings for cron.daily that conflict with the node's cron.daily. Since both are trying to rotate the same files
+# in /var/log at the same time, the rotation doesn't happen correctly and then the *.1 file is forever logged to.
+rm /etc/logrotate.d/alternatives /etc/logrotate.d/apt /etc/logrotate.d/azure-mdsd /etc/logrotate.d/rsyslog

From ca8fa1274b7bcc02a45cf8a8b195ca8e68f52bff Mon Sep 17 00:00:00 2001
From: rashmichandrashekar
Date: Tue, 6 Apr 2021 11:11:17 -0700
Subject: [PATCH 086/194] bug fix (#528)

---
 source/plugins/ruby/in_kube_nodes.rb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb
index 8a94a7245..d4b54c340 100644
--- a/source/plugins/ruby/in_kube_nodes.rb
+++ b/source/plugins/ruby/in_kube_nodes.rb
@@ -9,7 +9,7 @@ class Kube_nodeInventory_Input < Input
     @@MDMKubeNodeInventoryTag = "mdm.kubenodeinventory"
     @@configMapMountPath = "/etc/config/settings/log-data-collection-settings"
     @@promConfigMountPath = "/etc/config/settings/prometheus-data-collection-settings"
-    @@osmConfigMountPath = "/etc/config/osm-settings"
+    @@osmConfigMountPath = "/etc/config/osm-settings/osm-metric-collection-configuration"
     @@AzStackCloudFileName = "/etc/kubernetes/host/azurestackcloud.json"
     @@kubeperfTag = "oms.api.KubePerf"

From 1f6f6d2578ebd534d5f0c98345ad147526b73821 Mon Sep 17 00:00:00 2001
From: Ganga Mahesh Siddem
Date: Wed, 7 Apr 2021 11:47:14 -0700
Subject: [PATCH 087/194] Gangams/arc ev2 deployment (#522)

* ev2 deployment for arc k8s extension
* fix charts path issue
* rename scripts tar
* add notifications
* fix line endings
* fix line endings
* update with prod repo
* fix file endings
---
 .pipelines/build-linux.sh                     |   5 +
 .pipelines/pipeline.user.linux.yml            |   7 +-
 ...rom-cdpx-and-push-to-ci-acr-linux-image.sh |  38 +++-
 ...m-cdpx-and-push-to-ci-acr-windows-image.sh |  39 +++-
 ReleaseProcess.md                             |  19 +-
 ...ContainerInsightsExtension.Parameters.json |  66 +++++++
 .../Public.Canary.RolloutSpec.json            |  29 +++
 .../RolloutSpecs/Public.FF.RolloutSpec.json   |  29 +++
 .../Public.HighLoad.RolloutSpec.json          |  29 +++
 .../Public.LightLoad.RolloutSpec.json         |  29 +++
 .../RolloutSpecs/Public.MC.RolloutSpec.json   |  29 +++
 .../Public.MediumLoad.RolloutSpec.json        |  29 +++
 .../Public.Pilot.RolloutSpec.json             |  29 +++
 .../ScopeBindings/Public.ScopeBindings.json   | 125 ++++++++++++
 .../Scripts/pushChartToAcr.sh                 | 181 ++++++++++++++++++
 .../ServiceModels/Public.ServiceModel.json    | 159 +++++++++++++++
 .../ServiceGroupRoot/buildver.txt             |   1 +
 17 files changed, 821 insertions(+), 22 deletions(-)
 mode change 100755 => 100644 .pipelines/pull-from-cdpx-and-push-to-ci-acr-linux-image.sh
 mode change 100755 => 100644 .pipelines/pull-from-cdpx-and-push-to-ci-acr-windows-image.sh
 create mode 100644 deployment/arc-k8s-extension/ServiceGroupRoot/Parameters/ContainerInsightsExtension.Parameters.json
 create mode 100644
deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.Canary.RolloutSpec.json create mode 100644 deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.FF.RolloutSpec.json create mode 100644 deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.HighLoad.RolloutSpec.json create mode 100644 deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.LightLoad.RolloutSpec.json create mode 100644 deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.MC.RolloutSpec.json create mode 100644 deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.MediumLoad.RolloutSpec.json create mode 100644 deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.Pilot.RolloutSpec.json create mode 100644 deployment/arc-k8s-extension/ServiceGroupRoot/ScopeBindings/Public.ScopeBindings.json create mode 100644 deployment/arc-k8s-extension/ServiceGroupRoot/Scripts/pushChartToAcr.sh create mode 100644 deployment/arc-k8s-extension/ServiceGroupRoot/ServiceModels/Public.ServiceModel.json create mode 100644 deployment/arc-k8s-extension/ServiceGroupRoot/buildver.txt diff --git a/.pipelines/build-linux.sh b/.pipelines/build-linux.sh index f4c92fda2..53f6a3a07 100644 --- a/.pipelines/build-linux.sh +++ b/.pipelines/build-linux.sh @@ -14,3 +14,8 @@ cd $DIR/../build/linux echo "----------- Build Docker Provider -------------------------------" make cd $DIR + +echo "------------ Bundle Shell Extension Scripts & HELM chart -------------------------" +cd $DIR/../deployment/arc-k8s-extension/ServiceGroupRoot/Scripts +tar -czvf ../artifacts.tar.gz ../../../../charts/azuremonitor-containers/ pushChartToAcr.sh + diff --git a/.pipelines/pipeline.user.linux.yml b/.pipelines/pipeline.user.linux.yml index 57273111e..565661d64 100644 --- a/.pipelines/pipeline.user.linux.yml +++ b/.pipelines/pipeline.user.linux.yml @@ -24,10 +24,15 @@ restore: build: commands: - - !!defaultcommand + - !!buildcommand name: 'Build Docker Provider Shell Bundle' command: '.pipelines/build-linux.sh' fail_on_stderr: false + artifacts: + - from: 'deployment' + to: 'build' + include: + - '**' package: commands: diff --git a/.pipelines/pull-from-cdpx-and-push-to-ci-acr-linux-image.sh b/.pipelines/pull-from-cdpx-and-push-to-ci-acr-linux-image.sh old mode 100755 new mode 100644 index 3844ea185..e7d26245f --- a/.pipelines/pull-from-cdpx-and-push-to-ci-acr-linux-image.sh +++ b/.pipelines/pull-from-cdpx-and-push-to-ci-acr-linux-image.sh @@ -35,12 +35,22 @@ echo "end: read appid and appsecret which has read access on cdpx acr" # suffix 00 primary and 01 secondary, and we only use primary # This configured via pipeline variable echo "login to cdpxlinux acr:${CDPX_ACR}" -docker login $CDPX_ACR --username $CDPX_ACR_APP_ID --password $CDPX_ACR_APP_SECRET -echo "login to cdpxlinux acr completed: ${CDPX_ACR}" +echo $CDPX_ACR_APP_SECRET | docker login $CDPX_ACR --username $CDPX_ACR_APP_ID --password-stdin +if [ $? -eq 0 ]; then + echo "login to cdpxlinux acr: ${CDPX_ACR} completed successfully." +else + echo "-e error login to cdpxlinux acr: ${CDPX_ACR} failed.Please see release task logs." + exit 1 +fi echo "pull agent image from cdpxlinux acr: ${CDPX_ACR}" docker pull ${CDPX_ACR}/official/${CDPX_REPO_NAME}:${CDPX_AGENT_IMAGE_TAG} -echo "pull image from cdpxlinux acr completed: ${CDPX_ACR}" +if [ $? -eq 0 ]; then + echo "pulling of agent image from cdpxlinux acr: ${CDPX_ACR} completed successfully." 
+else
+   echo "-e error pulling of agent image from cdpxlinux acr: ${CDPX_ACR} failed.Please see release task logs."
+   exit 1
+fi

 echo "CI Release name is:"$CI_RELEASE
 imagetag=$CI_RELEASE$CI_IMAGE_TAG_SUFFIX
@@ -51,13 +61,29 @@ echo "CI AGENT REPOSITORY NAME : ${CI_AGENT_REPO}"

 echo "tag linux agent image"
 docker tag ${CDPX_ACR}/official/${CDPX_REPO_NAME}:${CDPX_AGENT_IMAGE_TAG} ${CI_ACR}/public/azuremonitor/containerinsights/${CI_AGENT_REPO}:${imagetag}
+if [ $? -eq 0 ]; then
+   echo "tagging of linux agent image completed successfully."
+else
+   echo "-e error tagging of linux agent image failed. Please see release task logs."
+   exit 1
+fi

 echo "login ciprod acr":$CI_ACR
-docker login $CI_ACR --username $ACR_APP_ID --password $ACR_APP_SECRET
-echo "login to ${CI_ACR} acr completed"
+echo $ACR_APP_SECRET | docker login $CI_ACR --username $ACR_APP_ID --password-stdin
+if [ $? -eq 0 ]; then
+   echo "login to ciprod acr: ${CI_ACR} completed successfully"
+else
+   echo "-e error login to ciprod acr: ${CI_ACR} failed. Please see release task logs."
+   exit 1
+fi

 echo "pushing the image to ciprod acr:${CI_ACR}"
 docker push ${CI_ACR}/public/azuremonitor/containerinsights/${CI_AGENT_REPO}:${imagetag}
-echo "pushing the image to ciprod acr completed"
+if [ $? -eq 0 ]; then
+   echo "pushing of the image to ciprod acr completed successfully"
+else
+   echo "-e error pushing of image to ciprod acr failed. Please see release task logs."
+   exit 1
+fi

 echo "end: pull linux agent image from cdpx and push to ciprod acr"
diff --git a/.pipelines/pull-from-cdpx-and-push-to-ci-acr-windows-image.sh b/.pipelines/pull-from-cdpx-and-push-to-ci-acr-windows-image.sh
old mode 100755
new mode 100644
index 095a00039..19fe55722
--- a/.pipelines/pull-from-cdpx-and-push-to-ci-acr-windows-image.sh
+++ b/.pipelines/pull-from-cdpx-and-push-to-ci-acr-windows-image.sh
@@ -34,12 +34,22 @@ echo "end: read appid and appsecret which has read access on cdpx acr"
 # suffix 00 primary and 01 secondary, and we only use primary
 # This configured via pipeline variable
 echo "login to cdpxwindows acr:${CDPX_ACR}"
-docker login $CDPX_ACR --username $CDPX_ACR_APP_ID --password $CDPX_ACR_APP_SECRET
-echo "login to cdpxwindows acr:${CDPX_ACR} completed"
+echo $CDPX_ACR_APP_SECRET | docker login $CDPX_ACR --username $CDPX_ACR_APP_ID --password-stdin
+if [ $? -eq 0 ]; then
+   echo "login to cdpxwindows acr: ${CDPX_ACR} completed successfully."
+else
+   echo "-e error login to cdpxwindows acr: ${CDPX_ACR} failed.Please see release task logs."
+   exit 1
+fi

 echo "pull image from cdpxwin acr: ${CDPX_ACR}"
 docker pull ${CDPX_ACR}/official/${CDPX_REPO_NAME}:${CDPX_AGENT_IMAGE_TAG}
+if [ $? -eq 0 ]; then
+   echo "pulling of image from cdpxwin acr: ${CDPX_ACR} completed successfully."
+else
+   echo "pulling of image from cdpxwin acr: ${CDPX_ACR} failed. Please see release task logs."
+   exit 1
+fi

 echo "CI Release name:"$CI_RELEASE
 echo "CI Image Tag suffix:"$CI_IMAGE_TAG_SUFFIX
 imagetag=$CI_RELEASE$CI_IMAGE_TAG_SUFFIX
 echo "agentimagetag="$imagetag

 echo "tag windows agent image"
 docker tag ${CDPX_ACR}/official/${CDPX_REPO_NAME}:${CDPX_AGENT_IMAGE_TAG} ${CI_ACR}/public/azuremonitor/containerinsights/${CI_AGENT_REPO}:${imagetag}
+if [ $? -eq 0 ]; then
+   echo "tagging of windows agent image completed successfully."
+else
+   echo "-e error tagging of windows agent image failed. Please see release task logs."
+   exit 1
+fi

 echo "login to ${CI_ACR} acr"
-docker login $CI_ACR --username $ACR_APP_ID --password $ACR_APP_SECRET
-echo "login to ${CI_ACR} acr completed"
+echo $ACR_APP_SECRET | docker login $CI_ACR --username $ACR_APP_ID --password-stdin
+if [ $? -eq 0 ]; then
+   echo "login to acr: ${CI_ACR} completed successfully."
+else
+   echo "login to acr: ${CI_ACR} failed. Please see release task logs."
+   exit 1
+fi
+
 echo "pushing the image to ciprod acr"
 docker push ${CI_ACR}/public/azuremonitor/containerinsights/${CI_AGENT_REPO}:${imagetag}
-echo "pushing the image to ciprod acr completed"
+if [ $? -eq 0 ]; then
+   echo "pushing the image to ciprod acr completed successfully."
+else
+   echo "pushing the image to ciprod acr failed. Please see release task logs"
+   exit 1
+fi

 echo "end: pull windows agent image from cdpx and push to ciprod acr"
diff --git a/ReleaseProcess.md b/ReleaseProcess.md
index c6f51bb65..8ec91546c 100644
--- a/ReleaseProcess.md
+++ b/ReleaseProcess.md
@@ -35,20 +35,21 @@ Image automatically synched to MCR CN from Public cloud MCR.

 - Refer to internal docs for the release process and instructions.

-## ARO v3
-
-This needs to be co-ordinated with Red hat and ARO-RP team for the release and Red hat team will pick up the changes for the release.
-
 ## AKS-Engine

 Make PR against [AKS-Engine](https://github.com/Azure/aks-engine). Refer PR https://github.com/Azure/aks-engine/pull/2318

-## ARO v4, Azure Arc K8s and OpenShift v4 clusters
-
-Make sure azuremonitor-containers chart yamls updates with all changes going with the release and also make sure to bump the chart version, imagetag and docker provider version etc. Similar to agent container image, build pipeline automatically push the chart to container insights prod acr for canary and prod repos accordingly.
-Both the agent and helm chart will be replicated to `mcr.microsoft.com`.
+## Arc for Kubernetes

-The way, customers will be onboard the monitoring to these clusters using onboarding scripts under `onboarding\managed` directory so please bump chart version for prod release. Once we move to Arc K8s Monitoring extension Public preview, these will be taken care so at that point of time no manual changes like this required.
+The Ev2 pipeline is used to deploy the chart of the Arc K8s Container Insights Extension as per the Safe Deployment Process.
+Here is the high-level process:
+```
+ 1. Specify the chart version of the release candidate and trigger [container-insights-arc-k8s-extension-ci_prod-release](https://github-private.visualstudio.com/microsoft/_release?_a=releases&view=all)
+ 2. Get approval for the release from one of the team members
+ 3. Once approved, the release is triggered automatically
+ 4. Use `cimon-arck8s-eastus2euap` for validating the latest release in the canary region
+ 5.
TBD - Notify vendor team for the validation on all Arc K8s supported platforms +``` ## Microsoft Charts Repo release for On-prem K8s diff --git a/deployment/arc-k8s-extension/ServiceGroupRoot/Parameters/ContainerInsightsExtension.Parameters.json b/deployment/arc-k8s-extension/ServiceGroupRoot/Parameters/ContainerInsightsExtension.Parameters.json new file mode 100644 index 000000000..a8a99e9f6 --- /dev/null +++ b/deployment/arc-k8s-extension/ServiceGroupRoot/Parameters/ContainerInsightsExtension.Parameters.json @@ -0,0 +1,66 @@ +{ + "$schema": "http://schema.express.azure.com/schemas/2015-01-01-alpha/RolloutParameters.json", + "contentVersion": "1.0.0.0", + "wait": [ + { + "name": "waitSdpBakeTime", + "properties": { + "duration": "PT24H" + } + } + ], + "shellExtensions": [ + { + "name": "PushChartToACR", + "type": "ShellExtensionType", + "properties": { + "maxexecutiontime": "PT1H" + }, + "package": { + "reference": { + "path": "artifacts.tar.gz" + } + }, + "launch": { + "command": [ + "/bin/bash", + "pushChartToAcr.sh" + ], + "environmentVariables": [ + { + "name": "RELEASE_STAGE", + "value": "__RELEASE_STAGE__" + }, + { + "name": "ACR_APP_ID", + "reference": { + "provider": "AzureKeyVault", + "parameters": { + "secretId": "https://cibuildandreleasekv.vault.azure.net/secrets/ciprodacrappid/e8f47bf7505741ebaf65a4db16ff9fa7" + } + }, + "asSecureValue": "true" + }, + { + "name": "ACR_APP_SECRET", + "reference": { + "provider": "AzureKeyVault", + "parameters": { + "secretId": "https://cibuildandreleasekv.vault.azure.net/secrets/ciprodacrappsecret/8718afcdac114accb8b26f613cef1e1e" + } + }, + "asSecureValue": "true" + }, + { + "name": "ACR_NAME", + "value": "__ACR_NAME__" + }, + { + "name": "CHART_VERSION", + "value": "__CHART_VERSION__" + } + ] + } + } + ] +} diff --git a/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.Canary.RolloutSpec.json b/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.Canary.RolloutSpec.json new file mode 100644 index 000000000..cde103633 --- /dev/null +++ b/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.Canary.RolloutSpec.json @@ -0,0 +1,29 @@ +{ + "$schema": "http://schema.express.azure.com/schemas/2015-01-01-alpha/RolloutSpec.json", + "ContentVersion": "1.0.0.0", + "RolloutMetadata": { + "ServiceModelPath": "ServiceModels//Public.ServiceModel.json", + "ScopeBindingsPath": "ScopeBindings//Public.ScopeBindings.json", + "Name": "ContainerInsightsExtension-Canary", + "RolloutType": "Major", + "BuildSource": { + "Parameters": { + "VersionFile": "buildver.txt" + } + }, + "notification": { + "email": { + "to": "omscontainers@microsoft.com" + } + } + }, + "orchestratedSteps": [ + { + "name": "PushChartToACR", + "targetType": "ServiceResource", + "targetName": "PushChartToACR-Canary", + "actions": [ "Shell/PushChartToACR" ], + "dependsOn": [ ] + } + ] +} diff --git a/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.FF.RolloutSpec.json b/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.FF.RolloutSpec.json new file mode 100644 index 000000000..1749296c8 --- /dev/null +++ b/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.FF.RolloutSpec.json @@ -0,0 +1,29 @@ +{ + "$schema": "http://schema.express.azure.com/schemas/2015-01-01-alpha/RolloutSpec.json", + "ContentVersion": "1.0.0.0", + "RolloutMetadata": { + "ServiceModelPath": "ServiceModels//Public.ServiceModel.json", + "ScopeBindingsPath": "ScopeBindings//Public.ScopeBindings.json", + "Name": "ContainerInsightsExtension-FF", 
+ "RolloutType": "Major", + "BuildSource": { + "Parameters": { + "VersionFile": "buildver.txt" + } + }, + "notification": { + "email": { + "to": "omscontainers@microsoft.com" + } + } + }, + "orchestratedSteps": [ + { + "name": "PushChartToACR", + "targetType": "ServiceResource", + "targetName": "PushChartToACR-FF", + "actions": [ "wait/waitSdpBakeTime", "Shell/PushChartToACR" ], + "dependsOn": [ ] + } + ] +} diff --git a/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.HighLoad.RolloutSpec.json b/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.HighLoad.RolloutSpec.json new file mode 100644 index 000000000..50729b1ae --- /dev/null +++ b/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.HighLoad.RolloutSpec.json @@ -0,0 +1,29 @@ +{ + "$schema": "http://schema.express.azure.com/schemas/2015-01-01-alpha/RolloutSpec.json", + "ContentVersion": "1.0.0.0", + "RolloutMetadata": { + "ServiceModelPath": "ServiceModels//Public.ServiceModel.json", + "ScopeBindingsPath": "ScopeBindings//Public.ScopeBindings.json", + "Name": "ContainerInsightsExtension-Prod3", + "RolloutType": "Major", + "BuildSource": { + "Parameters": { + "VersionFile": "buildver.txt" + } + }, + "notification": { + "email": { + "to": "omscontainers@microsoft.com" + } + } + }, + "orchestratedSteps": [ + { + "name": "PushChartToACR", + "targetType": "ServiceResource", + "targetName": "PushChartToACR-HighLoad", + "actions": [ "wait/waitSdpBakeTime", "Shell/PushChartToACR" ], + "dependsOn": [ ] + } + ] +} diff --git a/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.LightLoad.RolloutSpec.json b/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.LightLoad.RolloutSpec.json new file mode 100644 index 000000000..edd61f852 --- /dev/null +++ b/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.LightLoad.RolloutSpec.json @@ -0,0 +1,29 @@ +{ + "$schema": "http://schema.express.azure.com/schemas/2015-01-01-alpha/RolloutSpec.json", + "ContentVersion": "1.0.0.0", + "RolloutMetadata": { + "ServiceModelPath": "ServiceModels//Public.ServiceModel.json", + "ScopeBindingsPath": "ScopeBindings//Public.ScopeBindings.json", + "Name": "ContainerInsightsExtension-Prod2", + "RolloutType": "Major", + "BuildSource": { + "Parameters": { + "VersionFile": "buildver.txt" + } + }, + "notification": { + "email": { + "to": "omscontainers@microsoft.com" + } + } + }, + "orchestratedSteps": [ + { + "name": "PushChartToACR", + "targetType": "ServiceResource", + "targetName": "PushChartToACR-LightLoad", + "actions": [ "wait/waitSdpBakeTime", "Shell/PushChartToACR" ], + "dependsOn": [ ] + } + ] +} diff --git a/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.MC.RolloutSpec.json b/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.MC.RolloutSpec.json new file mode 100644 index 000000000..014f4b092 --- /dev/null +++ b/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.MC.RolloutSpec.json @@ -0,0 +1,29 @@ +{ + "$schema": "http://schema.express.azure.com/schemas/2015-01-01-alpha/RolloutSpec.json", + "ContentVersion": "1.0.0.0", + "RolloutMetadata": { + "ServiceModelPath": "ServiceModels//Public.ServiceModel.json", + "ScopeBindingsPath": "ScopeBindings//Public.ScopeBindings.json", + "Name": "ContainerInsightsExtension-MC", + "RolloutType": "Major", + "BuildSource": { + "Parameters": { + "VersionFile": "buildver.txt" + } + }, + "notification": { + "email": { + "to": "omscontainers@microsoft.com" + } + } + }, + "orchestratedSteps": [ + { + 
"name": "PushChartToACR", + "targetType": "ServiceResource", + "targetName": "PushChartToACR-MC", + "actions": [ "wait/waitSdpBakeTime", "Shell/PushChartToACR" ], + "dependsOn": [ ] + } + ] +} diff --git a/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.MediumLoad.RolloutSpec.json b/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.MediumLoad.RolloutSpec.json new file mode 100644 index 000000000..cd1befbc3 --- /dev/null +++ b/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.MediumLoad.RolloutSpec.json @@ -0,0 +1,29 @@ +{ + "$schema": "http://schema.express.azure.com/schemas/2015-01-01-alpha/RolloutSpec.json", + "ContentVersion": "1.0.0.0", + "RolloutMetadata": { + "ServiceModelPath": "ServiceModels//Public.ServiceModel.json", + "ScopeBindingsPath": "ScopeBindings//Public.ScopeBindings.json", + "Name": "ContainerInsightsExtension-Prod2", + "RolloutType": "Major", + "BuildSource": { + "Parameters": { + "VersionFile": "buildver.txt" + } + }, + "notification": { + "email": { + "to": "omscontainers@microsoft.com" + } + } + }, + "orchestratedSteps": [ + { + "name": "PushChartToACR", + "targetType": "ServiceResource", + "targetName": "PushChartToACR-MediumLoad", + "actions": ["wait/waitSdpBakeTime", "Shell/PushChartToACR" ], + "dependsOn": [ ] + } + ] +} diff --git a/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.Pilot.RolloutSpec.json b/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.Pilot.RolloutSpec.json new file mode 100644 index 000000000..48c99fce1 --- /dev/null +++ b/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.Pilot.RolloutSpec.json @@ -0,0 +1,29 @@ +{ + "$schema": "http://schema.express.azure.com/schemas/2015-01-01-alpha/RolloutSpec.json", + "ContentVersion": "1.0.0.0", + "RolloutMetadata": { + "ServiceModelPath": "ServiceModels//Public.ServiceModel.json", + "ScopeBindingsPath": "ScopeBindings//Public.ScopeBindings.json", + "Name": "ContainerInsightsExtension-Pilot", + "RolloutType": "Major", + "BuildSource": { + "Parameters": { + "VersionFile": "buildver.txt" + } + }, + "notification": { + "email": { + "to": "omscontainers@microsoft.com" + } + } + }, + "orchestratedSteps": [ + { + "name": "PushChartToACR", + "targetType": "ServiceResource", + "targetName": "PushChartToACR-Pilot", + "actions": [ "wait/waitSdpBakeTime", "Shell/PushChartToACR"], + "dependsOn": [ ] + } + ] +} diff --git a/deployment/arc-k8s-extension/ServiceGroupRoot/ScopeBindings/Public.ScopeBindings.json b/deployment/arc-k8s-extension/ServiceGroupRoot/ScopeBindings/Public.ScopeBindings.json new file mode 100644 index 000000000..516eba3e2 --- /dev/null +++ b/deployment/arc-k8s-extension/ServiceGroupRoot/ScopeBindings/Public.ScopeBindings.json @@ -0,0 +1,125 @@ +{ + "$schema": "https://ev2schema.azure.net/schemas/2020-01-01/scopeBindings.json", + "contentVersion": "0.0.0.1", + "scopeBindings": [ + { + "scopeTagName": "Canary", + "bindings": [ + { + "find": "__RELEASE_STAGE__", + "replaceWith": "Canary" + }, + { + "find": "__ACR_NAME__", + "replaceWith": "$(ACRName)" + }, + { + "find": "__CHART_VERSION__", + "replaceWith": "$(ChartVersion)" + } + ] + }, + { + "scopeTagName": "Pilot", + "bindings": [ + { + "find": "__RELEASE_STAGE__", + "replaceWith": "Pilot" + }, + { + "find": "__ACR_NAME__", + "replaceWith": "$(ACRName)" + }, + { + "find": "__CHART_VERSION__", + "replaceWith": "$(ChartVersion)" + } + ] + }, + { + "scopeTagName": "LightLoad", + "bindings": [ + { + "find": "__RELEASE_STAGE__", + "replaceWith": 
"MediumLow" + }, + { + "find": "__ACR_NAME__", + "replaceWith": "$(ACRName)" + }, + { + "find": "__CHART_VERSION__", + "replaceWith": "$(ChartVersion)" + } + ] + }, + { + "scopeTagName": "MediumLoad", + "bindings": [ + { + "find": "__RELEASE_STAGE__", + "replaceWith": "MediumHigh" + }, + { + "find": "__ACR_NAME__", + "replaceWith": "$(ACRName)" + }, + { + "find": "__CHART_VERSION__", + "replaceWith": "$(ChartVersion)" + } + ] + }, + { + "scopeTagName": "HighLoad", + "bindings": [ + { + "find": "__RELEASE_STAGE__", + "replaceWith": "HighLoad" + }, + { + "find": "__ACR_NAME__", + "replaceWith": "$(ACRName)" + }, + { + "find": "__CHART_VERSION__", + "replaceWith": "$(ChartVersion)" + } + ] + }, + { + "scopeTagName": "FF", + "bindings": [ + { + "find": "__RELEASE_STAGE__", + "replaceWith": "FF" + }, + { + "find": "__ACR_NAME__", + "replaceWith": "$(ACRName)" + }, + { + "find": "__CHART_VERSION__", + "replaceWith": "$(ChartVersion)" + } + ] + }, + { + "scopeTagName": "MC", + "bindings": [ + { + "find": "__RELEASE_STAGE__", + "replaceWith": "MC" + }, + { + "find": "__ACR_NAME__", + "replaceWith": "$(ACRName)" + }, + { + "find": "__CHART_VERSION__", + "replaceWith": "$(ChartVersion)" + } + ] + } + ] +} diff --git a/deployment/arc-k8s-extension/ServiceGroupRoot/Scripts/pushChartToAcr.sh b/deployment/arc-k8s-extension/ServiceGroupRoot/Scripts/pushChartToAcr.sh new file mode 100644 index 000000000..520557592 --- /dev/null +++ b/deployment/arc-k8s-extension/ServiceGroupRoot/Scripts/pushChartToAcr.sh @@ -0,0 +1,181 @@ +#!/bin/bash + +export HELM_EXPERIMENTAL_OCI=1 +export MCR_NAME="mcr.microsoft.com" +# for prod-> stable and for test -> preview +export REPO_TYPE="stable" + +# repo paths for arc k8s extension roll-out +# canary region +export CANARY_REGION_REPO_PATH="azuremonitor/containerinsights/canary/${REPO_TYPE}/azuremonitor-containers" +# pilot region +export PILOT_REGION_REPO_PATH="azuremonitor/containerinsights/prod1/${REPO_TYPE}/azuremonitor-containers" +# light load regions +export LIGHT_LOAD_REGION_REPO_PATH="azuremonitor/containerinsights/prod2/${REPO_TYPE}/azuremonitor-containers" +# medium load regions +export MEDIUM_LOAD_REGION_REPO_PATH="azuremonitor/containerinsights/prod3/${REPO_TYPE}/azuremonitor-containers" +# high load regions +export HIGH_LOAD_REGION_REPO_PATH="azuremonitor/containerinsights/prod4/${REPO_TYPE}/azuremonitor-containers" +# FairFax regions +export FF_REGION_REPO_PATH="azuremonitor/containerinsights/prod5/${REPO_TYPE}/azuremonitor-containers" +# Mooncake regions +export MC_REGION_REPO_PATH="azuremonitor/containerinsights/prod6/${REPO_TYPE}/azuremonitor-containers" + +# pull chart from previous stage mcr and push chart to next stage acr +pull_chart_from_source_mcr_to_push_to_dest_acr() { + srcMcrFullPath=${1} + destAcrFullPath=${2} + + if [ -z $srcMcrFullPath ]; then + echo "-e error source mcr path must be provided " + exit 1 + fi + + if [ -z $destAcrFullPath ]; then + echo "-e error dest acr path must be provided " + exit 1 + fi + + echo "Pulling chart from MCR:${srcMcrFullPath} ..." + helm chart pull ${srcMcrFullPath} + if [ $? -eq 0 ]; then + echo "Pulling chart from MCR:${srcMcrFullPath} completed successfully." + else + echo "-e error Pulling chart from MCR:${srcMcrFullPath} failed. Please review Ev2 pipeline logs for more details on the error." + exit 1 + fi + + echo "Exporting chart to current directory ..." + helm chart export ${srcMcrFullPath} + if [ $? -eq 0 ]; then + echo "Exporting chart to current directory completed successfully." 
+ else + echo "-e error Exporting chart to current directory failed. Please review Ev2 pipeline logs for more details on the error." + exit 1 + fi + + echo "save the chart locally with dest acr full path : ${destAcrFullPath} ..." + helm chart save azuremonitor-containers/ ${destAcrFullPath} + if [ $? -eq 0 ]; then + echo "save the chart locally with dest acr full path : ${destAcrFullPath} completed successfully." + else + echo "-e error save the chart locally with dest acr full path : ${destAcrFullPath} failed. Please review Ev2 pipeline logs for more details on the error." + exit 1 + fi + + echo "pushing the chart to acr path: ${destAcrFullPath} ..." + helm chart push ${destAcrFullPath} + if [ $? -eq 0 ]; then + echo "pushing the chart to acr path: ${destAcrFullPath} completed successfully." + else + echo "-e error pushing the chart to acr path: ${destAcrFullPath} failed. Please review Ev2 pipeline logs for more details on the error." + exit 1 + fi +} + +# push to local release candidate chart to canary region +push_local_chart_to_canary_region() { + destAcrFullPath=${1} + if [ -z $destAcrFullPath ]; then + echo "-e error dest acr path must be provided " + exit 1 + fi + + echo "save the chart locally with dest acr full path : ${destAcrFullPath} ..." + helm chart save charts/azuremonitor-containers/ $destAcrFullPath + if [ $? -eq 0 ]; then + echo "save the chart locally with dest acr full path : ${destAcrFullPath} completed." + else + echo "-e error save the chart locally with dest acr full path : ${destAcrFullPath} failed. Please review Ev2 pipeline logs for more details on the error." + exit 1 + fi + + echo "pushing the chart to acr path: ${destAcrFullPath} ..." + helm chart push $destAcrFullPath + if [ $? -eq 0 ]; then + echo "pushing the chart to acr path: ${destAcrFullPath} completed successfully." + else + echo "-e error pushing the chart to acr path: ${destAcrFullPath} failed.Please review Ev2 pipeline logs for more details on the error." + exit 1 + fi +} + +echo "START - Release stage : ${RELEASE_STAGE}" + +# login to acr +echo "Using acr : ${ACR_NAME}" +echo "Using acr repo type: ${REPO_TYPE}" + +echo "login to acr:${ACR_NAME} using helm ..." +echo $ACR_APP_SECRET | helm registry login $ACR_NAME --username $ACR_APP_ID --password-stdin +if [ $? -eq 0 ]; then + echo "login to acr:${ACR_NAME} using helm completed successfully." +else + echo "-e error login to acr:${ACR_NAME} using helm failed. Please review Ev2 pipeline logs for more details on the error." 
+ exit 1 +fi + +case $RELEASE_STAGE in + + Canary) + echo "START: Release stage - Canary" + destAcrFullPath=${ACR_NAME}/public/${CANARY_REGION_REPO_PATH}:${CHART_VERSION} + push_local_chart_to_canary_region $destAcrFullPath + echo "END: Release stage - Canary" + ;; + + Pilot | Prod1) + echo "START: Release stage - Pilot" + srcMcrFullPath=${MCR_NAME}/${CANARY_REGION_REPO_PATH}:${CHART_VERSION} + destAcrFullPath=${ACR_NAME}/public/${PILOT_REGION_REPO_PATH}:${CHART_VERSION} + pull_chart_from_source_mcr_to_push_to_dest_acr $srcMcrFullPath $destAcrFullPath + echo "END: Release stage - Pilot" + ;; + + LightLoad | Pord2) + echo "START: Release stage - Light Load Regions" + srcMcrFullPath=${MCR_NAME}/${PILOT_REGION_REPO_PATH}:${CHART_VERSION} + destAcrFullPath=${ACR_NAME}/public/${LIGHT_LOAD_REGION_REPO_PATH}:${CHART_VERSION} + pull_chart_from_source_mcr_to_push_to_dest_acr $srcMcrFullPath $destAcrFullPath + echo "END: Release stage - Light Load Regions" + ;; + + MediumLoad | Prod3) + echo "START: Release stage - Medium Load Regions" + srcMcrFullPath=${MCR_NAME}/${LIGHT_LOAD_REGION_REPO_PATH}:${CHART_VERSION} + destAcrFullPath=${ACR_NAME}/public/${MEDIUM_LOAD_REGION_REPO_PATH}:${CHART_VERSION} + pull_chart_from_source_mcr_to_push_to_dest_acr $srcMcrFullPath $destAcrFullPath + echo "END: Release stage - Medium Load Regions" + ;; + + HighLoad | Prod4) + echo "START: Release stage - High Load Regions" + srcMcrFullPath=${MCR_NAME}/${MEDIUM_LOAD_REGION_REPO_PATH}:${CHART_VERSION} + destAcrFullPath=${ACR_NAME}/public/${HIGH_LOAD_REGION_REPO_PATH}:${CHART_VERSION} + pull_chart_from_source_mcr_to_push_to_dest_acr $srcMcrFullPath $destAcrFullPath + echo "END: Release stage - High Load Regions" + ;; + + FF | Prod5) + echo "START: Release stage - FF" + srcMcrFullPath=${MCR_NAME}/${HIGH_LOAD_REGION_REPO_PATH}:${CHART_VERSION} + destAcrFullPath=${ACR_NAME}/public/${FF_REGION_REPO_PATH}:${CHART_VERSION} + pull_chart_from_source_mcr_to_push_to_dest_acr $srcMcrFullPath $destAcrFullPath + echo "END: Release stage - FF" + ;; + + MC | Prod6) + echo "START: Release stage - MC" + srcMcrFullPath=${MCR_NAME}/${FF_REGION_REPO_PATH}:${CHART_VERSION} + destAcrFullPath=${ACR_NAME}/public/${MC_REGION_REPO_PATH}:${CHART_VERSION} + pull_chart_from_source_mcr_to_push_to_dest_acr $srcMcrFullPath $destAcrFullPath + echo "END: Release stage - MC" + ;; + + *) + echo -n "unknown release stage" + exit 1 + ;; +esac + +echo "END - Release stage : ${RELEASE_STAGE}" diff --git a/deployment/arc-k8s-extension/ServiceGroupRoot/ServiceModels/Public.ServiceModel.json b/deployment/arc-k8s-extension/ServiceGroupRoot/ServiceModels/Public.ServiceModel.json new file mode 100644 index 000000000..71081661a --- /dev/null +++ b/deployment/arc-k8s-extension/ServiceGroupRoot/ServiceModels/Public.ServiceModel.json @@ -0,0 +1,159 @@ +{ + "$schema": "http://schema.express.azure.com/schemas/2015-01-01-alpha/ServiceModel.json", + "ContentVersion": "0.0.0.1", + "ServiceMetadata": { + "ServiceGroup": "ContainerInsightsExtension", + "Environment": "Prod" + }, + "ServiceResourceGroupDefinitions": [ + { + "Name": "ARC-Extension-ServiceResourceGroupDefinition", + "ServiceResourceDefinitions": [ + { + "Name": "ShellExtension", + "ComposedOf": { + "Extension": { + "Shell": [ + { + "type": "ShellExtensionType", + "properties": { + "imageName": "adm-ubuntu-1804-l", + "imageVersion": "v18" + } + } + ] + } + } + } + ] + } + ], + "ServiceResourceGroups": [ + { + "AzureResourceGroupName": "ContainerInsightsExtension-Canary-Release", + "Location": "eastus2", + 
"InstanceOf": "ARC-Extension-ServiceResourceGroupDefinition", + "AzureSubscriptionId": "5fab7b6f-6150-42fe-89e1-0f07a0a9a46f", + "ScopeTags": [ + { + "Name": "Canary" + } + ], + "ServiceResources": [ + { + "Name": "PushChartToACR-Canary", + "InstanceOf": "ShellExtension", + "RolloutParametersPath": "Parameters\\ContainerInsightsExtension.Parameters.json" + } + ] + }, + { + "AzureResourceGroupName": "ContainerInsightsExtension-Pilot-Release", + "Location": "eastus2", + "InstanceOf": "ARC-Extension-ServiceResourceGroupDefinition", + "AzureSubscriptionId": "5fab7b6f-6150-42fe-89e1-0f07a0a9a46f", + "ScopeTags": [ + { + "Name": "Pilot" + } + ], + "ServiceResources": [ + { + "Name": "PushChartToACR-Pilot", + "InstanceOf": "ShellExtension", + "RolloutParametersPath": "Parameters\\ContainerInsightsExtension.Parameters.json" + } + ] + }, + { + "AzureResourceGroupName": "ContainerInsightsExtension-LightLoad-Release", + "Location": "eastus2", + "InstanceOf": "ARC-Extension-ServiceResourceGroupDefinition", + "AzureSubscriptionId": "5fab7b6f-6150-42fe-89e1-0f07a0a9a46f", + "ScopeTags": [ + { + "Name": "LightLoad" + } + ], + "ServiceResources": [ + { + "Name": "PushChartToACR-LightLoad", + "InstanceOf": "ShellExtension", + "RolloutParametersPath": "Parameters\\ContainerInsightsExtension.Parameters.json" + } + ] + }, + { + "AzureResourceGroupName": "ContainerInsightsExtension-MediumLoad-Release", + "Location": "eastus2", + "InstanceOf": "ARC-Extension-ServiceResourceGroupDefinition", + "AzureSubscriptionId": "5fab7b6f-6150-42fe-89e1-0f07a0a9a46f", + "ScopeTags": [ + { + "Name": "MediumLoad" + } + ], + "ServiceResources": [ + { + "Name": "PushChartToACR-MediumLoad", + "InstanceOf": "ShellExtension", + "RolloutParametersPath": "Parameters\\ContainerInsightsExtension.Parameters.json" + } + ] + }, + { + "AzureResourceGroupName": "ContainerInsightsExtension-HighLoad-Release", + "Location": "eastus2", + "InstanceOf": "ARC-Extension-ServiceResourceGroupDefinition", + "AzureSubscriptionId": "5fab7b6f-6150-42fe-89e1-0f07a0a9a46f", + "ScopeTags": [ + { + "Name": "HighLoad" + } + ], + "ServiceResources": [ + { + "Name": "PushChartToACR-HighLoad", + "InstanceOf": "ShellExtension", + "RolloutParametersPath": "Parameters\\ContainerInsightsExtension.Parameters.json" + } + ] + }, + { + "AzureResourceGroupName": "ContainerInsightsExtension-FF-Release", + "Location": "eastus2", + "InstanceOf": "ARC-Extension-ServiceResourceGroupDefinition", + "AzureSubscriptionId": "5fab7b6f-6150-42fe-89e1-0f07a0a9a46f", + "ScopeTags": [ + { + "Name": "FF" + } + ], + "ServiceResources": [ + { + "Name": "PushChartToACR-FF", + "InstanceOf": "ShellExtension", + "RolloutParametersPath": "Parameters\\ContainerInsightsExtension.Parameters.json" + } + ] + }, + { + "AzureResourceGroupName": "ContainerInsightsExtension-MC-Release", + "Location": "eastus2", + "InstanceOf": "ARC-Extension-ServiceResourceGroupDefinition", + "AzureSubscriptionId": "5fab7b6f-6150-42fe-89e1-0f07a0a9a46f", + "ScopeTags": [ + { + "Name": "MC" + } + ], + "ServiceResources": [ + { + "Name": "PushChartToACR-MC", + "InstanceOf": "ShellExtension", + "RolloutParametersPath": "Parameters\\ContainerInsightsExtension.Parameters.json" + } + ] + } + ] + } diff --git a/deployment/arc-k8s-extension/ServiceGroupRoot/buildver.txt b/deployment/arc-k8s-extension/ServiceGroupRoot/buildver.txt new file mode 100644 index 000000000..1921233b3 --- /dev/null +++ b/deployment/arc-k8s-extension/ServiceGroupRoot/buildver.txt @@ -0,0 +1 @@ +1.0.0.0 From 97678b679a9467350b64060b97bf3d355fc64874 Mon 
Sep 17 00:00:00 2001 From: David Michelman Date: Fri, 9 Apr 2021 14:29:00 -0700 Subject: [PATCH 088/194] added liveness and telemetry for telegraf (#517) * added liveness and telemetry for telegraf * code transfer * removed windows liveness probe * done --- .../installer/conf/td-agent-bit-prom-side-car.conf | 12 ++++++++++++ build/linux/installer/conf/td-agent-bit-rs.conf | 12 ++++++++++++ build/linux/installer/conf/td-agent-bit.conf | 12 ++++++++++++ build/linux/installer/scripts/livenessprobe.sh | 9 +++++++++ kubernetes/linux/main.sh | 4 ++++ 5 files changed, 49 insertions(+) diff --git a/build/linux/installer/conf/td-agent-bit-prom-side-car.conf b/build/linux/installer/conf/td-agent-bit-prom-side-car.conf index 720f54820..339e509b0 100644 --- a/build/linux/installer/conf/td-agent-bit-prom-side-car.conf +++ b/build/linux/installer/conf/td-agent-bit-prom-side-car.conf @@ -11,6 +11,18 @@ Parsers_File /etc/opt/microsoft/docker-cimprov/azm-containers-parser.conf Log_File /var/opt/microsoft/docker-cimprov/log/fluent-bit.log +[INPUT] + Name tail + Tag oms.container.log.flbplugin.terminationlog.* + Path /dev/write-to-traces + DB /var/opt/microsoft/docker-cimprov/state/terminationlog-ai.db + DB.Sync Off + Parser docker + Mem_Buf_Limit 1m + Path_Key filepath + Skip_Long_Lines On + Ignore_Older 2m + [INPUT] Name tcp Tag oms.container.perf.telegraf.* diff --git a/build/linux/installer/conf/td-agent-bit-rs.conf b/build/linux/installer/conf/td-agent-bit-rs.conf index 696ac80e6..c94b4c40e 100644 --- a/build/linux/installer/conf/td-agent-bit-rs.conf +++ b/build/linux/installer/conf/td-agent-bit-rs.conf @@ -10,6 +10,18 @@ Parsers_File /etc/opt/microsoft/docker-cimprov/azm-containers-parser.conf Log_File /var/opt/microsoft/docker-cimprov/log/fluent-bit.log +[INPUT] + Name tail + Tag oms.container.log.flbplugin.terminationlog.* + Path /dev/write-to-traces + DB /var/opt/microsoft/docker-cimprov/state/terminationlog-ai.db + DB.Sync Off + Parser docker + Mem_Buf_Limit 1m + Path_Key filepath + Skip_Long_Lines On + Ignore_Older 2m + [INPUT] Name tcp Tag oms.container.perf.telegraf.* diff --git a/build/linux/installer/conf/td-agent-bit.conf b/build/linux/installer/conf/td-agent-bit.conf index 484a4bbbf..287c076dc 100644 --- a/build/linux/installer/conf/td-agent-bit.conf +++ b/build/linux/installer/conf/td-agent-bit.conf @@ -52,6 +52,18 @@ Skip_Long_Lines On Ignore_Older 2m +[INPUT] + Name tail + Tag oms.container.log.flbplugin.terminationlog.* + Path /dev/write-to-traces + DB /var/opt/microsoft/docker-cimprov/state/terminationlog-ai.db + DB.Sync Off + Parser docker + Mem_Buf_Limit 1m + Path_Key filepath + Skip_Long_Lines On + Ignore_Older 2m + [INPUT] Name tcp Tag oms.container.perf.telegraf.* diff --git a/build/linux/installer/scripts/livenessprobe.sh b/build/linux/installer/scripts/livenessprobe.sh index a82fa28eb..e3b0fc28e 100644 --- a/build/linux/installer/scripts/livenessprobe.sh +++ b/build/linux/installer/scripts/livenessprobe.sh @@ -26,6 +26,15 @@ then exit 1 fi +#test to exit non zero value if telegraf is not running +(ps -ef | grep telegraf | grep -v "grep") +if [ $? 
-ne 0 ] +then + echo "Telegraf is not running" > /dev/termination-log + echo "Telegraf is not running (controller: ${CONTROLLER_TYPE}, container type: ${CONTAINER_TYPE})" > /dev/write-to-traces # this file is tailed and sent to traces + exit 1 +fi + if [ -s "inotifyoutput.txt" ] then # inotifyoutput file has data (config map was applied) diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index 71e46875b..81db6f3a4 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -699,6 +699,10 @@ dpkg -l | grep td-agent-bit | awk '{print $2 " " $3}' +# Create the file the liveness probe writes to; it is tailed and forwarded to telemetry as traces +touch /dev/write-to-traces + + echo "stopping rsyslog..." service rsyslog stop From 63ea896b7b7c270320678289eb0468690c1e24bd Mon Sep 17 00:00:00 2001 From: David Michelman Date: Tue, 13 Apr 2021 12:48:00 -0700 Subject: [PATCH 089/194] Windows metric fix (#530) * changes * about to remove container fix * moved caching code to existing loop * removed unnecessary changes * removed a few more unnecessary changes * added windows node check * fixed a bug * everything works confirmed --- source/plugins/ruby/filter_cadvisor2mdm.rb | 42 +++++++--- source/plugins/ruby/in_kube_nodes.rb | 92 ++++++++++++++++++++++ 2 files changed, 125 insertions(+), 9 deletions(-) diff --git a/source/plugins/ruby/filter_cadvisor2mdm.rb b/source/plugins/ruby/filter_cadvisor2mdm.rb index 8d7e729c8..659e3000c 100644 --- a/source/plugins/ruby/filter_cadvisor2mdm.rb +++ b/source/plugins/ruby/filter_cadvisor2mdm.rb @@ -9,6 +9,7 @@ module Fluent require_relative "CustomMetricsUtils" require_relative "kubelet_utils" require_relative "MdmMetricsGenerator" + require_relative "in_kube_nodes" class CAdvisor2MdmFilter < Filter Fluent::Plugin.register_filter("filter_cadvisor2mdm", self) @@ -23,6 +24,7 @@ class CAdvisor2MdmFilter < Filter @metrics_to_collect_hash = {} @@metric_threshold_hash = {} + @@controller_type = "" def initialize super @@ -63,6 +65,7 @@ def start @containerResourceDimensionHash = {} @pvUsageHash = {} @@metric_threshold_hash = MdmMetricsGenerator.getContainerResourceUtilizationThresholds + @NodeCache = Fluent::NodeStatsCache.new() end rescue => e @log.info "Error initializing plugin #{e}" @@ -161,19 +164,40 @@ def filter(tag, time, record) if counter_name == Constants::CPU_USAGE_NANO_CORES metric_name = Constants::CPU_USAGE_MILLI_CORES metric_value /= 1000000 #cadvisor record is in nanocores.
Convert to mc - @log.info "Metric_value: #{metric_value} CPU Capacity #{@cpu_capacity}" - if @cpu_capacity != 0.0 - percentage_metric_value = (metric_value) * 100 / @cpu_capacity + if @@controller_type.downcase == "replicaset" + target_node_cpu_capacity_mc = @NodeCache.cpu.get_capacity(record["DataItems"][0]["Host"]) / 1000000 + else + target_node_cpu_capacity_mc = @cpu_capacity + end + @log.info "Metric_value: #{metric_value} CPU Capacity #{target_node_cpu_capacity_mc}" + if target_node_cpu_capacity_mc != 0.0 + percentage_metric_value = (metric_value) * 100 / target_node_cpu_capacity_mc + end end if counter_name.start_with?("memory") metric_name = counter_name - if @memory_capacity != 0.0 - percentage_metric_value = metric_value * 100 / @memory_capacity + if @@controller_type.downcase == "replicaset" + target_node_mem_capacity = @NodeCache.mem.get_capacity(record["DataItems"][0]["Host"]) + else + target_node_mem_capacity = @memory_capacity + end + @log.info "Metric_value: #{metric_value} Memory Capacity #{target_node_mem_capacity}" + if target_node_mem_capacity != 0.0 + percentage_metric_value = metric_value * 100 / target_node_mem_capacity end + end + @log.info "percentage_metric_value for metric: #{metric_name} for instance: #{record["DataItems"][0]["Host"]} percentage: #{percentage_metric_value}" + + # sanity check: the computed percentage should fall within [0, 100]; send telemetry when it does not + if percentage_metric_value > 100.0 or percentage_metric_value < 0.0 + telemetryProperties = {} + telemetryProperties["Computer"] = record["DataItems"][0]["Host"] + telemetryProperties["MetricName"] = metric_name + telemetryProperties["MetricPercentageValue"] = percentage_metric_value + ApplicationInsightsUtility.sendCustomEvent("ErrorPercentageOutOfBounds", telemetryProperties) end - # return get_metric_records(record, metric_name, metric_value, percentage_metric_value) + return MdmMetricsGenerator.getNodeResourceMetricRecords(record, metric_name, metric_value, percentage_metric_value) elsif object_name == Constants::OBJECT_NAME_K8S_CONTAINER && @metrics_to_collect_hash.key?(counter_name.downcase) instanceName = record["DataItems"][0]["InstanceName"] @@ -279,8 +303,8 @@ def ensure_cpu_memory_capacity_set return end - controller_type = ENV["CONTROLLER_TYPE"] - if controller_type.downcase == "replicaset" + @@controller_type = ENV["CONTROLLER_TYPE"] + if @@controller_type.downcase == "replicaset" @log.info "ensure_cpu_memory_capacity_set @cpu_capacity #{@cpu_capacity} @memory_capacity #{@memory_capacity}" begin @@ -306,7 +330,7 @@ def ensure_cpu_memory_capacity_set @log.info "Error getting memory_capacity" end end - elsif controller_type.downcase == "daemonset" + elsif @@controller_type.downcase == "daemonset" capacity_from_kubelet = KubeletUtils.get_node_capacity # Error handling in case /metrics/cadvisor endpoint fails diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index d4b54c340..99e804302 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -43,6 +43,8 @@ def initialize @nodeInventoryE2EProcessingLatencyMs = 0 @nodesAPIE2ELatencyMs = 0 require_relative "constants" + + @NodeCache = NodeStatsCache.new() end config_param :run_interval, :time, :default => 60 @@ -197,6 +199,15 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) end end + # Only CPU and memory capacity for Windows nodes are added to the cache (defined at the end of this file) + is_windows_node = false + if !item["status"].nil? && !item["status"]["nodeInfo"].nil?
&& !item["status"]["nodeInfo"]["operatingSystem"].nil? + operatingSystem = item["status"]["nodeInfo"]["operatingSystem"] + if (operatingSystem.is_a?(String) && operatingSystem.casecmp("windows") == 0) + is_windows_node = true + end + end + # node metrics records nodeMetricRecords = [] nodeMetricRecord = KubernetesApiClient.parseNodeLimitsFromNodeItem(item, "allocatable", "cpu", "cpuAllocatableNanoCores", batchTime) @@ -210,10 +221,18 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) nodeMetricRecord = KubernetesApiClient.parseNodeLimitsFromNodeItem(item, "capacity", "cpu", "cpuCapacityNanoCores", batchTime) if !nodeMetricRecord.nil? && !nodeMetricRecord.empty? nodeMetricRecords.push(nodeMetricRecord) + # add data to the cache so filter_cadvisor2mdm.rb can use it + if is_windows_node + @NodeCache.cpu.set_capacity(nodeMetricRecord["DataItems"][0]["Host"], nodeMetricRecord["DataItems"][0]["Collections"][0]["Value"]) + end end nodeMetricRecord = KubernetesApiClient.parseNodeLimitsFromNodeItem(item, "capacity", "memory", "memoryCapacityBytes", batchTime) if !nodeMetricRecord.nil? && !nodeMetricRecord.empty? nodeMetricRecords.push(nodeMetricRecord) + # add data to the cache so filter_cadvisor2mdm.rb can use it + if is_windows_node + @NodeCache.mem.set_capacity(nodeMetricRecord["DataItems"][0]["Host"], nodeMetricRecord["DataItems"][0]["Collections"][0]["Value"]) + end end nodeMetricRecords.each do |metricRecord| metricRecord["DataType"] = "LINUX_PERF_BLOB" @@ -496,4 +515,77 @@ def getNodeTelemetryProps(item) return properties end end # Kube_Node_Input + + + class NodeStatsCache + # inner class for caching implementation (CPU and memory caching is handled the exact same way, so logic to do so is moved to a private inner class) + # (to reduce code duplication) + class NodeCache + + @@RECORD_TIME_TO_LIVE = 60*20 # units are seconds, so clear the cache every 20 minutes. + + def initialize + @cacheHash = {} + @timeAdded = {} # records when an entry was last added + @lock = Mutex.new + @lastCacheClearTime = 0 + + @cacheHash.default = 0.0 + @lastCacheClearTime = DateTime.now.to_time.to_i + end + + def get_capacity(node_name) + @lock.synchronize do + retval = @cacheHash[node_name] + return retval + end + end + + def set_capacity(host, val) + # check here if the cache has not been cleaned in a while. This way calling code doesn't have to remember to clean the cache + current_time = DateTime.now.to_time.to_i + if current_time - @lastCacheClearTime > @@RECORD_TIME_TO_LIVE + clean_cache + @lastCacheClearTime = current_time + end + + @lock.synchronize do + @cacheHash[host] = val + @timeAdded[host] = current_time + end + end + + def clean_cache() + $log.info "in_kube_nodes::clean_cache: cleaning node cpu/mem cache" + cacheClearTime = DateTime.now.to_time.to_i + @lock.synchronize do + nodes_to_remove = [] # first make a list of nodes to remove, then remove them. This intermediate + # list is used so that we aren't modifying a hash while iterating through it. 
+ @cacheHash.each do |key, val| + if cacheClearTime - @timeAdded[key] > @@RECORD_TIME_TO_LIVE + nodes_to_remove.append(key) + end + end + + nodes_to_remove.each do |node_name| + @cacheHash.delete(node_name) + @timeAdded.delete(node_name) + end + end + end + end # NodeCache + + + @@cpuCache = NodeCache.new + @@memCache = NodeCache.new + + def cpu() + return @@cpuCache + end + + def mem() + return @@memCache + end + end + end # module From 42730a47a852d2299a0c995a602980385e52598b Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Tue, 13 Apr 2021 13:50:25 -0700 Subject: [PATCH 090/194] OSM doc update (#533) --- Documentation/OSMPrivatePreview/ReadMe.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/OSMPrivatePreview/ReadMe.md b/Documentation/OSMPrivatePreview/ReadMe.md index 1becd80b5..aa90c7413 100644 --- a/Documentation/OSMPrivatePreview/ReadMe.md +++ b/Documentation/OSMPrivatePreview/ReadMe.md @@ -1,3 +1,5 @@ +Note - This is a private preview. For any support issues, please reach out to us at [askcoin@microsoft.com](mailto:askcoin@microsoft.com). Please don't open a support ticket. + # Azure Monitor Container Insights Open Service Mesh Monitoring Azure Monitor container insights now supporting preview of [Open Service Mesh(OSM)](https://docs.microsoft.com/azure/aks/servicemesh-osm-about) Monitoring. As part of this support, customer can: @@ -64,5 +66,6 @@ InsightsMetrics ### Known Issues 1. The workbook has scale limits of 50 pods per namespace. If you have more than 50 pods in mesh you can have workbook loading issues. 2. When source or destination is osmcontroller we show no latency & for internal services we show no resource utilization. +3. When both Prometheus scraping via pod annotations and OSM monitoring are enabled on the same set of namespaces, the default set of metrics (envoy_cluster_upstream_cx_total, envoy_cluster_upstream_cx_connect_fail, envoy_cluster_upstream_rq, envoy_cluster_upstream_rq_xx, envoy_cluster_upstream_rq_total, envoy_cluster_upstream_rq_time_bucket, envoy_cluster_upstream_cx_rx_bytes_total, envoy_cluster_upstream_cx_tx_bytes_total, envoy_cluster_upstream_cx_active) is collected twice. To work around this issue, exclude these namespaces from pod annotation scraping via the setting monitor_kubernetes_pods_namespaces, as described in [this](https://docs.microsoft.com/en-us/azure/azure-monitor/containers/container-insights-prometheus-integration#prometheus-scraping-settings) documentation. This is a private preview; the goal for us is to get feedback. Please feel free to reach out to us at [askcoin@microsoft.com](mailto:askcoin@microsoft.com) for any feedback and questions!
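A minimal usage sketch of the NodeStatsCache added in PATCH 089 above (a sketch only: it assumes the plugin's load path is set up, and the node name and capacity numbers are illustrative, not taken from the patch):

require_relative "in_kube_nodes"

cache = Fluent::NodeStatsCache.new

# in_kube_nodes.rb calls set_capacity for Windows nodes while emitting node perf records
cache.cpu.set_capacity("aks-win-node-000000", 4000000000)   # CPU capacity in nanocores
cache.mem.set_capacity("aks-win-node-000000", 17179869184)  # memory capacity in bytes

# filter_cadvisor2mdm.rb looks capacity up by host name; a miss returns the hash
# default of 0.0, which the caller must treat as "capacity unknown"
cache.cpu.get_capacity("aks-win-node-000000")  # => 4000000000
cache.cpu.get_capacity("no-such-node")         # => 0.0

# entries older than RECORD_TIME_TO_LIVE (20 minutes) are purged lazily: the next
# set_capacity call triggers clean_cache, so nodes that left the cluster age out

Note that the cpu and mem caches are class-level (@@cpuCache, @@memCache), so the daemonset input and the filter plugin share one cache per process even though each constructs its own NodeStatsCache.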
From 7ad52cdb7f15a94abf78927ce0a6969965361af4 Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Wed, 14 Apr 2021 10:28:27 -0700 Subject: [PATCH 091/194] Adding MDM metrics for threshold violation (#531) --- source/plugins/ruby/MdmAlertTemplates.rb | 67 +++++++++++- source/plugins/ruby/MdmMetricsGenerator.rb | 119 +++++++++++++++------ source/plugins/ruby/constants.rb | 10 +- 3 files changed, 161 insertions(+), 35 deletions(-) diff --git a/source/plugins/ruby/MdmAlertTemplates.rb b/source/plugins/ruby/MdmAlertTemplates.rb index ef63cf219..f2b713ff6 100644 --- a/source/plugins/ruby/MdmAlertTemplates.rb +++ b/source/plugins/ruby/MdmAlertTemplates.rb @@ -28,7 +28,7 @@ class MdmAlertTemplates } }' - Stable_job_metrics_template = ' + Stable_job_metrics_template = ' { "time": "%{timestamp}", "data": { @@ -90,6 +90,39 @@ class MdmAlertTemplates } }' + Container_resource_threshold_violation_template = ' + { + "time": "%{timestamp}", + "data": { + "baseData": { + "metric": "%{metricName}", + "namespace": "insights.container/containers", + "dimNames": [ + "containerName", + "podName", + "controllerName", + "Kubernetes namespace", + "thresholdPercentage" + ], + "series": [ + { + "dimValues": [ + "%{containerNameDimValue}", + "%{podNameDimValue}", + "%{controllerNameDimValue}", + "%{namespaceDimValue}", + "%{thresholdPercentageDimValue}" + ], + "min": %{containerResourceThresholdViolated}, + "max": %{containerResourceThresholdViolated}, + "sum": %{containerResourceThresholdViolated}, + "count": 1 + } + ] + } + } + }' + PV_resource_utilization_template = ' { "time": "%{timestamp}", @@ -123,6 +156,38 @@ class MdmAlertTemplates } }' + PV_resource_threshold_violation_template = ' + { + "time": "%{timestamp}", + "data": { + "baseData": { + "metric": "%{metricName}", + "namespace": "insights.container/persistentvolumes", + "dimNames": [ + "podName", + "node", + "kubernetesNamespace", + "volumeName", + "thresholdPercentage" + ], + "series": [ + { + "dimValues": [ + "%{podNameDimValue}", + "%{computerNameDimValue}", + "%{namespaceDimValue}", + "%{volumeNameDimValue}", + "%{thresholdPercentageDimValue}" + ], + "min": %{pvResourceThresholdViolated}, + "max": %{pvResourceThresholdViolated}, + "sum": %{pvResourceThresholdViolated}, + "count": 1 + } + ] + } + } + }' Node_resource_metrics_template = ' { diff --git a/source/plugins/ruby/MdmMetricsGenerator.rb b/source/plugins/ruby/MdmMetricsGenerator.rb index 12d462e44..8703f43a7 100644 --- a/source/plugins/ruby/MdmMetricsGenerator.rb +++ b/source/plugins/ruby/MdmMetricsGenerator.rb @@ -39,10 +39,21 @@ class MdmMetricsGenerator Constants::MEMORY_WORKING_SET_BYTES => Constants::MDM_CONTAINER_MEMORY_WORKING_SET_UTILIZATION_METRIC, } + @@container_metric_name_metric_threshold_violated_hash = { + Constants::CPU_USAGE_MILLI_CORES => Constants::MDM_CONTAINER_CPU_THRESHOLD_VIOLATED_METRIC, + Constants::CPU_USAGE_NANO_CORES => Constants::MDM_CONTAINER_CPU_THRESHOLD_VIOLATED_METRIC, + Constants::MEMORY_RSS_BYTES => Constants::MDM_CONTAINER_MEMORY_RSS_THRESHOLD_VIOLATED_METRIC, + Constants::MEMORY_WORKING_SET_BYTES => Constants::MDM_CONTAINER_MEMORY_WORKING_SET_THRESHOLD_VIOLATED_METRIC, + } + @@pod_metric_name_metric_percentage_name_hash = { Constants::PV_USED_BYTES => Constants::MDM_PV_UTILIZATION_METRIC, } + @@pod_metric_name_metric_threshold_violated_hash = { + Constants::PV_USED_BYTES => Constants::MDM_PV_THRESHOLD_VIOLATED_METRIC, + } + # Setting this to true since we need to send zero filled metrics at startup. 
If metrics are absent alert creation fails @sendZeroFilledMetrics = true @zeroFilledMetricsTimeTracker = DateTime.now.to_time.to_i @@ -158,43 +169,63 @@ def zeroFillMetricRecords(records, batch_time) metric_threshold_hash = getContainerResourceUtilizationThresholds container_zero_fill_dims = [Constants::OMSAGENT_ZERO_FILL, Constants::OMSAGENT_ZERO_FILL, Constants::OMSAGENT_ZERO_FILL, Constants::KUBESYSTEM_NAMESPACE_ZERO_FILL].join("~~") - containerCpuRecord = getContainerResourceUtilMetricRecords(batch_time, - Constants::CPU_USAGE_NANO_CORES, - 0, - container_zero_fill_dims, - metric_threshold_hash[Constants::CPU_USAGE_NANO_CORES]) - if !containerCpuRecord.nil? && !containerCpuRecord.empty? && !containerCpuRecord[0].nil? && !containerCpuRecord[0].empty? - records.push(containerCpuRecord[0]) + containerCpuRecords = getContainerResourceUtilMetricRecords(batch_time, + Constants::CPU_USAGE_NANO_CORES, + 0, + container_zero_fill_dims, + metric_threshold_hash[Constants::CPU_USAGE_NANO_CORES], + true) + if !containerCpuRecords.nil? && !containerCpuRecords.empty? + containerCpuRecords.each { |cpuRecord| + if !cpuRecord.nil? && !cpuRecord.empty? + records.push(cpuRecord) + end + } end - containerMemoryRssRecord = getContainerResourceUtilMetricRecords(batch_time, - Constants::MEMORY_RSS_BYTES, - 0, - container_zero_fill_dims, - metric_threshold_hash[Constants::MEMORY_RSS_BYTES]) - if !containerMemoryRssRecord.nil? && !containerMemoryRssRecord.empty? && !containerMemoryRssRecord[0].nil? && !containerMemoryRssRecord[0].empty? - records.push(containerMemoryRssRecord[0]) + containerMemoryRssRecords = getContainerResourceUtilMetricRecords(batch_time, + Constants::MEMORY_RSS_BYTES, + 0, + container_zero_fill_dims, + metric_threshold_hash[Constants::MEMORY_RSS_BYTES], + true) + if !containerMemoryRssRecords.nil? && !containerMemoryRssRecords.empty? + containerMemoryRssRecords.each { |memoryRssRecord| + if !memoryRssRecord.nil? && !memoryRssRecord.empty? + records.push(memoryRssRecord) + end + } end - containerMemoryWorkingSetRecord = getContainerResourceUtilMetricRecords(batch_time, - Constants::MEMORY_WORKING_SET_BYTES, - 0, - container_zero_fill_dims, - metric_threshold_hash[Constants::MEMORY_WORKING_SET_BYTES]) - if !containerMemoryWorkingSetRecord.nil? && !containerMemoryWorkingSetRecord.empty? && !containerMemoryWorkingSetRecord[0].nil? && !containerMemoryWorkingSetRecord[0].empty? - records.push(containerMemoryWorkingSetRecord[0]) + containerMemoryWorkingSetRecords = getContainerResourceUtilMetricRecords(batch_time, + Constants::MEMORY_WORKING_SET_BYTES, + 0, + container_zero_fill_dims, + metric_threshold_hash[Constants::MEMORY_WORKING_SET_BYTES], + true) + if !containerMemoryWorkingSetRecords.nil? && !containerMemoryWorkingSetRecords.empty? + containerMemoryWorkingSetRecords.each { |workingSetRecord| + if !workingSetRecord.nil? && !workingSetRecord.empty? + records.push(workingSetRecord) + end + } end pvZeroFillDims = {} pvZeroFillDims[Constants::INSIGHTSMETRICS_TAGS_PVC_NAMESPACE] = Constants::KUBESYSTEM_NAMESPACE_ZERO_FILL pvZeroFillDims[Constants::INSIGHTSMETRICS_TAGS_POD_NAME] = Constants::OMSAGENT_ZERO_FILL pvZeroFillDims[Constants::INSIGHTSMETRICS_TAGS_VOLUME_NAME] = Constants::VOLUME_NAME_ZERO_FILL - pvResourceUtilMetricRecord = getPVResourceUtilMetricRecords(batch_time, - Constants::PV_USED_BYTES, - @@hostName, - 0, - pvZeroFillDims, - metric_threshold_hash[Constants::PV_USED_BYTES]) - if !pvResourceUtilMetricRecord.nil? && !pvResourceUtilMetricRecord.empty? 
&& !pvResourceUtilMetricRecord[0].nil? && !pvResourceUtilMetricRecord[0].empty? - records.push(pvResourceUtilMetricRecord[0]) + pvResourceUtilMetricRecords = getPVResourceUtilMetricRecords(batch_time, + Constants::PV_USED_BYTES, + @@hostName, + 0, + pvZeroFillDims, + metric_threshold_hash[Constants::PV_USED_BYTES], + true) + if !pvResourceUtilMetricRecords.nil? && !pvResourceUtilMetricRecords.empty? + pvResourceUtilMetricRecords.each { |pvRecord| + if !pvRecord.nil? && !pvRecord.empty? + records.push(pvRecord) + end + } end rescue => errorStr @log.info "Error in zeroFillMetricRecords: #{errorStr}" @@ -247,7 +278,7 @@ def appendAllPodMetrics(records, batch_time) return records end - def getContainerResourceUtilMetricRecords(recordTimeStamp, metricName, percentageMetricValue, dims, thresholdPercentage) + def getContainerResourceUtilMetricRecords(recordTimeStamp, metricName, percentageMetricValue, dims, thresholdPercentage, isZeroFill = false) records = [] begin if dims.nil? @@ -276,6 +307,19 @@ def getContainerResourceUtilMetricRecords(recordTimeStamp, metricName, percentag thresholdPercentageDimValue: thresholdPercentage, } records.push(Yajl::Parser.parse(StringIO.new(resourceUtilRecord))) + + # Adding another metric for threshold violation + resourceThresholdViolatedRecord = MdmAlertTemplates::Container_resource_threshold_violation_template % { + timestamp: recordTimeStamp, + metricName: @@container_metric_name_metric_threshold_violated_hash[metricName], + containerNameDimValue: containerName, + podNameDimValue: podName, + controllerNameDimValue: controllerName, + namespaceDimValue: podNamespace, + containerResourceThresholdViolated: isZeroFill ? 0 : 1, + thresholdPercentageDimValue: thresholdPercentage, + } + records.push(Yajl::Parser.parse(StringIO.new(resourceThresholdViolatedRecord))) rescue => errorStr @log.info "Error in getContainerResourceUtilMetricRecords: #{errorStr}" ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) @@ -283,7 +327,7 @@ def getContainerResourceUtilMetricRecords(recordTimeStamp, metricName, percentag return records end - def getPVResourceUtilMetricRecords(recordTimeStamp, metricName, computer, percentageMetricValue, dims, thresholdPercentage) + def getPVResourceUtilMetricRecords(recordTimeStamp, metricName, computer, percentageMetricValue, dims, thresholdPercentage, isZeroFill = false) records = [] begin containerName = dims[Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] @@ -303,6 +347,19 @@ def getPVResourceUtilMetricRecords(recordTimeStamp, metricName, computer, percen thresholdPercentageDimValue: thresholdPercentage, } records.push(Yajl::Parser.parse(StringIO.new(resourceUtilRecord))) + + # Adding another metric for threshold violation + resourceThresholdViolatedRecord = MdmAlertTemplates::PV_resource_threshold_violation_template % { + timestamp: recordTimeStamp, + metricName: @@pod_metric_name_metric_threshold_violated_hash[metricName], + podNameDimValue: podName, + computerNameDimValue: computer, + namespaceDimValue: pvcNamespace, + volumeNameDimValue: volumeName, + pvResourceThresholdViolated: isZeroFill ? 
0 : 1, + thresholdPercentageDimValue: thresholdPercentage, + } + records.push(Yajl::Parser.parse(StringIO.new(resourceThresholdViolatedRecord))) rescue => errorStr @log.info "Error in getPVResourceUtilMetricRecords: #{errorStr}" ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index cf41900dc..e0b0d1e0c 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -53,6 +53,10 @@ class Constants MDM_CONTAINER_MEMORY_RSS_UTILIZATION_METRIC = "memoryRssExceededPercentage" MDM_CONTAINER_MEMORY_WORKING_SET_UTILIZATION_METRIC = "memoryWorkingSetExceededPercentage" MDM_PV_UTILIZATION_METRIC = "pvUsageExceededPercentage" + MDM_CONTAINER_CPU_THRESHOLD_VIOLATED_METRIC = "cpuThresholdViolated" + MDM_CONTAINER_MEMORY_RSS_THRESHOLD_VIOLATED_METRIC = "memoryRssThresholdViolated" + MDM_CONTAINER_MEMORY_WORKING_SET_THRESHOLD_VIOLATED_METRIC = "memoryWorkingSetThresholdViolated" + MDM_PV_THRESHOLD_VIOLATED_METRIC = "pvUsageThresholdViolated" MDM_NODE_CPU_USAGE_PERCENTAGE = "cpuUsagePercentage" MDM_NODE_MEMORY_RSS_PERCENTAGE = "memoryRssPercentage" MDM_NODE_MEMORY_WORKING_SET_PERCENTAGE = "memoryWorkingSetPercentage" @@ -77,9 +81,9 @@ class Constants OMSAGENT_ZERO_FILL = "omsagent" KUBESYSTEM_NAMESPACE_ZERO_FILL = "kube-system" VOLUME_NAME_ZERO_FILL = "-" - PV_TYPES =["awsElasticBlockStore", "azureDisk", "azureFile", "cephfs", "cinder", "csi", "fc", "flexVolume", - "flocker", "gcePersistentDisk", "glusterfs", "hostPath", "iscsi", "local", "nfs", - "photonPersistentDisk", "portworxVolume", "quobyte", "rbd", "scaleIO", "storageos", "vsphereVolume"] + PV_TYPES = ["awsElasticBlockStore", "azureDisk", "azureFile", "cephfs", "cinder", "csi", "fc", "flexVolume", + "flocker", "gcePersistentDisk", "glusterfs", "hostPath", "iscsi", "local", "nfs", + "photonPersistentDisk", "portworxVolume", "quobyte", "rbd", "scaleIO", "storageos", "vsphereVolume"] #Telemetry constants CONTAINER_METRICS_HEART_BEAT_EVENT = "ContainerMetricsMdmHeartBeatEvent" From 34d1f64f89dd07168680aed955cbf5dfbe467885 Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Tue, 20 Apr 2021 17:51:23 -0700 Subject: [PATCH 092/194] Rashmi/april agent 2021 (#538) --- .../scripts/tomlparser-mdm-metrics-config.rb | 21 ++++++++++ kubernetes/container-azm-ms-agentconfig.yaml | 5 +++ source/plugins/ruby/KubernetesApiClient.rb | 34 ++++++++++++++++- source/plugins/ruby/MdmAlertTemplates.rb | 2 +- source/plugins/ruby/MdmMetricsGenerator.rb | 38 +++++++++++++++---- source/plugins/ruby/constants.rb | 3 +- source/plugins/ruby/podinventory_to_mdm.rb | 3 +- 7 files changed, 95 insertions(+), 11 deletions(-) diff --git a/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb b/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb index 345c51633..5ce5d79d2 100644 --- a/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb +++ b/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb @@ -13,6 +13,7 @@ @percentageMemoryRssThreshold = Constants::DEFAULT_MDM_MEMORY_RSS_THRESHOLD @percentageMemoryWorkingSetThreshold = Constants::DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD @percentagePVUsageThreshold = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD +@jobCompletionThresholdMinutes = Constants::DEFAULT_MDM_JOB_COMPLETED_TIME_THRESHOLD_MINUTES # Use parser to parse the configmap toml file to a ruby structure def parseConfigMap @@ -101,6 +102,25 @@ def populateSettingValuesFromConfigMap(parsedConfig) 
ConfigParseErrorLogger.logError("Exception while reading config map settings for MDM metric configuration settings for PV utilization - #{errorStr}, using defaults, please check config map for errors") @percentagePVUsageThreshold = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD end + + # Get mdm metrics config settings for job completion + begin + jobCompletion = parsedConfig[:alertable_metrics_configuration_settings][:job_completion_threshold] + if !jobCompletion.nil? + jobCompletionThreshold = jobCompletion[:job_completion_threshold_time_minutes] + jobCompletionThresholdInt = jobCompletionThreshold.to_i + if jobCompletionThresholdInt.kind_of? Integer + @jobCompletionThresholdMinutes = jobCompletionThresholdInt + else + puts "config::Non interger value or value not convertible to integer specified for job completion threshold, using default " + @jobCompletionThresholdMinutes = Constants::DEFAULT_MDM_JOB_COMPLETED_TIME_THRESHOLD_MINUTES + end + puts "config::Using config map settings for MDM metric configuration settings for job completion" + end + rescue => errorStr + ConfigParseErrorLogger.logError("Exception while reading config map settings for MDM metric configuration settings for job completion - #{errorStr}, using defaults, please check config map for errors") + @jobCompletionThresholdMinutes = Constants::DEFAULT_MDM_JOB_COMPLETED_TIME_THRESHOLD_MINUTES + end end end @@ -125,6 +145,7 @@ def populateSettingValuesFromConfigMap(parsedConfig) file.write("export AZMON_ALERT_CONTAINER_MEMORY_RSS_THRESHOLD=#{@percentageMemoryRssThreshold}\n") file.write("export AZMON_ALERT_CONTAINER_MEMORY_WORKING_SET_THRESHOLD=\"#{@percentageMemoryWorkingSetThreshold}\"\n") file.write("export AZMON_ALERT_PV_USAGE_THRESHOLD=#{@percentagePVUsageThreshold}\n") + file.write("export AZMON_ALERT_JOB_COMPLETION_TIME_THRESHOLD=#{@jobCompletionThresholdMinutes}\n") # Close file after writing all MDM setting environment variables file.close puts "****************End MDM Metrics Config Processing********************" diff --git a/kubernetes/container-azm-ms-agentconfig.yaml b/kubernetes/container-azm-ms-agentconfig.yaml index e38d9b4ab..543f270c1 100644 --- a/kubernetes/container-azm-ms-agentconfig.yaml +++ b/kubernetes/container-azm-ms-agentconfig.yaml @@ -126,6 +126,11 @@ data: [alertable_metrics_configuration_settings.pv_utilization_thresholds] # Threshold for persistent volume usage bytes, metric will be sent only when persistent volume utilization exceeds or becomes equal to the following percentage pv_usage_threshold_percentage = 60.0 + + # Alertable metrics configuration settings for completed jobs count + [alertable_metrics_configuration_settings.job_completion_threshold] + # Threshold for completed job count , metric will be sent only for those jobs which were completed earlier than the following threshold + job_completion_threshold_time_minutes = 360 integrations: |- [integrations.azure_network_policy_manager] collect_basic_metrics = false diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index c5a363741..98347d272 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -31,6 +31,8 @@ class KubernetesApiClient @@TokenStr = nil @@NodeMetrics = Hash.new @@WinNodeArray = [] + @@telemetryTimeTracker = DateTime.now.to_time.to_i + @@resourceLimitsTelemetryHash = {} def initialize end @@ -403,9 +405,12 @@ def getPodUid(podNameSpace, podMetadata) def getContainerResourceRequestsAndLimits(pod, metricCategory, 
metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601) metricItems = [] + timeDifference = (DateTime.now.to_time.to_i - @@telemetryTimeTracker).abs + timeDifferenceInMinutes = timeDifference / 60 begin clusterId = getClusterId podNameSpace = pod["metadata"]["namespace"] + podName = pod["metadata"]["name"] podUid = getPodUid(podNameSpace, pod["metadata"]) if podUid.nil? return metricItems @@ -456,6 +461,33 @@ def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToColle metricProps["Collections"].push(metricCollections) metricItem["DataItems"].push(metricProps) metricItems.push(metricItem) + #Telemetry about omsagent requests and limits + begin + if (podName.downcase.start_with?("omsagent-") && podNameSpace.eql?("kube-system") && containerName.downcase.start_with?("omsagent")) + nodePodContainerKey = [nodeName, podName, containerName, metricNametoReturn].join("~~") + @@resourceLimitsTelemetryHash[nodePodContainerKey] = metricValue + end + if (timeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) + @@resourceLimitsTelemetryHash.each { |key, value| + keyElements = key.split("~~") + if keyElements.length != 4 + next + end + + # get dimension values by key + telemetryProps = {} + telemetryProps["Computer"] = keyElements[0] + telemetryProps["PodName"] = keyElements[1] + telemetryProps["ContainerName"] = keyElements[2] + metricNameFromKey = keyElements[3] + ApplicationInsightsUtility.sendMetricTelemetry(metricNameFromKey, value, telemetryProps) + } + @@telemetryTimeTracker = DateTime.now.to_time.to_i + @@resourceLimitsTelemetryHash = {} + end + rescue => errorStr + $log.warn("Exception while generating telemetry from getContainerResourceRequestsAndLimits: #{errorStr} for metric #{metricNameToCollect}") + end #No container level limit for the given metric, so default to node level limit else nodeMetricsHashKey = clusterId + "/" + nodeName + "_" + "allocatable" + "_" + metricNameToCollect @@ -791,7 +823,7 @@ def getKubeAPIServerUrl def getKubeServicesInventoryRecords(serviceList, batchTime = Time.utc.iso8601) kubeServiceRecords = [] begin - if (!serviceList.nil? && !serviceList.empty? && serviceList.key?("items") && !serviceList["items"].nil? && !serviceList["items"].empty? ) + if (!serviceList.nil? && !serviceList.empty? && serviceList.key?("items") && !serviceList["items"].nil? && !serviceList["items"].empty?)
servicesCount = serviceList["items"].length @Log.info("KubernetesApiClient::getKubeServicesInventoryRecords : number of services in serviceList #{servicesCount} @ #{Time.now.utc.iso8601}") serviceList["items"].each do |item| diff --git a/source/plugins/ruby/MdmAlertTemplates.rb b/source/plugins/ruby/MdmAlertTemplates.rb index f2b713ff6..e889c3f09 100644 --- a/source/plugins/ruby/MdmAlertTemplates.rb +++ b/source/plugins/ruby/MdmAlertTemplates.rb @@ -45,7 +45,7 @@ class MdmAlertTemplates "dimValues": [ "%{controllerNameDimValue}", "%{namespaceDimValue}", - "6" + "%{jobCompletionThreshold}" ], "min": %{containerCountMetricValue}, "max": %{containerCountMetricValue}, diff --git a/source/plugins/ruby/MdmMetricsGenerator.rb b/source/plugins/ruby/MdmMetricsGenerator.rb index 8703f43a7..f2aa92c14 100644 --- a/source/plugins/ruby/MdmMetricsGenerator.rb +++ b/source/plugins/ruby/MdmMetricsGenerator.rb @@ -107,13 +107,28 @@ def appendPodMetrics(records, metricName, metricHash, batch_time, metricsTemplat podControllerNameDimValue = key_elements[0] podNamespaceDimValue = key_elements[1] - record = metricsTemplate % { - timestamp: batch_time, - metricName: metricName, - controllerNameDimValue: podControllerNameDimValue, - namespaceDimValue: podNamespaceDimValue, - containerCountMetricValue: value, - } + # Special handling for jobs since we need to send the threshold as a dimension because it is configurable + if metricName == Constants::MDM_STALE_COMPLETED_JOB_COUNT + metric_threshold_hash = getContainerResourceUtilizationThresholds + #Converting this to hours since we already have olderThanHours dimension. + jobCompletionThresholdHours = metric_threshold_hash[Constants::JOB_COMPLETION_TIME] / 60.0 + record = metricsTemplate % { + timestamp: batch_time, + metricName: metricName, + controllerNameDimValue: podControllerNameDimValue, + namespaceDimValue: podNamespaceDimValue, + containerCountMetricValue: value, + jobCompletionThreshold: jobCompletionThresholdHours, + } + else + record = metricsTemplate % { + timestamp: batch_time, + metricName: metricName, + controllerNameDimValue: podControllerNameDimValue, + namespaceDimValue: podNamespaceDimValue, + containerCountMetricValue: value, + } + end records.push(Yajl::Parser.parse(StringIO.new(record))) } else @@ -140,9 +155,11 @@ def flushPodMdmMetricTelemetry staleJobHashValues = @stale_job_count_hash.values staleJobMetricCount = staleJobHashValues.inject(0) { |sum, x| sum + x } + metric_threshold_hash = getContainerResourceUtilizationThresholds properties["ContainerRestarts"] = containerRestartMetricCount properties["OomKilledContainers"] = oomKilledContainerMetricCount properties["OldCompletedJobs"] = staleJobMetricCount + properties["JobCompletionThresholdTimeInMinutes"] = metric_threshold_hash[Constants::JOB_COMPLETION_TIME] ApplicationInsightsUtility.sendCustomEvent(Constants::CONTAINER_METRICS_HEART_BEAT_EVENT, properties) ApplicationInsightsUtility.sendCustomEvent(Constants::POD_READY_PERCENTAGE_HEART_BEAT_EVENT, {}) rescue => errorStr @@ -465,6 +482,7 @@ def getContainerResourceUtilizationThresholds metric_threshold_hash[Constants::MEMORY_RSS_BYTES] = Constants::DEFAULT_MDM_MEMORY_RSS_THRESHOLD metric_threshold_hash[Constants::MEMORY_WORKING_SET_BYTES] = Constants::DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD metric_threshold_hash[Constants::PV_USED_BYTES] = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD + metric_threshold_hash[Constants::JOB_COMPLETION_TIME] = Constants::DEFAULT_MDM_JOB_COMPLETED_TIME_THRESHOLD_MINUTES cpuThreshold =
ENV["AZMON_ALERT_CONTAINER_CPU_THRESHOLD"] if !cpuThreshold.nil? && !cpuThreshold.empty? @@ -490,6 +508,12 @@ def getContainerResourceUtilizationThresholds pvUsagePercentageThresholdFloat = (pvUsagePercentageThreshold.to_f).round(2) metric_threshold_hash[Constants::PV_USED_BYTES] = pvUsagePercentageThresholdFloat end + + jobCompletionTimeThreshold = ENV["AZMON_ALERT_JOB_COMPLETION_TIME_THRESHOLD"] + if !jobCompletionTimeThreshold.nil? && !jobCompletionTimeThreshold.empty? + jobCompletionTimeThresholdInt = jobCompletionTimeThreshold.to_i + metric_threshold_hash[Constants::JOB_COMPLETION_TIME] = jobCompletionTimeThresholdInt + end rescue => errorStr @log.info "Error in getContainerResourceUtilizationThresholds: #{errorStr}" ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index e0b0d1e0c..906019b95 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -69,14 +69,15 @@ class Constants MEMORY_WORKING_SET_BYTES = "memoryWorkingSetBytes" MEMORY_RSS_BYTES = "memoryRssBytes" PV_USED_BYTES = "pvUsedBytes" + JOB_COMPLETION_TIME = "completedJobTimeMinutes" DEFAULT_MDM_CPU_UTILIZATION_THRESHOLD = 95.0 DEFAULT_MDM_MEMORY_RSS_THRESHOLD = 95.0 DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD = 95.0 DEFAULT_MDM_PV_UTILIZATION_THRESHOLD = 60.0 + DEFAULT_MDM_JOB_COMPLETED_TIME_THRESHOLD_MINUTES = 360 CONTROLLER_KIND_JOB = "job" CONTAINER_TERMINATION_REASON_COMPLETED = "completed" CONTAINER_STATE_TERMINATED = "terminated" - STALE_JOB_TIME_IN_MINUTES = 360 TELEGRAF_DISK_METRICS = "container.azm.ms/disk" OMSAGENT_ZERO_FILL = "omsagent" KUBESYSTEM_NAMESPACE_ZERO_FILL = "kube-system" diff --git a/source/plugins/ruby/podinventory_to_mdm.rb b/source/plugins/ruby/podinventory_to_mdm.rb index 77370e284..d9cb71bd4 100644 --- a/source/plugins/ruby/podinventory_to_mdm.rb +++ b/source/plugins/ruby/podinventory_to_mdm.rb @@ -88,6 +88,7 @@ def initialize() @pod_count_by_phase = {} @pod_uids = {} @process_incoming_stream = CustomMetricsUtils.check_custom_metrics_availability + @metric_threshold_hash = MdmMetricsGenerator.getContainerResourceUtilizationThresholds @log.debug "After check_custom_metrics_availability process_incoming_stream #{@process_incoming_stream}" @log.debug { "Starting podinventory_to_mdm plugin" } end @@ -259,7 +260,7 @@ def process_record_for_terminated_job_metric(podControllerNameDimValue, podNames if !containerFinishedTime.nil? && !containerFinishedTime.empty? finishedTimeParsed = Time.parse(containerFinishedTime) # Check to see if job was completed 6 hours ago/STALE_JOB_TIME_IN_MINUTES - if ((Time.now - finishedTimeParsed) / 60) > Constants::STALE_JOB_TIME_IN_MINUTES + if ((Time.now - finishedTimeParsed) / 60) > @metric_threshold_hash[Constants::JOB_COMPLETION_TIME] MdmMetricsGenerator.generateStaleJobCountMetrics(podControllerNameDimValue, podNamespaceDimValue) end From fcc50480ce1c56a14657bf75c54609340e1c23e2 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Wed, 21 Apr 2021 09:57:02 -0700 Subject: [PATCH 093/194] add Read_from_Head config for all fluentbit tail plugins (#539) See the commit message of: fluent/fluent-bit@70e33fa for details explaining the fluentbit change and what Read_from_Head does when set to true. 
--- build/linux/installer/conf/td-agent-bit-prom-side-car.conf | 1 + build/linux/installer/conf/td-agent-bit-rs.conf | 1 + build/linux/installer/conf/td-agent-bit.conf | 4 ++++ 3 files changed, 6 insertions(+) diff --git a/build/linux/installer/conf/td-agent-bit-prom-side-car.conf b/build/linux/installer/conf/td-agent-bit-prom-side-car.conf index 339e509b0..05fa3afd2 100644 --- a/build/linux/installer/conf/td-agent-bit-prom-side-car.conf +++ b/build/linux/installer/conf/td-agent-bit-prom-side-car.conf @@ -15,6 +15,7 @@ Name tail Tag oms.container.log.flbplugin.terminationlog.* Path /dev/write-to-traces + Read_from_Head true DB /var/opt/microsoft/docker-cimprov/state/terminationlog-ai.db DB.Sync Off Parser docker diff --git a/build/linux/installer/conf/td-agent-bit-rs.conf b/build/linux/installer/conf/td-agent-bit-rs.conf index c94b4c40e..9613c270d 100644 --- a/build/linux/installer/conf/td-agent-bit-rs.conf +++ b/build/linux/installer/conf/td-agent-bit-rs.conf @@ -14,6 +14,7 @@ Name tail Tag oms.container.log.flbplugin.terminationlog.* Path /dev/write-to-traces + Read_from_Head true DB /var/opt/microsoft/docker-cimprov/state/terminationlog-ai.db DB.Sync Off Parser docker diff --git a/build/linux/installer/conf/td-agent-bit.conf b/build/linux/installer/conf/td-agent-bit.conf index 287c076dc..045aefcaf 100644 --- a/build/linux/installer/conf/td-agent-bit.conf +++ b/build/linux/installer/conf/td-agent-bit.conf @@ -15,6 +15,7 @@ Name tail Tag oms.container.log.la.* Path ${AZMON_LOG_TAIL_PATH} + Read_from_Head true DB /var/log/omsagent-fblogs.db DB.Sync Off Parser docker @@ -32,6 +33,7 @@ Name tail Tag oms.container.log.flbplugin.* Path /var/log/containers/omsagent*.log + Read_from_Head true DB /var/opt/microsoft/docker-cimprov/state/omsagent-ai.db DB.Sync Off Parser docker @@ -44,6 +46,7 @@ Name tail Tag oms.container.log.flbplugin.mdsd.* Path /var/opt/microsoft/linuxmonagent/log/mdsd.err + Read_from_Head true DB /var/opt/microsoft/docker-cimprov/state/mdsd-ai.db DB.Sync Off Parser docker @@ -56,6 +59,7 @@ Name tail Tag oms.container.log.flbplugin.terminationlog.* Path /dev/write-to-traces + Read_from_Head true DB /var/opt/microsoft/docker-cimprov/state/terminationlog-ai.db DB.Sync Off Parser docker From 01e5529bc8bead27d04607db7b087a0645d1e7db Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Wed, 21 Apr 2021 20:06:15 -0700 Subject: [PATCH 094/194] fix programdata mount issue on containerd win nodes (#542) --- .../templates/omsagent-daemonset-windows.yaml | 1 + kubernetes/omsagent.yaml | 1 + 2 files changed, 2 insertions(+) diff --git a/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml b/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml index 8868b86bb..580ef9d15 100644 --- a/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml @@ -118,6 +118,7 @@ spec: - name: docker-windows-containers hostPath: path: C:\ProgramData\docker\containers + type: DirectoryOrCreate - name: settings-vol-config configMap: name: container-azm-ms-agentconfig diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 206d9a8f0..e98b8ace3 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -839,6 +839,7 @@ spec: - name: docker-windows-containers hostPath: path: C:\ProgramData\docker\containers + type: DirectoryOrCreate - name: settings-vol-config configMap: name: container-azm-ms-agentconfig From b5d074afecff2998c8171f8b3b10e25fc9b21ccf 
Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Thu, 22 Apr 2021 13:52:43 -0700 Subject: [PATCH 095/194] Update sidecar mem limits (#541) --- build/linux/installer/conf/td-agent-bit-prom-side-car.conf | 6 +++--- kubernetes/omsagent.yaml | 2 +- source/plugins/ruby/MdmMetricsGenerator.rb | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/build/linux/installer/conf/td-agent-bit-prom-side-car.conf b/build/linux/installer/conf/td-agent-bit-prom-side-car.conf index 05fa3afd2..8a69f7995 100644 --- a/build/linux/installer/conf/td-agent-bit-prom-side-car.conf +++ b/build/linux/installer/conf/td-agent-bit-prom-side-car.conf @@ -29,9 +29,9 @@ Tag oms.container.perf.telegraf.* Listen 0.0.0.0 Port 25229 - Chunk_Size 1m - Buffer_Size 1m - Mem_Buf_Limit 20m + Chunk_Size 10m + Buffer_Size 10m + Mem_Buf_Limit 200m [OUTPUT] Name oms diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index e98b8ace3..fc3428a26 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -451,7 +451,7 @@ spec: resources: limits: cpu: 500m - memory: 400Mi + memory: 1Gi requests: cpu: 75m memory: 225Mi diff --git a/source/plugins/ruby/MdmMetricsGenerator.rb b/source/plugins/ruby/MdmMetricsGenerator.rb index f2aa92c14..6641456af 100644 --- a/source/plugins/ruby/MdmMetricsGenerator.rb +++ b/source/plugins/ruby/MdmMetricsGenerator.rb @@ -111,7 +111,7 @@ def appendPodMetrics(records, metricName, metricHash, batch_time, metricsTemplat if metricName == Constants::MDM_STALE_COMPLETED_JOB_COUNT metric_threshold_hash = getContainerResourceUtilizationThresholds #Converting this to hours since we already have olderThanHours dimension. - jobCompletionThresholdHours = metric_threshold_hash[Constants::JOB_COMPLETION_TIME] / 60.0 + jobCompletionThresholdHours = (metric_threshold_hash[Constants::JOB_COMPLETION_TIME] / 60.0).round(2) record = metricsTemplate % { timestamp: batch_time, metricName: metricName, From 5feeb3e3e617f2ac286e7c222e5e8573c2543361 Mon Sep 17 00:00:00 2001 From: David Michelman Date: Thu, 22 Apr 2021 14:51:51 -0700 Subject: [PATCH 096/194] David/release 4 22 2021 (#544) * updating image tag and agent version * updated liveness probe * updated release notes again * fixed date in version file --- ReleaseNotes.md | 17 +++++++++++++++++ build/version | 4 ++-- charts/azuremonitor-containers/Chart.yaml | 2 +- charts/azuremonitor-containers/values.yaml | 6 +++--- kubernetes/linux/Dockerfile | 2 +- kubernetes/omsagent.yaml | 14 +++++++------- kubernetes/windows/Dockerfile | 2 +- .../onboarding/managed/enable-monitoring.ps1 | 2 +- scripts/onboarding/managed/enable-monitoring.sh | 2 +- .../onboarding/managed/upgrade-monitoring.sh | 2 +- 10 files changed, 35 insertions(+), 18 deletions(-) diff --git a/ReleaseNotes.md b/ReleaseNotes.md index 04bd7c6e5..acbd579a0 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -11,6 +11,23 @@ additional questions or comments. 
Note : The agent version(s) below has dates (ciprod), which indicate the agent build dates (not release dates) +### 04/22/2021 - +##### Version microsoft/oms:ciprod04222021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod04222021 (linux) +##### Version microsoft/oms:win-ciprod04222021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod04222021 (windows) +##### Code change log +- Bug fixes for metrics cpuUsagePercentage and memoryWorkingSetPercentage for windows nodes +- Added metrics for threshold violation +- Made Job completion metric configurable +- Udated default buffer sizes in fluent-bit +- Updated recommended alerts +- Fixed bug where logs written before agent starts up were not collected +- Fixed bug which kept agent logs from being rotated +- Bug fix for Windows Containerd container log collection +- Bug fixes +- Doc updates +- Minor telemetry changes + + ### 03/26/2021 - ##### Version microsoft/oms:ciprod03262021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod03262021 (linux) ##### Version microsoft/oms:win-ciprod03262021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod03262021 (windows) diff --git a/build/version b/build/version index 83a0a174b..16a43604a 100644 --- a/build/version +++ b/build/version @@ -2,11 +2,11 @@ # Build Version Information -CONTAINER_BUILDVERSION_MAJOR=14 +CONTAINER_BUILDVERSION_MAJOR=15 CONTAINER_BUILDVERSION_MINOR=0 CONTAINER_BUILDVERSION_PATCH=0 CONTAINER_BUILDVERSION_BUILDNR=0 -CONTAINER_BUILDVERSION_DATE=20210326 +CONTAINER_BUILDVERSION_DATE=20210422 CONTAINER_BUILDVERSION_STATUS=Developer_Build #-------------------------------- End of File ----------------------------------- diff --git a/charts/azuremonitor-containers/Chart.yaml b/charts/azuremonitor-containers/Chart.yaml index 9c8014ed0..00f3f49ed 100644 --- a/charts/azuremonitor-containers/Chart.yaml +++ b/charts/azuremonitor-containers/Chart.yaml @@ -2,7 +2,7 @@ apiVersion: v1 appVersion: 7.0.0-1 description: Helm chart for deploying Azure Monitor container monitoring agent in Kubernetes name: azuremonitor-containers -version: 2.8.2 +version: 2.8.3 kubeVersion: "^1.10.0-0" keywords: - monitoring diff --git a/charts/azuremonitor-containers/values.yaml b/charts/azuremonitor-containers/values.yaml index 4b539546b..9dd5317a4 100644 --- a/charts/azuremonitor-containers/values.yaml +++ b/charts/azuremonitor-containers/values.yaml @@ -21,10 +21,10 @@ Azure: omsagent: image: repo: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod" - tag: "ciprod03262021" - tagWindows: "win-ciprod03262021" + tag: "ciprod04222021" + tagWindows: "win-ciprod04222021" pullPolicy: IfNotPresent - dockerProviderVersion: "14.0.0-0" + dockerProviderVersion: "15.0.0-0" agentVersion: "1.10.0.1" # The priority used by the omsagent priority class for the daemonset pods diff --git a/kubernetes/linux/Dockerfile b/kubernetes/linux/Dockerfile index 76b8622b4..d5ece4509 100644 --- a/kubernetes/linux/Dockerfile +++ b/kubernetes/linux/Dockerfile @@ -2,7 +2,7 @@ FROM ubuntu:18.04 MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="Azure Monitor for containers" -ARG IMAGE_TAG=ciprod03262021 +ARG IMAGE_TAG=ciprod04222021 ENV AGENT_VERSION ${IMAGE_TAG} ENV tmpdir /opt ENV APPLICATIONINSIGHTS_AUTH NzAwZGM5OGYtYTdhZC00NThkLWI5NWMtMjA3ZjM3NmM3YmRi diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index fc3428a26..feea3f29a 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ 
-358,7 +358,7 @@ spec: tier: node annotations: agentVersion: "1.10.0.1" - dockerProviderVersion: "14.0.0-0" + dockerProviderVersion: "15.0.0-0" schema-versions: "v1" spec: serviceAccountName: omsagent @@ -368,7 +368,7 @@ spec: value: "3" containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod03262021" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod04222021" imagePullPolicy: IfNotPresent resources: limits: @@ -446,7 +446,7 @@ spec: timeoutSeconds: 15 #Only in sidecar scraping mode - name: omsagent-prometheus - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod03262021" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod04222021" imagePullPolicy: IfNotPresent resources: limits: @@ -583,13 +583,13 @@ spec: rsName: "omsagent-rs" annotations: agentVersion: "1.10.0.1" - dockerProviderVersion: "14.0.0-0" + dockerProviderVersion: "15.0.0-0" schema-versions: "v1" spec: serviceAccountName: omsagent containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod03262021" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod04222021" imagePullPolicy: IfNotPresent resources: limits: @@ -750,7 +750,7 @@ spec: tier: node-win annotations: agentVersion: "1.10.0.1" - dockerProviderVersion: "14.0.0-0" + dockerProviderVersion: "15.0.0-0" schema-versions: "v1" spec: serviceAccountName: omsagent @@ -760,7 +760,7 @@ spec: value: "3" containers: - name: omsagent-win - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod03262021" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod04222021" imagePullPolicy: IfNotPresent resources: limits: diff --git a/kubernetes/windows/Dockerfile b/kubernetes/windows/Dockerfile index e4ace417a..fefd089a8 100644 --- a/kubernetes/windows/Dockerfile +++ b/kubernetes/windows/Dockerfile @@ -3,7 +3,7 @@ MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="Azure Monitor for containers" -ARG IMAGE_TAG=win-ciprod03262021 +ARG IMAGE_TAG=win-ciprod04222021 # Do not split this into multiple RUN! # Docker creates a layer for every RUN-Statement diff --git a/scripts/onboarding/managed/enable-monitoring.ps1 b/scripts/onboarding/managed/enable-monitoring.ps1 index baf547497..828d061ac 100644 --- a/scripts/onboarding/managed/enable-monitoring.ps1 +++ b/scripts/onboarding/managed/enable-monitoring.ps1 @@ -64,7 +64,7 @@ $isUsingServicePrincipal = $false # released chart version in mcr $mcr = "mcr.microsoft.com" -$mcrChartVersion = "2.8.2" +$mcrChartVersion = "2.8.3" $mcrChartRepoPath = "azuremonitor/containerinsights/preview/azuremonitor-containers" $helmLocalRepoName = "." $omsAgentDomainName="opinsights.azure.com" diff --git a/scripts/onboarding/managed/enable-monitoring.sh b/scripts/onboarding/managed/enable-monitoring.sh index 9747d932d..f27f944fd 100644 --- a/scripts/onboarding/managed/enable-monitoring.sh +++ b/scripts/onboarding/managed/enable-monitoring.sh @@ -44,7 +44,7 @@ defaultAzureCloud="AzureCloud" omsAgentDomainName="opinsights.azure.com" # released chart version in mcr -mcrChartVersion="2.8.2" +mcrChartVersion="2.8.3" mcr="mcr.microsoft.com" mcrChartRepoPath="azuremonitor/containerinsights/preview/azuremonitor-containers" helmLocalRepoName="." 
diff --git a/scripts/onboarding/managed/upgrade-monitoring.sh b/scripts/onboarding/managed/upgrade-monitoring.sh index 1cf7b5c97..5456a7072 100644 --- a/scripts/onboarding/managed/upgrade-monitoring.sh +++ b/scripts/onboarding/managed/upgrade-monitoring.sh @@ -20,7 +20,7 @@ set -e set -o pipefail # released chart version for Azure Arc enabled Kubernetes public preview -mcrChartVersion="2.8.2" +mcrChartVersion="2.8.3" mcr="mcr.microsoft.com" mcrChartRepoPath="azuremonitor/containerinsights/preview/azuremonitor-containers" From 1b2da4adb4a8beac41af1cf2fd093872a52c95c6 Mon Sep 17 00:00:00 2001 From: David Michelman Date: Fri, 23 Apr 2021 09:29:10 -0700 Subject: [PATCH 097/194] 1m, 1m, 1s by default (#543) * 1m, 1m, 1s by default * setting default through a different method --- .../installer/scripts/td-agent-bit-conf-customizer.rb | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/build/common/installer/scripts/td-agent-bit-conf-customizer.rb b/build/common/installer/scripts/td-agent-bit-conf-customizer.rb index 35b71e550..ea1536866 100644 --- a/build/common/installer/scripts/td-agent-bit-conf-customizer.rb +++ b/build/common/installer/scripts/td-agent-bit-conf-customizer.rb @@ -3,7 +3,9 @@ @td_agent_bit_conf_path = "/etc/opt/microsoft/docker-cimprov/td-agent-bit.conf" -@default_service_interval = "15" +@default_service_interval = "1" +@default_buffer_chunk_size = "1" +@default_buffer_max_size = "1" def is_number?(value) true if Integer(value) rescue false @@ -21,9 +23,9 @@ def substituteFluentBitPlaceHolders serviceInterval = (!interval.nil? && is_number?(interval) && interval.to_i > 0 ) ? interval : @default_service_interval serviceIntervalSetting = "Flush " + serviceInterval - tailBufferChunkSize = (!bufferChunkSize.nil? && is_number?(bufferChunkSize) && bufferChunkSize.to_i > 0) ? bufferChunkSize : nil + tailBufferChunkSize = (!bufferChunkSize.nil? && is_number?(bufferChunkSize) && bufferChunkSize.to_i > 0) ? bufferChunkSize : @default_buffer_chunk_size - tailBufferMaxSize = (!bufferMaxSize.nil? && is_number?(bufferMaxSize) && bufferMaxSize.to_i > 0) ? bufferMaxSize : nil + tailBufferMaxSize = (!bufferMaxSize.nil? && is_number?(bufferMaxSize) && bufferMaxSize.to_i > 0) ? bufferMaxSize : @default_buffer_max_size = "1" if ((!tailBufferChunkSize.nil? && tailBufferMaxSize.nil?) || (!tailBufferChunkSize.nil? && !tailBufferMaxSize.nil? 
&& tailBufferChunkSize.to_i > tailBufferMaxSize.to_i)) puts "config:warn buffer max size must be greater or equal to chunk size" From 83e5816d6f3d92a6ff4dad32ac68694274714d23 Mon Sep 17 00:00:00 2001 From: David Michelman Date: Wed, 12 May 2021 16:15:04 -0700 Subject: [PATCH 098/194] David/aad stage 1 release (#556) * update to latest omsagent, add eastus2 to mdsd regions * copied oneagent bits to a CI repository release * mdsd inmem mode * yaml for cl scale test * yaml for cl scale test * reverting dockerProviderVersion version to 15.0.0 * prepping for release (updated image version, dockerProviderVersion, and release notes * container log scaletest yamls * forgot to update image version in chart * fixing windows tag in dockerfile, changing release notes wording * missed windows tag in one more place * forgot to change the windows dockerProviderVersion back Co-authored-by: Ganga Mahesh Siddem --- ReleaseNotes.md | 9 ++- .../linux/installer/scripts/livenessprobe.sh | 2 +- build/version | 4 +- kubernetes/linux/Dockerfile | 2 +- kubernetes/linux/envmdsd | 4 ++ kubernetes/linux/main.sh | 2 +- kubernetes/linux/setup.sh | 4 +- kubernetes/omsagent.yaml | 12 ++-- .../400logspersec-2klogentrysize.yaml | 60 +++++++++++++++++++ .../400logspersec-5klogentrysize.yaml | 60 +++++++++++++++++++ .../ci-log-scale-4kpersec-5klogline.yaml | 60 +++++++++++++++++++ 11 files changed, 205 insertions(+), 14 deletions(-) create mode 100644 test/containerlog-scale-tests/400logspersec-2klogentrysize.yaml create mode 100644 test/containerlog-scale-tests/400logspersec-5klogentrysize.yaml create mode 100644 test/containerlog-scale-tests/ci-log-scale-4kpersec-5klogline.yaml diff --git a/ReleaseNotes.md b/ReleaseNotes.md index acbd579a0..979eb968b 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -11,6 +11,14 @@ additional questions or comments. Note : The agent version(s) below has dates (ciprod), which indicate the agent build dates (not release dates) + +### 05/12/2021 - +##### Version microsoft/oms:ciprod00512021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod00512021 (linux) +##### No Windows changes with this release, win-ciprod04222021 still current. +##### Code change log +- Upgrading oneagent to version 1.8 (only for Linux) +- Enabling oneagent for container logs for East US 2 + ### 04/22/2021 - ##### Version microsoft/oms:ciprod04222021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod04222021 (linux) ##### Version microsoft/oms:win-ciprod04222021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod04222021 (windows) @@ -27,7 +35,6 @@ Note : The agent version(s) below has dates (ciprod), which indicate t - Doc updates - Minor telemetry changes - ### 03/26/2021 - ##### Version microsoft/oms:ciprod03262021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod03262021 (linux) ##### Version microsoft/oms:win-ciprod03262021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod03262021 (windows) diff --git a/build/linux/installer/scripts/livenessprobe.sh b/build/linux/installer/scripts/livenessprobe.sh index e3b0fc28e..198b4e87f 100644 --- a/build/linux/installer/scripts/livenessprobe.sh +++ b/build/linux/installer/scripts/livenessprobe.sh @@ -10,7 +10,7 @@ fi #optionally test to exit non zero value if oneagent is not running if [ -e "/opt/AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE_V2" ]; then - (ps -ef | grep "mdsd -l" | grep -v "grep") + (ps -ef | grep "mdsd" | grep -v "grep") if [ $? 
-ne 0 ] then echo "oneagent is not running" > /dev/termination-log diff --git a/build/version b/build/version index 16a43604a..81bb808f5 100644 --- a/build/version +++ b/build/version @@ -3,10 +3,10 @@ # Build Version Information CONTAINER_BUILDVERSION_MAJOR=15 -CONTAINER_BUILDVERSION_MINOR=0 +CONTAINER_BUILDVERSION_MINOR=1 CONTAINER_BUILDVERSION_PATCH=0 CONTAINER_BUILDVERSION_BUILDNR=0 -CONTAINER_BUILDVERSION_DATE=20210422 +CONTAINER_BUILDVERSION_DATE=20210512 CONTAINER_BUILDVERSION_STATUS=Developer_Build #-------------------------------- End of File ----------------------------------- diff --git a/kubernetes/linux/Dockerfile b/kubernetes/linux/Dockerfile index d5ece4509..822e52bc8 100644 --- a/kubernetes/linux/Dockerfile +++ b/kubernetes/linux/Dockerfile @@ -2,7 +2,7 @@ FROM ubuntu:18.04 MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="Azure Monitor for containers" -ARG IMAGE_TAG=ciprod04222021 +ARG IMAGE_TAG=ciprod05122021 ENV AGENT_VERSION ${IMAGE_TAG} ENV tmpdir /opt ENV APPLICATIONINSIGHTS_AUTH NzAwZGM5OGYtYTdhZC00NThkLWI5NWMtMjA3ZjM3NmM3YmRi diff --git a/kubernetes/linux/envmdsd b/kubernetes/linux/envmdsd index e4886012e..3f834bfb8 100644 --- a/kubernetes/linux/envmdsd +++ b/kubernetes/linux/envmdsd @@ -12,3 +12,7 @@ export HOSTNAME_OVERRIDE="${NODE_NAME}" export MDSD_TCMALLOC_RELEASE_FREQ_SEC=1 export MDSD_COMPRESSION_ALGORITHM=LZ4 export SSL_CERT_DIR="/etc/ssl/certs" +# increase the size of msgpack items mdsd will accept, otherwise they will be silently dropped. These values were arbitrairly chosen to be 10 or 100 times larger than the defaults. +export MDSD_MSGPACK_ARRAY_SIZE_ITEMS=10000000 +export MDSD_MSGPACK_MAP_SIZE_ITEMS=10000000 +export MDSD_MSGPACK_NESTING_LEVEL=100 diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index 81db6f3a4..f03318ad1 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -581,7 +581,7 @@ if [ ! -e "/etc/config/kube.conf" ] && [ "${CONTAINER_TYPE}" != "PrometheusSidec dpkg -l | grep mdsd | awk '{print $2 " " $3}' echo "starting mdsd ..." 
- mdsd -l -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos & + mdsd -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos & touch /opt/AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE_V2 fi diff --git a/kubernetes/linux/setup.sh b/kubernetes/linux/setup.sh index ee3756964..f065cc165 100644 --- a/kubernetes/linux/setup.sh +++ b/kubernetes/linux/setup.sh @@ -31,8 +31,8 @@ mv $TMPDIR/omsbundle* $TMPDIR/omsbundle /usr/bin/dpkg -i $TMPDIR/omsbundle/110/omsagent*.deb #/usr/bin/dpkg -i $TMPDIR/omsbundle/100/omsconfig*.deb -#install oneagent - Official bits (10/18) -wget https://github.com/microsoft/Docker-Provider/releases/download/10182020-oneagent/azure-mdsd_1.5.126-build.master.99_x86_64.deb +#install oneagent - Official bits (05/2021) +wget https://github.com/microsoft/Docker-Provider/releases/download/05112021-oneagent/azure-mdsd_1.8.0-build.master.189_x86_64.deb /usr/bin/dpkg -i $TMPDIR/azure-mdsd*.deb cp -f $TMPDIR/mdsd.xml /etc/mdsd.d cp -f $TMPDIR/envmdsd /etc/mdsd.d diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index feea3f29a..bf94490ba 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -358,7 +358,7 @@ spec: tier: node annotations: agentVersion: "1.10.0.1" - dockerProviderVersion: "15.0.0-0" + dockerProviderVersion: "15.1.0-0" schema-versions: "v1" spec: serviceAccountName: omsagent @@ -368,7 +368,7 @@ spec: value: "3" containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod04222021" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod05122021" imagePullPolicy: IfNotPresent resources: limits: @@ -399,7 +399,7 @@ spec: - name: USER_ASSIGNED_IDENTITY_CLIENT_ID value: "" - name: AZMON_CONTAINERLOGS_ONEAGENT_REGIONS - value: "koreacentral,norwayeast" + value: "koreacentral,norwayeast,eastus2" securityContext: privileged: true ports: @@ -446,7 +446,7 @@ spec: timeoutSeconds: 15 #Only in sidecar scraping mode - name: omsagent-prometheus - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod04222021" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod05122021" imagePullPolicy: IfNotPresent resources: limits: @@ -583,13 +583,13 @@ spec: rsName: "omsagent-rs" annotations: agentVersion: "1.10.0.1" - dockerProviderVersion: "15.0.0-0" + dockerProviderVersion: "15.1.0-0" schema-versions: "v1" spec: serviceAccountName: omsagent containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod04222021" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod05122021" imagePullPolicy: IfNotPresent resources: limits: diff --git a/test/containerlog-scale-tests/400logspersec-2klogentrysize.yaml b/test/containerlog-scale-tests/400logspersec-2klogentrysize.yaml new file mode 100644 index 000000000..cc3dd5259 --- /dev/null +++ b/test/containerlog-scale-tests/400logspersec-2klogentrysize.yaml @@ -0,0 +1,60 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: logs-400persec-2kentrysize +spec: + parallelism: 1 + completions: 1 + template: + metadata: + name: logs-400persec-2kentrysize + spec: + volumes: + - name: logs-400persec-2kentrysize-scripts-volume + configMap: + name: logs-400persec-test-scripts + containers: + - name: logs-400persec-2kentrysize + image: ubuntu + volumeMounts: + - mountPath: /logs-400persec-test-scripts + name: logs-400persec-2kentrysize-scripts-volume + env: + - name: HOME + value: /tmp + command: + - /bin/sh + 
- -c + - | + echo "scripts in /logs-400persec-test-scripts" + ls -lh /logs-400persec-test-scripts + echo "copy scripts to /tmp" + cp /logs-400persec-test-scripts/*.sh /tmp + echo "apply 'chmod +x' to /tmp/*.sh" + chmod +x /tmp/*.sh + echo "script.sh in /tmp" + ls -lh /tmp + /tmp/script.sh + restartPolicy: Never +--- +apiVersion: v1 +items: +- apiVersion: v1 + data: + script.sh: | + #!/bin/bash + logentry='' + for var in {1..400..1} + do + logentry="${logentry}Test-" + done + for var in {1..200000..1} + do + echo $(date "+%Y/%m/%d %H:%M:%S.%3N") ${var}: $logentry + done + kind: ConfigMap + metadata: + creationTimestamp: null + name: logs-400persec-test-scripts +kind: List +metadata: {} diff --git a/test/containerlog-scale-tests/400logspersec-5klogentrysize.yaml b/test/containerlog-scale-tests/400logspersec-5klogentrysize.yaml new file mode 100644 index 000000000..42188631a --- /dev/null +++ b/test/containerlog-scale-tests/400logspersec-5klogentrysize.yaml @@ -0,0 +1,60 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: logs-400persec-5kentrysize +spec: + parallelism: 1 + completions: 1 + template: + metadata: + name: logs-400persec-5kentrysize + spec: + volumes: + - name: logs-400persec-5kentrysize-scripts-volume + configMap: + name: logs-400persec-5kentrysize-test-scripts + containers: + - name: logs-400persec-5kentrysize + image: ubuntu + volumeMounts: + - mountPath: /logs-400persec-5kentrysize-test-scripts + name: logs-400persec-5kentrysize-scripts-volume + env: + - name: HOME + value: /tmp + command: + - /bin/sh + - -c + - | + echo "scripts in /logs-400persec-5kentrysize-test-scripts" + ls -lh /logs-400persec-5kentrysize-test-scripts + echo "copy scripts to /tmp" + cp /logs-400persec-5kentrysize-test-scripts/*.sh /tmp + echo "apply 'chmod +x' to /tmp/*.sh" + chmod +x /tmp/*.sh + echo "script.sh in /tmp" + ls -lh /tmp + /tmp/script.sh + restartPolicy: Never +--- +apiVersion: v1 +items: +- apiVersion: v1 + data: + script.sh: | + #!/bin/bash + logentry='' + for var in {1..1024..1} + do + logentry="${logentry}Test-" + done + for var in {1..200000..1} + do + echo $(date "+%Y/%m/%d %H:%M:%S.%3N") ${var}: $logentry + done + kind: ConfigMap + metadata: + creationTimestamp: null + name: logs-400persec-5kentrysize-test-scripts +kind: List +metadata: {} diff --git a/test/containerlog-scale-tests/ci-log-scale-4kpersec-5klogline.yaml b/test/containerlog-scale-tests/ci-log-scale-4kpersec-5klogline.yaml new file mode 100644 index 000000000..ff619a822 --- /dev/null +++ b/test/containerlog-scale-tests/ci-log-scale-4kpersec-5klogline.yaml @@ -0,0 +1,60 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: ci-log-scale +spec: + parallelism: 1 + completions: 1 + template: + metadata: + name: ci-log-scale + spec: + volumes: + - name: ci-log-scale-scripts-volume + configMap: + name: test-scripts + containers: + - name: ci-log-scale + image: ubuntu + volumeMounts: + - mountPath: /test-scripts + name: ci-log-scale-scripts-volume + env: + - name: HOME + value: /tmp + command: + - /bin/sh + - -c + - | + echo "scripts in /test-scripts" + ls -lh /test-scripts + echo "copy scripts to /tmp" + cp /test-scripts/*.sh /tmp + echo "apply 'chmod +x' to /tmp/*.sh" + chmod +x /tmp/*.sh + echo "script.sh in /tmp" + ls -lh /tmp + /tmp/script.sh + restartPolicy: Never +--- +apiVersion: v1 +items: +- apiVersion: v1 + data: + script.sh: | + #!/bin/bash + logentry='' + for var in {1..1024..1} + do + logentry="${logentry}Test-" + done + for var in {1..200000..1} + do + echo $(date "+%Y/%m/%d %H:%M:%S.%3N") ${var}: $logentry + 
done + kind: ConfigMap + metadata: + creationTimestamp: null + name: test-scripts +kind: List +metadata: {} From 8beabe3cef2fdc4a60e79d3866bf9d4e4723f0a2 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Wed, 12 May 2021 17:16:21 -0700 Subject: [PATCH 099/194] Update ReleaseNotes.md (#558) fix imagetag in the release notes --- ReleaseNotes.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ReleaseNotes.md b/ReleaseNotes.md index 979eb968b..b4c0d6ba4 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -13,7 +13,7 @@ Note : The agent version(s) below has dates (ciprod), which indicate t ### 05/12/2021 - -##### Version microsoft/oms:ciprod00512021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod00512021 (linux) +##### Version microsoft/oms:ciprod00512021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod05122021 (linux) ##### No Windows changes with this release, win-ciprod04222021 still current. ##### Code change log - Upgrading oneagent to version 1.8 (only for Linux) From 3805f44d89abd9034756c41174da5f1ba58e9500 Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Wed, 19 May 2021 07:09:19 -0700 Subject: [PATCH 100/194] Add wait time for telegraf and also force mdm egress to use tls 1.2 (#560) * Add wait time for telegraf and also force mdm egress to use tls 1.2 * add wait for all telegraf dependencies across all containers (ds & rs) * remove ssl change so we dont include as part of the other fix until we test with att nodes. --- kubernetes/linux/main.sh | 52 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index f03318ad1..c7d939034 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -1,5 +1,43 @@ #!/bin/bash +waitforlisteneronTCPport() { + local sleepdurationsecs=1 + local totalsleptsecs=0 + local port=$1 + local waittimesecs=$2 + local numeric='^[0-9]+$' + local varlistener="" + + if [ -z "$1" ] || [ -z "$2" ]; then + echo "${FUNCNAME[0]} called with incorrect arguments<$1 , $2>. Required arguments <#port, #wait-time-in-seconds>" + return -1 + else + + if [[ $port =~ $numeric ]] && [[ $waittimesecs =~ $numeric ]]; then + #local varlistener=$(netstat -lnt | awk '$6 == "LISTEN" && $4 ~ ":25228$"') + while true + do + if [ $totalsleptsecs -gt $waittimesecs ]; then + echo "${FUNCNAME[0]} giving up waiting for listener on port:$port after $totalsleptsecs secs" + return 1 + fi + varlistener=$(netstat -lnt | awk '$6 == "LISTEN" && $4 ~ ":'"$port"'$"') + if [ -z "$varlistener" ]; then + #echo "${FUNCNAME[0]} waiting for $sleepdurationsecs more sec for listener on port:$port ..." + sleep $sleepdurationsecs + totalsleptsecs=$(($totalsleptsecs+1)) + else + echo "${FUNCNAME[0]} found listener on port:$port in $totalsleptsecs secs" + return 0 + fi + done + else + echo "${FUNCNAME[0]} called with non-numeric arguments<$1 , $2>. Required arguments <#port, #wait-time-in-seconds>" + return -1 + fi + fi +} + if [ -e "/etc/config/kube.conf" ]; then cat /etc/config/kube.conf > /etc/opt/microsoft/omsagent/sysconf/omsagent.d/container.conf elif [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then @@ -689,6 +727,20 @@ echo "export HOST_ETC=/hostfs/etc" >> ~/.bashrc export HOST_VAR=/hostfs/var echo "export HOST_VAR=/hostfs/var" >> ~/.bashrc +if [ ! -e "/etc/config/kube.conf" ]; then + if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then + echo "checking for listener on tcp #25229 and waiting for 30 secs if not.." 
+ waitforlisteneronTCPport 25229 30 + else + echo "checking for listener on tcp #25226 and waiting for 30 secs if not.." + waitforlisteneronTCPport 25226 30 + echo "checking for listener on tcp #25228 and waiting for 30 secs if not.." + waitforlisteneronTCPport 25228 30 + fi +else + echo "checking for listener on tcp #25226 and waiting for 30 secs if not.." + waitforlisteneronTCPport 25226 30 +fi #start telegraf /opt/telegraf --config $telegrafConfFile & From 7c5087f7f57d8b3e3a1f430ab3dfca19f8f888a9 Mon Sep 17 00:00:00 2001 From: David Michelman Date: Wed, 19 May 2021 16:22:27 -0700 Subject: [PATCH 101/194] partially disabled telegraf liveness probe check, we'll still have telemetry but the probe won't fail if telegraf isn't running (#561) --- build/linux/installer/scripts/livenessprobe.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build/linux/installer/scripts/livenessprobe.sh b/build/linux/installer/scripts/livenessprobe.sh index 198b4e87f..5e1261e7e 100644 --- a/build/linux/installer/scripts/livenessprobe.sh +++ b/build/linux/installer/scripts/livenessprobe.sh @@ -30,9 +30,9 @@ fi (ps -ef | grep telegraf | grep -v "grep") if [ $? -ne 0 ] then - echo "Telegraf is not running" > /dev/termination-log + # echo "Telegraf is not running" > /dev/termination-log echo "Telegraf is not running (controller: ${CONTROLLER_TYPE}, container type: ${CONTAINER_TYPE})" > /dev/write-to-traces # this file is tailed and sent to traces - exit 1 + # exit 1 fi if [ -s "inotifyoutput.txt" ] From 0d33489aaef5f0824b94c0b644ebe39c1501d576 Mon Sep 17 00:00:00 2001 From: David Michelman Date: Thu, 20 May 2021 13:08:53 -0700 Subject: [PATCH 102/194] changes for 05202021 release (#563) * changes for 05202021 release * fixed typos --- ReleaseNotes.md | 8 +++++++- build/version | 2 +- kubernetes/linux/Dockerfile | 2 +- kubernetes/omsagent.yaml | 10 +++++----- 4 files changed, 14 insertions(+), 8 deletions(-) diff --git a/ReleaseNotes.md b/ReleaseNotes.md index b4c0d6ba4..d7d6de6af 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -11,9 +11,15 @@ additional questions or comments. Note : The agent version(s) below has dates (ciprod), which indicate the agent build dates (not release dates) +### 05/20/2021 - +##### Version microsoft/oms:ciprod05202021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod05202021 (linux) +##### No Windows changes with this release, win-ciprod04222021 still current. +##### Code change log +- Telegraf now waits 30 seconds on startup for network connections to complete (Linux only) +- Change adding telegraf to the liveness probe reverted (Linux only) ### 05/12/2021 - -##### Version microsoft/oms:ciprod00512021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod05122021 (linux) +##### Version microsoft/oms:ciprod05122021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod05122021 (linux) ##### No Windows changes with this release, win-ciprod04222021 still current. 
##### Code change log - Upgrading oneagent to version 1.8 (only for Linux) diff --git a/build/version b/build/version index 81bb808f5..d70d1f9bc 100644 --- a/build/version +++ b/build/version @@ -3,7 +3,7 @@ # Build Version Information CONTAINER_BUILDVERSION_MAJOR=15 -CONTAINER_BUILDVERSION_MINOR=1 +CONTAINER_BUILDVERSION_MINOR=2 CONTAINER_BUILDVERSION_PATCH=0 CONTAINER_BUILDVERSION_BUILDNR=0 CONTAINER_BUILDVERSION_DATE=20210512 diff --git a/kubernetes/linux/Dockerfile b/kubernetes/linux/Dockerfile index 822e52bc8..3ad3cd315 100644 --- a/kubernetes/linux/Dockerfile +++ b/kubernetes/linux/Dockerfile @@ -2,7 +2,7 @@ FROM ubuntu:18.04 MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="Azure Monitor for containers" -ARG IMAGE_TAG=ciprod05122021 +ARG IMAGE_TAG=ciprod05202021 ENV AGENT_VERSION ${IMAGE_TAG} ENV tmpdir /opt ENV APPLICATIONINSIGHTS_AUTH NzAwZGM5OGYtYTdhZC00NThkLWI5NWMtMjA3ZjM3NmM3YmRi diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index bf94490ba..6ff02c941 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -358,7 +358,7 @@ spec: tier: node annotations: agentVersion: "1.10.0.1" - dockerProviderVersion: "15.1.0-0" + dockerProviderVersion: "15.2.0-0" schema-versions: "v1" spec: serviceAccountName: omsagent @@ -368,7 +368,7 @@ spec: value: "3" containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod05122021" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod05202021" imagePullPolicy: IfNotPresent resources: limits: @@ -446,7 +446,7 @@ spec: timeoutSeconds: 15 #Only in sidecar scraping mode - name: omsagent-prometheus - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod05122021" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod05202021" imagePullPolicy: IfNotPresent resources: limits: @@ -583,13 +583,13 @@ spec: rsName: "omsagent-rs" annotations: agentVersion: "1.10.0.1" - dockerProviderVersion: "15.1.0-0" + dockerProviderVersion: "15.2.0-0" schema-versions: "v1" spec: serviceAccountName: omsagent containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod05122021" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod05202021" imagePullPolicy: IfNotPresent resources: limits: From 486acfd719288196b192e751e1e644d2b054df0e Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Fri, 21 May 2021 11:42:26 -0700 Subject: [PATCH 103/194] Rashmi/jedi wireserver (#566) --- kubernetes/omsagent.yaml | 11 +++++++++ kubernetes/windows/main.ps1 | 49 ++++++++++++++++++++++++++++++++++++- 2 files changed, 59 insertions(+), 1 deletion(-) diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 6ff02c941..ab6bbea9c 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -790,6 +790,9 @@ spec: fieldPath: status.hostIP - name: SIDECAR_SCRAPING_ENABLED value: "true" + # Add this only for clouds that require cert bootstrapping + - name: REQUIRES_CERT_BOOTSTRAP + value: "true" volumeMounts: - mountPath: C:\ProgramData\docker\containers name: docker-windows-containers @@ -805,6 +808,10 @@ spec: - mountPath: C:\etc\config\adx name: omsagent-adx-secret readOnly: true + # Need to mount this only for airgapped clouds - Commenting this since it wont exist in non airgapped clouds + # - mountPath: C:\ca + # name: ca-certs + # readOnly: true livenessProbe: exec: command: @@ -836,6 +843,10 @@ spec: - name: 
docker-windows-kuberenetes-container-logs hostPath: path: C:\var + # Need to mount this only for airgapped clouds - Commenting this since it wont exist in non airgapped clouds + #- name: ca-certs + # hostPath: + # path: C:\ca - name: docker-windows-containers hostPath: path: C:\ProgramData\docker\containers diff --git a/kubernetes/windows/main.ps1 b/kubernetes/windows/main.ps1 index 95cba2579..baf95fca4 100644 --- a/kubernetes/windows/main.ps1 +++ b/kubernetes/windows/main.ps1 @@ -119,6 +119,25 @@ function Set-EnvironmentVariables { $env:AZMON_AGENT_CFG_SCHEMA_VERSION } + # Need to do this before the SA fetch for AI key for airgapped clouds so that it is not overwritten with defaults. + $appInsightsAuth = [System.Environment]::GetEnvironmentVariable("APPLICATIONINSIGHTS_AUTH", "process") + if (![string]::IsNullOrEmpty($appInsightsAuth)) { + [System.Environment]::SetEnvironmentVariable("APPLICATIONINSIGHTS_AUTH", $appInsightsAuth, "machine") + Write-Host "Successfully set environment variable APPLICATIONINSIGHTS_AUTH - $($appInsightsAuth) for target 'machine'..." + } + else { + Write-Host "Failed to set environment variable APPLICATIONINSIGHTS_AUTH for target 'machine' since it is either null or empty" + } + + $appInsightsEndpoint = [System.Environment]::GetEnvironmentVariable("APPLICATIONINSIGHTS_ENDPOINT", "process") + if (![string]::IsNullOrEmpty($appInsightsEndpoint)) { + [System.Environment]::SetEnvironmentVariable("APPLICATIONINSIGHTS_ENDPOINT", $appInsightsEndpoint, "machine") + Write-Host "Successfully set environment variable APPLICATIONINSIGHTS_ENDPOINT - $($appInsightsEndpoint) for target 'machine'..." + } + else { + Write-Host "Failed to set environment variable APPLICATIONINSIGHTS_ENDPOINT for target 'machine' since it is either null or empty" + } + # Check if the instrumentation key needs to be fetched from a storage account (as in airgapped clouds) $aiKeyURl = [System.Environment]::GetEnvironmentVariable('APPLICATIONINSIGHTS_AUTH_URL') if ($aiKeyURl) { @@ -161,7 +180,6 @@ function Set-EnvironmentVariables { [System.Environment]::SetEnvironmentVariable("TELEMETRY_APPLICATIONINSIGHTS_KEY", $aiKeyDecoded, "Process") [System.Environment]::SetEnvironmentVariable("TELEMETRY_APPLICATIONINSIGHTS_KEY", $aiKeyDecoded, "Machine") - # run config parser ruby /opt/omsagentwindows/scripts/ruby/tomlparser.rb .\setenv.ps1 @@ -404,12 +422,41 @@ function Test-CertificatePath { } } +function Bootstrap-CACertificates { + try { + # This is required when the root CA certs are different for some clouds. + $certMountPath = "C:\ca" + Get-ChildItem $certMountPath | + Foreach-Object { + $absolutePath=$_.FullName + Write-Host "cert path: $($absolutePath)" + Import-Certificate -FilePath $absolutePath -CertStoreLocation 'Cert:\LocalMachine\Root' -Verbose + } + } + catch { + $e = $_.Exception + Write-Host $e + Write-Host "exception occured in Bootstrap-CACertificates..." 
+ } +} + Start-Transcript -Path main.txt Remove-WindowsServiceIfItExists "fluentdwinaks" Set-EnvironmentVariables Start-FileSystemWatcher +#Bootstrapping CA certs for non public clouds and AKS clusters +$aksResourceId = [System.Environment]::GetEnvironmentVariable("AKS_RESOURCE_ID") +$requiresCertBootstrap = [System.Environment]::GetEnvironmentVariable("REQUIRES_CERT_BOOTSTRAP") +if (![string]::IsNullOrEmpty($requiresCertBootstrap) -and ` + $requiresCertBootstrap.ToLower() -eq 'true' -and ` + ![string]::IsNullOrEmpty($aksResourceId) -and ` + $aksResourceId.ToLower().Contains("/microsoft.containerservice/managedclusters/")) +{ + Bootstrap-CACertificates +} + Generate-Certificates Test-CertificatePath Start-Fluent-Telegraf From 0fa350e66edeb65b26a6354a425ad2322296f7d8 Mon Sep 17 00:00:00 2001 From: saaror <31900410+saaror@users.noreply.github.com> Date: Fri, 21 May 2021 16:02:58 -0700 Subject: [PATCH 104/194] Update ReadMe.md (#565) * Update ReadMe.md * Update ReadMe.md Included feedback from OSM team and Fixed --- Documentation/OSMPrivatePreview/ReadMe.md | 37 ++++++++++++++++++++--- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/Documentation/OSMPrivatePreview/ReadMe.md b/Documentation/OSMPrivatePreview/ReadMe.md index aa90c7413..da125a35c 100644 --- a/Documentation/OSMPrivatePreview/ReadMe.md +++ b/Documentation/OSMPrivatePreview/ReadMe.md @@ -1,15 +1,17 @@ Note - This is private preview. For any support issues, please reach out to us at [askcoin@microsoft.com](mailto:askcoin@microsoft.com). Please don't open a support ticket. +This private preview supports Open Service Mesh on [AKS](https://docs.microsoft.com/azure/aks/servicemesh-osm-about) & Azure [Arc on k8s](http://docs.microsoft.com/azure/azure-arc/kubernetes/tutorial-arc-enabled-osm). + # Azure Monitor Container Insights Open Service Mesh Monitoring Azure Monitor container insights now supporting preview of [Open Service Mesh(OSM)](https://docs.microsoft.com/azure/aks/servicemesh-osm-about) Monitoring. As part of this support, customer can: 1. Filter & view inventory of all the services that are part of your service mesh. 2. Visualize and monitor requests between services in your service mesh, with request latency, error rate & resource utilization by services. -3. Provides connection summary for OSM infrastructure running on AKS. +3. Provides connection summary for OSM infrastructure running on AKS or Azure Arc for k8s. ## How to onboard Container Insights OSM monitoring? OSM exposes Prometheus metrics which Container Insights can collect, for container insights agent to collect OSM metrics follow the following steps. - +### AKS 1. Follow this [link](https://docs.microsoft.com/en-us/azure/aks/servicemesh-osm-about?pivots=client-operating-system-linux#register-the-aks-openservicemesh-preview-feature) as a prereq before enabling the addon. 2. Enable AKS OSM addon on your @@ -27,9 +29,29 @@ osm metrics enable --namespace "test1, test2" * Download the configmap from [here](https://github.com/microsoft/Docker-Provider/blob/ci_prod/kubernetes/container-azm-ms-osmconfig.yaml) * Add the namespaces you want to monitor in configmap `monitor_namespaces = ["namespace1", "namespace2"]` * Run the following kubectl command: kubectl apply -f - * Example: `kubectl apply -f container-azm-ms-agentconfig.yaml` + * Example: `kubectl apply -f container-azm-ms-osmconfig.yaml` 4. The configuration change can take upto 15 mins to finish before taking effect, and all omsagent pods in the cluster will restart. 
The restart is a rolling restart for all omsagent pods, not all restart at the same time. +### Azure Arc for Kuberentes +This section assumes that you already have your kubernetes distribution connected via Azure Arc. If not learn more [here.](https://docs.microsoft.com/en-us/azure/azure-arc/kubernetes/quickstart-connect-cluster) + +1. Install Arc enabled Open Service mesh on your Arc cluster. Learn more [here](http://docs.microsoft.com/azure/azure-arc/kubernetes/tutorial-arc-enabled-osm#install-arc-enabled-open-service-mesh-osm-on-an-arc-enabled-kubernetes-cluster) +2. Install Azure Monitor Container Insights on Arc. If not installed already. Learn more how to install [here](https://docs.microsoft.com/azure/azure-monitor/containers/container-insights-enable-arc-enabled-clusters) +3. Ensure that prometheus_scraping is set to true in the OSM configmap. +3. Ensure that the application namespaces that you wish to be monitored are onboarded to the mesh. Follow the guidance available [here.](http://docs.microsoft.com/azure/azure-arc/kubernetes/tutorial-arc-enabled-osm#onboard-namespaces-to-the-service-mesh) +4. To enable namespace(s), download the osm client library [here](https://docs.microsoft.com/en-us/azure/aks/servicemesh-osm-about?pivots=client-operating-system-linux#osm-service-quotas-and-limits-preview) & then enable metrics on namespaces +```bash +# With osm +osm metrics enable --namespace test +osm metrics enable --namespace "test1, test2" + +``` +4. On your Azure Monitor Container Insights for Arc. + * Download the configmap from [here](https://github.com/microsoft/Docker-Provider/blob/ci_prod/kubernetes/container-azm-ms-osmconfig.yaml) + * Add the namespaces you want to monitor in configmap `monitor_namespaces = ["namespace1", "namespace2"]` + * Run the following kubectl command: kubectl apply -f + * Example: `kubectl apply -f container-azm-ms-osmconfig.yaml` +5. The configuration change can take upto 15 mins to finish before taking effect, and all omsagent pods in the cluster will restart. The restart is a rolling restart for all omsagent pods, not all restart at the same time. ## Validate the metrics flow 1. Query cluster's Log Analytics workspace InsightsMetrics table to see metrics are flowing or not @@ -41,8 +63,9 @@ InsightsMetrics ## How to consume OSM monitoring dashboard? 1. Access your AKS cluster & Container Insights through this [link.](https://aka.ms/azmon/osmux) -2. Go to reports tab and access Open Service Mesh (OSM) workbook. -3. Select the time-range & namespace to scope your services. By default, we only show services deployed by customers and we exclude internal service communication. In case you want to view that you select Show All in the filter. Please note OSM is managed service mesh, we show all internal connections for transparency. + * For **Azure Arc for k8s**, access Container Insights through this [link.](https://aka.ms/azmon/osmarcux) +3. Go to reports tab and access Open Service Mesh (OSM) workbook. +4. Select the time-range & namespace to scope your services. By default, we only show services deployed by customers and we exclude internal service communication. In case you want to view that you select Show All in the filter. Please note OSM is managed service mesh, we show all internal connections for transparency. ![alt text](https://github.com/microsoft/Docker-Provider/blob/saarorOSMdoc/Documentation/OSMPrivatePreview/Image1.jpg) ### Requests Tab @@ -51,6 +74,8 @@ InsightsMetrics 3. You can view total requests, request error rate & P90 latency. 4. 
You can drill-down to destination and view trends for HTTP error/success code, success rate, Pods resource utilization, latencies at different percentiles. +![image](https://user-images.githubusercontent.com/31900410/119195241-2e712000-ba39-11eb-8cb0-2d7d16e26d1b.png) + ### Connections Tab 1. This tab provides you a summary of all the connections between your services in Open Service Mesh. 2. Outbound connections: Total number of connections between Source and destination services. @@ -68,4 +93,6 @@ InsightsMetrics 2. When source or destination is osmcontroller we show no latency & for internal services we show no resource utilization. 3. When both prometheus scraping using pod annotations and OSM monitoring are enabled on the same set of namespaces, the default set of metrics (envoy_cluster_upstream_cx_total, envoy_cluster_upstream_cx_connect_fail, envoy_cluster_upstream_rq, envoy_cluster_upstream_rq_xx, envoy_cluster_upstream_rq_total, envoy_cluster_upstream_rq_time_bucket, envoy_cluster_upstream_cx_rx_bytes_total, envoy_cluster_upstream_cx_tx_bytes_total, envoy_cluster_upstream_cx_active) will be collected twice. You can follow [this](https://docs.microsoft.com/en-us/azure/azure-monitor/containers/container-insights-prometheus-integration#prometheus-scraping-settings) documentation to exclude these namespaces from pod annotation scraping using the setting monitor_kubernetes_pods_namespaces to work around this issue. +4. For monitoring on **Azure Arc on k8s** currently there is a separate link to access OSM workbook. We plan to have one single link to access workbook on both platforms by 10th June 2021. + This is private preview, the goal for us is to get feedback. Please feel free to reach out to us at [askcoin@microsoft.com](mailto:askcoin@microsoft.com) for any feedback and questions! 
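Taken end to end, the AKS onboarding steps above reduce to a short script. A hedged sketch, assuming `kubectl` and the `osm` CLI are installed and pointed at the right cluster; the namespace names and the `sed` edit are illustrative, and the raw URL is simply the raw-view form of the configmap link given earlier:

```bash
#!/bin/bash
# Sketch of the Container Insights OSM onboarding flow described above.
set -euo pipefail

# 1. Enable OSM metrics for the application namespaces to be monitored.
osm metrics enable --namespace "test1, test2"

# 2. Fetch the monitoring configmap, set monitor_namespaces, and apply it.
curl -sLO https://raw.githubusercontent.com/microsoft/Docker-Provider/ci_prod/kubernetes/container-azm-ms-osmconfig.yaml
sed -i 's/monitor_namespaces = .*/monitor_namespaces = ["test1", "test2"]/' \
    container-azm-ms-osmconfig.yaml
kubectl apply -f container-azm-ms-osmconfig.yaml

# 3. The change can take up to 15 minutes; omsagent pods restart in a
#    rolling fashion, so watch them come back one at a time.
kubectl get pods -n kube-system | grep omsagent
```

Once metrics are flowing, the `InsightsMetrics` query shown in the validation step above confirms collection before opening the OSM workbook.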
From c7075394ce704193a0198a5e7b93a5b6d7186054 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Fri, 21 May 2021 18:26:24 -0700 Subject: [PATCH 105/194] Gangams/aad stage2 full switch to mdsd (#559) * full switch to mdsd, upgrade to ruby v1 & omsagent removal * add odsdirect as fallback option * cleanup * cleanup * move customRegion to stage3 * updates related to containerlog route * make xml eventschema consistent * add buffer settings * address HTTPServerException deprecation in ruby 2.6 * update to official mdsd version * fix log message issue * fix pr feedback * get ridoff unused code from omscommon * fix pr feedback * fix pr feedback * clean up * clean up * fix missing conf --- build/common/installer/scripts/tomlparser.rb | 16 +- build/linux/installer/conf/container.conf | 318 ++--- build/linux/installer/conf/kube.conf | 509 +++++--- build/linux/installer/conf/out_oms.conf | 5 +- .../installer/datafiles/base_container.data | 297 +++-- build/linux/installer/datafiles/linux.data | 18 +- .../linux/installer/datafiles/linux_dpkg.data | 2 +- .../linux/installer/datafiles/linux_rpm.data | 2 +- .../linux/installer/scripts/livenessprobe.sh | 18 +- .../scripts/tomlparser-mdm-metrics-config.rb | 2 +- .../tomlparser-metric-collection-config.rb | 2 +- kubernetes/linux/envmdsd | 2 - kubernetes/linux/main.sh | 285 ++--- kubernetes/linux/mdsd.xml | 345 +++++- kubernetes/linux/setup.sh | 52 +- source/plugins/go/src/oms.go | 386 ++++-- source/plugins/go/src/telemetry.go | 17 + source/plugins/go/src/utils.go | 114 +- .../ruby/ApplicationInsightsUtility.rb | 22 +- .../plugins/ruby/CAdvisorMetricsAPIClient.rb | 212 ++-- source/plugins/ruby/DockerApiClient.rb | 2 +- source/plugins/ruby/KubernetesApiClient.rb | 100 +- source/plugins/ruby/MdmMetricsGenerator.rb | 16 +- source/plugins/ruby/constants.rb | 2 +- source/plugins/ruby/filter_cadvisor2mdm.rb | 111 +- .../ruby/filter_cadvisor_health_container.rb | 15 +- .../ruby/filter_cadvisor_health_node.rb | 28 +- source/plugins/ruby/filter_container.rb | 59 - source/plugins/ruby/filter_docker_log.rb | 103 -- .../ruby/filter_health_model_builder.rb | 43 +- source/plugins/ruby/filter_inventory2mdm.rb | 24 +- source/plugins/ruby/filter_telegraf2mdm.rb | 8 +- ...h_container_cpu_memory_record_formatter.rb | 8 +- .../ruby/health/health_monitor_utils.rb | 12 +- source/plugins/ruby/in_cadvisor_perf.rb | 42 +- source/plugins/ruby/in_containerinventory.rb | 29 +- source/plugins/ruby/in_kube_events.rb | 31 +- source/plugins/ruby/in_kube_health.rb | 16 +- source/plugins/ruby/in_kube_nodes.rb | 111 +- source/plugins/ruby/in_kube_podinventory.rb | 116 +- source/plugins/ruby/in_kube_pvinventory.rb | 37 +- .../plugins/ruby/in_kubestate_deployments.rb | 37 +- source/plugins/ruby/in_kubestate_hpa.rb | 33 +- source/plugins/ruby/in_win_cadvisor_perf.rb | 28 +- source/plugins/ruby/out_health_forward.rb | 1074 ++++++++++------- source/plugins/ruby/out_mdm.rb | 85 +- source/plugins/ruby/podinventory_to_mdm.rb | 10 +- source/plugins/utils/oms_common.rb | 143 +++ source/plugins/utils/omslog.rb | 50 + 49 files changed, 2821 insertions(+), 2176 deletions(-) delete mode 100644 source/plugins/ruby/filter_container.rb delete mode 100644 source/plugins/ruby/filter_docker_log.rb create mode 100644 source/plugins/utils/oms_common.rb create mode 100644 source/plugins/utils/omslog.rb diff --git a/build/common/installer/scripts/tomlparser.rb b/build/common/installer/scripts/tomlparser.rb index a0f3c2f0a..b173ecfe3 100644 --- a/build/common/installer/scripts/tomlparser.rb +++ 
b/build/common/installer/scripts/tomlparser.rb @@ -25,8 +25,10 @@ @enrichContainerLogs = false @containerLogSchemaVersion = "" @collectAllKubeEvents = false -@containerLogsRoute = "" - +@containerLogsRoute = "v2" # default for linux +if !@os_type.nil? && !@os_type.empty? && @os_type.strip.casecmp("windows") == 0 + @containerLogsRoute = "v1" # default is v1 for windows until windows agent integrates windows ama +end # Use parser to parse the configmap toml file to a ruby structure def parseConfigMap begin @@ -162,8 +164,12 @@ def populateSettingValuesFromConfigMap(parsedConfig) #Get container logs route setting begin if !parsedConfig[:log_collection_settings][:route_container_logs].nil? && !parsedConfig[:log_collection_settings][:route_container_logs][:version].nil? - @containerLogsRoute = parsedConfig[:log_collection_settings][:route_container_logs][:version] - puts "config::Using config map setting for container logs route" + if !parsedConfig[:log_collection_settings][:route_container_logs][:version].empty? + @containerLogsRoute = parsedConfig[:log_collection_settings][:route_container_logs][:version] + puts "config::Using config map setting for container logs route: #{@containerLogsRoute}" + else + puts "config::Ignoring config map settings and using default value since provided container logs route value is empty" + end end rescue => errorStr ConfigParseErrorLogger.logError("Exception while reading config map settings for container logs route - #{errorStr}, using defaults, please check config map for errors") @@ -256,7 +262,7 @@ def get_command_windows(env_variable_name, env_variable_value) file.write(commands) commands = get_command_windows('AZMON_CLUSTER_COLLECT_ALL_KUBE_EVENTS', @collectAllKubeEvents) file.write(commands) - commands = get_command_windows('AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE', @containerLogsRoute) + commands = get_command_windows('AZMON_CONTAINER_LOGS_ROUTE', @containerLogsRoute) file.write(commands) commands = get_command_windows('AZMON_CONTAINER_LOG_SCHEMA_VERSION', @containerLogSchemaVersion) file.write(commands) diff --git a/build/linux/installer/conf/container.conf b/build/linux/installer/conf/container.conf index 958a85eb6..093c9ef12 100644 --- a/build/linux/installer/conf/container.conf +++ b/build/linux/installer/conf/container.conf @@ -1,141 +1,179 @@ -# Fluentd config file for OMS Docker - container components (non kubeAPI) - -# Forward port 25225 for container logs - - type forward - port 25225 - bind 127.0.0.1 - - -# MDM metrics from telegraf - - @type tcp - tag oms.mdm.container.perf.telegraf.* - bind 0.0.0.0 - port 25228 - format json - - -# Container inventory - - type containerinventory - tag oms.containerinsights.containerinventory - run_interval 60 - log_level debug - - -#cadvisor perf - - type cadvisorperf - tag oms.api.cadvisorperf - run_interval 60 - log_level debug - - - - type filter_cadvisor_health_node - log_level debug - - - - type filter_cadvisor_health_container - log_level debug - - -#custom_metrics_mdm filter plugin - - type filter_cadvisor2mdm - metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,memoryRssBytes,pvUsedBytes - log_level info - - - - type filter_telegraf2mdm - log_level debug - - - - type out_oms - log_level debug - num_threads 5 - buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_containerinventory*.buffer - buffer_queue_full_action drop_oldest_chunk - buffer_chunk_limit 4m - flush_interval 20s - retry_limit 10 - retry_wait 5s - max_retry_wait 5m - - - - type out_oms - log_level debug - num_threads 5 - 
buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_cadvisorperf*.buffer - buffer_queue_full_action drop_oldest_chunk - buffer_chunk_limit 4m - flush_interval 20s - retry_limit 10 - retry_wait 5s - max_retry_wait 5m - - - - - @type health_forward - send_timeout 60s - recover_wait 10s - hard_timeout 60s - heartbeat_type tcp - skip_network_error_at_init true - expire_dns_cache 600s - buffer_queue_full_action drop_oldest_chunk - buffer_type file - buffer_path %STATE_DIR_WS%/out_health_forward*.buffer - buffer_chunk_limit 3m - flush_interval 20s - retry_limit 10 - retry_wait 5s - max_retry_wait 5m - - - host "#{ENV['HEALTHMODEL_REPLICASET_SERVICE_SERVICE_HOST']}" - port "#{ENV['HEALTHMODEL_REPLICASET_SERVICE_SERVICE_PORT']}" - - - + # Fluentd config file for OMS Docker - container components (non kubeAPI) + + # Forward port 25225 for container logs + # gangams - not used and get ridoff after confirming safe to remove + + @type forward + port 25225 + bind 127.0.0.1 + + + # MDM metrics from telegraf + + @type tcp + tag oms.mdm.container.perf.telegraf.* + bind 0.0.0.0 + port 25228 + format json + + + # Container inventory + + @type containerinventory + tag oneagent.containerInsights.CONTAINER_INVENTORY_BLOB + run_interval 60 + @log_level debug + + + #cadvisor perf + + @type cadvisor_perf + tag oneagent.containerInsights.LINUX_PERF_BLOB + run_interval 60 + @log_level debug + + + + @type cadvisor_health_node + @log_level debug + + + + @type cadvisor_health_container + @log_level debug + + + #custom_metrics_mdm filter plugin + + @type cadvisor2mdm + metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,memoryRssBytes,pvUsedBytes + @log_level info + + + + @type telegraf2mdm + @log_level debug + + + #containerinventory + + @type forward + @log_level debug + send_timeout 30 + connect_timeout 30 + heartbeat_type none + + host 0.0.0.0 + port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" + + @type file - path %STATE_DIR_WS%/fluent_forward_failed.buffer - - - - - type out_mdm - log_level debug - num_threads 5 - buffer_type file - buffer_path %STATE_DIR_WS%/out_mdm_cdvisorperf*.buffer - buffer_queue_full_action drop_oldest_chunk - buffer_chunk_limit 4m - flush_interval 20s - retry_limit 10 - retry_wait 5s - max_retry_wait 5m - retry_mdm_post_wait_minutes 30 - - - - type out_oms - log_level debug - num_threads 5 - buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_insightsmetrics*.buffer - buffer_queue_full_action drop_oldest_chunk - buffer_chunk_limit 4m - flush_interval 20s - retry_limit 10 - retry_wait 5s - max_retry_wait 5m - + path /var/opt/microsoft/docker-cimprov/state/containerinventory*.buffer + overflow_action drop_oldest_chunk + chunk_limit_size 4m + queue_limit_length 20 + flush_interval 20s + retry_max_times 10 + retry_wait 5s + retry_max_interval 5m + flush_thread_count 5 + + keepalive true + + + #cadvisorperf + + @type forward + @log_level debug + send_timeout 30 + connect_timeout 30 + heartbeat_type none + + host 0.0.0.0 + port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" + + + @type file + path /var/opt/microsoft/docker-cimprov/state/cadvisorperf*.buffer + overflow_action drop_oldest_chunk + chunk_limit_size 4m + queue_limit_length 20 + flush_interval 20s + retry_max_times 10 + retry_wait 5s + retry_max_interval 5m + flush_thread_count 5 + + keepalive true + + + + @type health_forward + send_timeout 60s + recover_wait 10s + hard_timeout 60s + transport tcp + ignore_network_errors_at_startup true + expire_dns_cache 600s + + @type file + overflow_action drop_oldest_chunk + path 
/var/opt/microsoft/docker-cimprov/state/out_health_forward*.buffer + chunk_limit_size 3m + flush_interval 20s + retry_max_times 10 + retry_max_interval 5m + retry_wait 5s + + + host "#{ENV['HEALTHMODEL_REPLICASET_SERVICE_SERVICE_HOST']}" + port "#{ENV['HEALTHMODEL_REPLICASET_SERVICE_SERVICE_PORT']}" + + + @type file + path /var/opt/microsoft/docker-cimprov/state/fluent_forward_failed.buffer + + + + + @type mdm + @log_level debug + + @type file + path /var/opt/microsoft/docker-cimprov/state/out_mdm_cdvisorperf*.buffer + overflow_action drop_oldest_chunk + chunk_limit_size 4m + flush_interval 20s + retry_max_times 10 + retry_wait 5s + retry_max_interval 5m + flush_thread_count 5 + + retry_mdm_post_wait_minutes 30 + + + #InsightsMetrics + + @type forward + @log_level debug + send_timeout 30 + connect_timeout 30 + heartbeat_type none + + host 0.0.0.0 + port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" + + + @type file + path /var/opt/microsoft/docker-cimprov/state/insightsmetrics*.buffer + overflow_action drop_oldest_chunk + chunk_limit_size 4m + queue_limit_length 20 + flush_interval 20s + retry_max_times 10 + retry_wait 5s + retry_max_interval 5m + flush_thread_count 5 + + keepalive true + diff --git a/build/linux/installer/conf/kube.conf b/build/linux/installer/conf/kube.conf index fb566c360..a1c8bf928 100644 --- a/build/linux/installer/conf/kube.conf +++ b/build/linux/installer/conf/kube.conf @@ -1,7 +1,6 @@ -# Fluentd config file for OMS Docker - cluster components (kubeAPI) #fluent forward plugin - type forward + @type forward port "#{ENV['HEALTHMODEL_REPLICASET_SERVICE_SERVICE_PORT']}" bind 0.0.0.0 chunk_size_limit 4m @@ -9,262 +8,378 @@ #Kubernetes pod inventory - type kubepodinventory - tag oms.containerinsights.KubePodInventory + @type kube_podinventory + tag oneagent.containerInsights.KUBE_POD_INVENTORY_BLOB run_interval 60 - log_level debug + @log_level debug #Kubernetes Persistent Volume inventory - type kubepvinventory - tag oms.containerinsights.KubePVInventory + @type kube_pvinventory + tag oneagent.containerInsights.KUBE_PV_INVENTORY_BLOB run_interval 60 - log_level debug + @log_level debug #Kubernetes events - type kubeevents - tag oms.containerinsights.KubeEvents + @type kube_events + tag oneagent.containerInsights.KUBE_EVENTS_BLOB run_interval 60 - log_level debug - + @log_level debug + #Kubernetes Nodes - type kubenodeinventory - tag oms.containerinsights.KubeNodeInventory + @type kube_nodes + tag oneagent.containerInsights.KUBE_NODE_INVENTORY_BLOB run_interval 60 - log_level debug + @log_level debug #Kubernetes health - type kubehealth + @type kube_health tag kubehealth.ReplicaSet run_interval 60 - log_level debug + @log_level debug #cadvisor perf- Windows nodes - type wincadvisorperf - tag oms.api.wincadvisorperf + @type win_cadvisor_perf + tag oneagent.containerInsights.LINUX_PERF_BLOB run_interval 60 - log_level debug + @log_level debug #Kubernetes object state - deployments - - type kubestatedeployments - tag oms.containerinsights.KubeStateDeployments - run_interval 60 - log_level debug - + + @type kubestate_deployments + tag oneagent.containerInsights.INSIGHTS_METRICS_BLOB + run_interval 60 + @log_level debug + - #Kubernetes object state - HPA - - type kubestatehpa - tag oms.containerinsights.KubeStateHpa - run_interval 60 - log_level debug - + #Kubernetes object state - HPA + + @type kubestate_hpa + tag oneagent.containerInsights.INSIGHTS_METRICS_BLOB + run_interval 60 + @log_level debug + - type filter_inventory2mdm - log_level info + @type inventory2mdm + @log_level info 
#custom_metrics_mdm filter plugin for perf data from windows nodes - type filter_cadvisor2mdm + @type cadvisor2mdm metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,pvUsedBytes - log_level info + @log_level info #health model aggregation filter - type filter_health_model_builder + @type health_model_builder - - type out_oms - log_level debug - num_threads 5 - buffer_chunk_limit 4m - buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_kubepods*.buffer - buffer_queue_limit 20 - buffer_queue_full_action drop_oldest_chunk - flush_interval 20s - retry_limit 10 - retry_wait 5s - max_retry_wait 5m + #kubepodinventory + + @type forward + @log_level debug + send_timeout 30 + connect_timeout 30 + heartbeat_type none + + host 0.0.0.0 + port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" + + + @type file + path /var/opt/microsoft/docker-cimprov/state/kubepod*.buffer + overflow_action drop_oldest_chunk + chunk_limit_size 4m + queue_limit_length 20 + flush_interval 20s + retry_max_times 10 + retry_wait 5s + retry_max_interval 5m + flush_thread_count 5 + + keepalive true - - type out_oms - log_level debug - num_threads 5 - buffer_chunk_limit 4m - buffer_type file - buffer_path %STATE_DIR_WS%/state/out_oms_kubepv*.buffer - buffer_queue_limit 20 - buffer_queue_full_action drop_oldest_chunk - flush_interval 20s - retry_limit 10 - retry_wait 5s - max_retry_wait 5m - + #kubepvinventory + + @type forward + @log_level debug + send_timeout 30 + connect_timeout 30 + heartbeat_type none + + host 0.0.0.0 + port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" + + + @type file + path /var/opt/microsoft/docker-cimprov/state/kubepv*.buffer + overflow_action drop_oldest_chunk + chunk_limit_size 4m + queue_limit_length 20 + flush_interval 20s + retry_max_times 10 + retry_wait 5s + retry_max_interval 5m + flush_thread_count 5 + + keepalive true + - - type out_oms - log_level debug - num_threads 5 - buffer_chunk_limit 4m - buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_kubeevents*.buffer - buffer_queue_limit 20 - buffer_queue_full_action drop_oldest_chunk - flush_interval 20s - retry_limit 10 - retry_wait 5s - max_retry_wait 5m + #InsightsMetrics + #kubestate + + @type forward + @log_level debug + send_timeout 30 + connect_timeout 30 + heartbeat_type none + + host 0.0.0.0 + port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" + + + @type file + path /var/opt/microsoft/docker-cimprov/state/insightsmetrics*.buffer + overflow_action drop_oldest_chunk + chunk_limit_size 4m + queue_limit_length 20 + flush_interval 20s + retry_max_times 10 + retry_wait 5s + retry_max_interval 5m + flush_thread_count 5 + + keepalive true - - type out_oms - log_level debug - num_threads 2 - buffer_chunk_limit 4m - buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_kubeservices*.buffer - buffer_queue_limit 20 - buffer_queue_full_action drop_oldest_chunk - flush_interval 20s - retry_limit 10 - retry_wait 5s - max_retry_wait 5m + #kubeevents + + @type forward + @log_level debug + send_timeout 30 + connect_timeout 30 + heartbeat_type none + + host 0.0.0.0 + port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" + + + @type file + path /var/opt/microsoft/docker-cimprov/state/kubeevents*.buffer + overflow_action drop_oldest_chunk + chunk_limit_size 4m + queue_limit_length 20 + flush_interval 20s + retry_max_times 10 + retry_wait 5s + retry_max_interval 5m + flush_thread_count 5 + + keepalive true + + #kubeservices + + @type forward + @log_level debug + send_timeout 30 + connect_timeout 30 + heartbeat_type none + + host 0.0.0.0 + port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" + + + @type 
file + path /var/opt/microsoft/docker-cimprov/state/kubeservices*.buffer + overflow_action drop_oldest_chunk + chunk_limit_size 4m + queue_limit_length 20 + flush_interval 20s + retry_max_times 10 + retry_wait 5s + retry_max_interval 5m + flush_thread_count 2 + + keepalive true + - - type out_oms - log_level debug - num_threads 5 - buffer_chunk_limit 4m - buffer_type file - buffer_path %STATE_DIR_WS%/state/out_oms_kubenodes*.buffer - buffer_queue_limit 20 - buffer_queue_full_action drop_oldest_chunk - flush_interval 20s - retry_limit 10 - retry_wait 5s - max_retry_wait 5m + #kubenodeinventory + + @type forward + @log_level debug + send_timeout 30 + connect_timeout 30 + heartbeat_type none + + host 0.0.0.0 + port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" + + + @type file + path /var/opt/microsoft/docker-cimprov/state/kubenode*.buffer + overflow_action drop_oldest_chunk + chunk_limit_size 4m + queue_limit_length 20 + flush_interval 20s + retry_max_times 10 + retry_wait 5s + retry_max_interval 5m + flush_thread_count 5 + + keepalive true - - type out_oms - log_level debug - num_threads 3 - buffer_chunk_limit 4m - buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_containernodeinventory*.buffer - buffer_queue_limit 20 - flush_interval 20s - retry_limit 10 - retry_wait 5s - max_retry_wait 5m + #containernodeinventory + + @type forward + @log_level debug + send_timeout 30 + connect_timeout 30 + heartbeat_type none + + host 0.0.0.0 + port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" + + + @type file + path /var/opt/microsoft/docker-cimprov/state/containernodeinventory*.buffer + overflow_action drop_oldest_chunk + chunk_limit_size 4m + queue_limit_length 20 + flush_interval 20s + retry_max_times 10 + retry_wait 5s + retry_max_interval 5m + flush_thread_count 3 + + keepalive true - - type out_oms - log_level debug - num_threads 5 - buffer_chunk_limit 4m - buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_kubeperf*.buffer - buffer_queue_limit 20 - buffer_queue_full_action drop_oldest_chunk - flush_interval 20s - retry_limit 10 - retry_wait 5s - max_retry_wait 5m + #containerinventory for windows containers + + @type forward + @log_level debug + send_timeout 30 + connect_timeout 30 + heartbeat_type none + + host 0.0.0.0 + port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" + + + @type file + path /var/opt/microsoft/docker-cimprov/state/containerinventory*.buffer + overflow_action drop_oldest_chunk + chunk_limit_size 4m + queue_limit_length 20 + flush_interval 20s + retry_max_times 10 + retry_wait 5s + retry_max_interval 5m + flush_thread_count 5 + + keepalive true + + + #perf + + @type forward + @log_level debug + send_timeout 30 + connect_timeout 30 + heartbeat_type none + + host 0.0.0.0 + port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" + + + @type file + path /var/opt/microsoft/docker-cimprov/state/perf*.buffer + overflow_action drop_oldest_chunk + chunk_limit_size 4m + queue_limit_length 20 + flush_interval 20s + retry_max_times 10 + retry_wait 5s + retry_max_interval 5m + flush_thread_count 5 + + keepalive true - type out_mdm - log_level debug - num_threads 5 - buffer_chunk_limit 4m - buffer_type file - buffer_path %STATE_DIR_WS%/out_mdm_*.buffer - buffer_queue_limit 20 - buffer_queue_full_action drop_oldest_chunk - flush_interval 20s - retry_limit 10 - retry_wait 5s - max_retry_wait 5m + @type mdm + @log_level debug + + @type file + path /var/opt/microsoft/docker-cimprov/state/out_mdm_*.buffer + overflow_action drop_oldest_chunk + chunk_limit_size 4m + queue_limit_length 20 + flush_interval 20s + retry_max_times 10 + 
retry_wait 5s + retry_max_interval 5m + flush_thread_count 5 + retry_mdm_post_wait_minutes 30 - - type out_oms - log_level debug - num_threads 5 - buffer_chunk_limit 4m - buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_api_wincadvisorperf*.buffer - buffer_queue_limit 20 - buffer_queue_full_action drop_oldest_chunk - flush_interval 20s - retry_limit 10 - retry_wait 5s - max_retry_wait 5m - - - type out_mdm - log_level debug - num_threads 5 - buffer_chunk_limit 4m - buffer_type file - buffer_path %STATE_DIR_WS%/out_mdm_cdvisorperf*.buffer - buffer_queue_limit 20 - buffer_queue_full_action drop_oldest_chunk - flush_interval 20s - retry_limit 10 - retry_wait 5s - max_retry_wait 5m + @type mdm + @log_level debug + + @type file + path /var/opt/microsoft/docker-cimprov/state/out_mdm_cdvisorperf*.buffer + overflow_action drop_oldest_chunk + chunk_limit_size 4m + queue_limit_length 20 + flush_interval 20s + retry_max_times 10 + retry_wait 5s + retry_max_interval 5m + flush_thread_count 5 + retry_mdm_post_wait_minutes 30 - - - type out_oms - log_level debug - num_threads 5 - buffer_chunk_limit 4m - buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_kubehealth*.buffer - buffer_queue_limit 20 - buffer_queue_full_action drop_oldest_chunk - flush_interval 20s - retry_limit 10 - retry_wait 5s - max_retry_wait 5m + + #kubehealth + + @type forward + @log_level debug + send_timeout 30 + connect_timeout 30 + heartbeat_type none + + host 0.0.0.0 + port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" + + + @type file + path /var/opt/microsoft/docker-cimprov/state/kubehealth*.buffer + overflow_action drop_oldest_chunk + chunk_limit_size 4m + queue_limit_length 20 + flush_interval 20s + retry_max_times 10 + retry_wait 5s + retry_max_interval 5m + flush_thread_count 5 + + keepalive true - - - type out_oms - log_level debug - num_threads 5 - buffer_chunk_limit 4m - buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_insightsmetrics*.buffer - buffer_queue_limit 20 - buffer_queue_full_action drop_oldest_chunk - flush_interval 20s - retry_limit 10 - retry_wait 5s - max_retry_wait 5m - \ No newline at end of file diff --git a/build/linux/installer/conf/out_oms.conf b/build/linux/installer/conf/out_oms.conf index 74ba3195e..21dc4c1ed 100644 --- a/build/linux/installer/conf/out_oms.conf +++ b/build/linux/installer/conf/out_oms.conf @@ -1,10 +1,9 @@ -omsadmin_conf_path=/etc/opt/microsoft/omsagent/conf/omsadmin.conf omsproxy_secret_path=/etc/omsagent-secret/PROXY adx_cluster_uri_path=/etc/config/settings/adx/ADXCLUSTERURI adx_client_id_path=/etc/config/settings/adx/ADXCLIENTID adx_tenant_id_path=/etc/config/settings/adx/ADXTENANTID adx_client_secret_path=/etc/config/settings/adx/ADXCLIENTSECRET -cert_file_path=/etc/opt/microsoft/omsagent/certs/oms.crt -key_file_path=/etc/opt/microsoft/omsagent/certs/oms.key +cert_file_path=/etc/mdsd.d/oms/%s/oms.crt +key_file_path=/etc/mdsd.d/oms/%s/oms.key container_host_file_path=/var/opt/microsoft/docker-cimprov/state/containerhostname container_inventory_refresh_interval=60 diff --git a/build/linux/installer/datafiles/base_container.data b/build/linux/installer/datafiles/base_container.data index df8fbc3da..b9f889dba 100644 --- a/build/linux/installer/datafiles/base_container.data +++ b/build/linux/installer/datafiles/base_container.data @@ -18,89 +18,8 @@ MAINTAINER: 'Microsoft Corporation' /etc/opt/microsoft/docker-cimprov/conf/installinfo.txt; build/linux/installer/conf/installinfo.txt; 644; root; root; conffile -/opt/microsoft/omsagent/plugin/filter_docker_log.rb; 
source/plugins/ruby/filter_docker_log.rb; 644; root; root -/opt/microsoft/omsagent/plugin/filter_container.rb; source/plugins/ruby/filter_container.rb; 644; root; root - -/opt/microsoft/omsagent/plugin/in_kube_podinventory.rb; source/plugins/ruby/in_kube_podinventory.rb; 644; root; root -/opt/microsoft/omsagent/plugin/in_kube_pvinventory.rb; source/plugins/ruby/in_kube_pvinventory.rb; 644; root; root -/opt/microsoft/omsagent/plugin/in_kube_events.rb; source/plugins/ruby/in_kube_events.rb; 644; root; root -/opt/microsoft/omsagent/plugin/KubernetesApiClient.rb; source/plugins/ruby/KubernetesApiClient.rb; 644; root; root - /etc/opt/microsoft/docker-cimprov/container.conf; build/linux/installer/conf/container.conf; 644; root; root -/opt/microsoft/omsagent/plugin/CAdvisorMetricsAPIClient.rb; source/plugins/ruby/CAdvisorMetricsAPIClient.rb; 644; root; root -/opt/microsoft/omsagent/plugin/in_cadvisor_perf.rb; source/plugins/ruby/in_cadvisor_perf.rb; 644; root; root -/opt/microsoft/omsagent/plugin/in_win_cadvisor_perf.rb; source/plugins/ruby/in_win_cadvisor_perf.rb; 644; root; root -/opt/microsoft/omsagent/plugin/in_kube_nodes.rb; source/plugins/ruby/in_kube_nodes.rb; 644; root; root -/opt/microsoft/omsagent/plugin/in_kubestate_deployments.rb; source/plugins/ruby/in_kubestate_deployments.rb; 644; root; root -/opt/microsoft/omsagent/plugin/in_kubestate_hpa.rb; source/plugins/ruby/in_kubestate_hpa.rb; 644; root; root -/opt/microsoft/omsagent/plugin/filter_inventory2mdm.rb; source/plugins/ruby/filter_inventory2mdm.rb; 644; root; root -/opt/microsoft/omsagent/plugin/podinventory_to_mdm.rb; source/plugins/ruby/podinventory_to_mdm.rb; 644; root; root -/opt/microsoft/omsagent/plugin/kubelet_utils.rb; source/plugins/ruby/kubelet_utils.rb; 644; root; root -/opt/microsoft/omsagent/plugin/CustomMetricsUtils.rb; source/plugins/ruby/CustomMetricsUtils.rb; 644; root; root -/opt/microsoft/omsagent/plugin/constants.rb; source/plugins/ruby/constants.rb; 644; root; root -/opt/microsoft/omsagent/plugin/MdmAlertTemplates.rb; source/plugins/ruby/MdmAlertTemplates.rb; 644; root; root -/opt/microsoft/omsagent/plugin/MdmMetricsGenerator.rb; source/plugins/ruby/MdmMetricsGenerator.rb; 644; root; root - - -/opt/microsoft/omsagent/plugin/ApplicationInsightsUtility.rb; source/plugins/ruby/ApplicationInsightsUtility.rb; 644; root; root -/opt/microsoft/omsagent/plugin/ContainerInventoryState.rb; source/plugins/ruby/ContainerInventoryState.rb; 644; root; root -/opt/microsoft/omsagent/plugin/DockerApiClient.rb; source/plugins/ruby/DockerApiClient.rb; 644; root; root -/opt/microsoft/omsagent/plugin/DockerApiRestHelper.rb; source/plugins/ruby/DockerApiRestHelper.rb; 644; root; root -/opt/microsoft/omsagent/plugin/in_containerinventory.rb; source/plugins/ruby/in_containerinventory.rb; 644; root; root -/opt/microsoft/omsagent/plugin/kubernetes_container_inventory.rb; source/plugins/ruby/kubernetes_container_inventory.rb; 644; root; root -/opt/microsoft/omsagent/plugin/proxy_utils.rb; source/plugins/ruby/proxy_utils.rb; 644; root; root - -/opt/microsoft/omsagent/plugin/arc_k8s_cluster_identity.rb; source/plugins/ruby/arc_k8s_cluster_identity.rb; 644; root; root -/opt/microsoft/omsagent/plugin/out_mdm.rb; source/plugins/ruby/out_mdm.rb; 644; root; root -/opt/microsoft/omsagent/plugin/filter_cadvisor2mdm.rb; source/plugins/ruby/filter_cadvisor2mdm.rb; 644; root; root -/opt/microsoft/omsagent/plugin/filter_telegraf2mdm.rb; source/plugins/ruby/filter_telegraf2mdm.rb; 644; root; root - 
-/opt/microsoft/omsagent/plugin/lib/application_insights/version.rb; source/plugins/ruby/lib/application_insights/version.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/rack/track_request.rb; source/plugins/ruby/lib/application_insights/rack/track_request.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/unhandled_exception.rb; source/plugins/ruby/lib/application_insights/unhandled_exception.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/telemetry_client.rb; source/plugins/ruby/lib/application_insights/telemetry_client.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/queue_base.rb; source/plugins/ruby/lib/application_insights/channel/queue_base.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/asynchronous_queue.rb; source/plugins/ruby/lib/application_insights/channel/asynchronous_queue.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/synchronous_sender.rb; source/plugins/ruby/lib/application_insights/channel/synchronous_sender.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/data_point_type.rb; source/plugins/ruby/lib/application_insights/channel/contracts/data_point_type.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/data_point.rb; source/plugins/ruby/lib/application_insights/channel/contracts/data_point.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/stack_frame.rb; source/plugins/ruby/lib/application_insights/channel/contracts/stack_frame.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/request_data.rb; source/plugins/ruby/lib/application_insights/channel/contracts/request_data.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/session.rb; source/plugins/ruby/lib/application_insights/channel/contracts/session.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/page_view_data.rb; source/plugins/ruby/lib/application_insights/channel/contracts/page_view_data.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/remote_dependency_data.rb; source/plugins/ruby/lib/application_insights/channel/contracts/remote_dependency_data.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/exception_data.rb; source/plugins/ruby/lib/application_insights/channel/contracts/exception_data.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/location.rb; source/plugins/ruby/lib/application_insights/channel/contracts/location.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/operation.rb; source/plugins/ruby/lib/application_insights/channel/contracts/operation.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/data.rb; source/plugins/ruby/lib/application_insights/channel/contracts/data.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/event_data.rb; source/plugins/ruby/lib/application_insights/channel/contracts/event_data.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/metric_data.rb; 
source/plugins/ruby/lib/application_insights/channel/contracts/metric_data.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/device.rb; source/plugins/ruby/lib/application_insights/channel/contracts/device.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/message_data.rb; source/plugins/ruby/lib/application_insights/channel/contracts/message_data.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/dependency_source_type.rb; source/plugins/ruby/lib/application_insights/channel/contracts/dependency_source_type.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/user.rb; source/plugins/ruby/lib/application_insights/channel/contracts/user.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/severity_level.rb; source/plugins/ruby/lib/application_insights/channel/contracts/severity_level.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/application.rb; source/plugins/ruby/lib/application_insights/channel/contracts/application.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/dependency_kind.rb; source/plugins/ruby/lib/application_insights/channel/contracts/dependency_kind.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/cloud.rb; source/plugins/ruby/lib/application_insights/channel/contracts/cloud.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/envelope.rb; source/plugins/ruby/lib/application_insights/channel/contracts/envelope.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/json_serializable.rb; source/plugins/ruby/lib/application_insights/channel/contracts/json_serializable.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/domain.rb; source/plugins/ruby/lib/application_insights/channel/contracts/domain.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/base.rb; source/plugins/ruby/lib/application_insights/channel/contracts/base.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/reopenings.rb; source/plugins/ruby/lib/application_insights/channel/contracts/reopenings.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/page_view_perf_data.rb; source/plugins/ruby/lib/application_insights/channel/contracts/page_view_perf_data.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/internal.rb; source/plugins/ruby/lib/application_insights/channel/contracts/internal.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/availability_data.rb; source/plugins/ruby/lib/application_insights/channel/contracts/availability_data.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts/exception_details.rb; source/plugins/ruby/lib/application_insights/channel/contracts/exception_details.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/synchronous_queue.rb; source/plugins/ruby/lib/application_insights/channel/synchronous_queue.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/sender_base.rb; 
source/plugins/ruby/lib/application_insights/channel/sender_base.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/telemetry_context.rb; source/plugins/ruby/lib/application_insights/channel/telemetry_context.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/asynchronous_sender.rb; source/plugins/ruby/lib/application_insights/channel/asynchronous_sender.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/telemetry_channel.rb; source/plugins/ruby/lib/application_insights/channel/telemetry_channel.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/event.rb; source/plugins/ruby/lib/application_insights/channel/event.rb; 644; root; root -/opt/microsoft/omsagent/plugin/lib/application_insights.rb; source/plugins/ruby/lib/application_insights.rb; 644; root; root - /opt/tomlrb.rb; source/toml-parser/tomlrb.rb; 644; root; root /opt/tomlrb/generated_parser.rb; source/toml-parser/tomlrb/generated_parser.rb; 644; root; root /opt/tomlrb/handler.rb; source/toml-parser/tomlrb/handler.rb; 644; root; root @@ -126,6 +45,7 @@ MAINTAINER: 'Microsoft Corporation' /opt/tomlparser-mdm-metrics-config.rb; build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb; 755; root; root /opt/tomlparser-metric-collection-config.rb; build/linux/installer/scripts/tomlparser-metric-collection-config.rb; 755; root; root + /opt/tomlparser-agent-config.rb; build/linux/installer/scripts/tomlparser-agent-config.rb; 755; root; root /opt/tomlparser.rb; build/common/installer/scripts/tomlparser.rb; 755; root; root /opt/td-agent-bit-conf-customizer.rb; build/common/installer/scripts/td-agent-bit-conf-customizer.rb; 755; root; root @@ -134,43 +54,127 @@ MAINTAINER: 'Microsoft Corporation' /opt/tomlparser-osm-config.rb; build/linux/installer/scripts/tomlparser-osm-config.rb; 755; root; root -/opt/microsoft/omsagent/plugin/filter_cadvisor_health_container.rb; source/plugins/ruby/filter_cadvisor_health_container.rb; 644; root; root -/opt/microsoft/omsagent/plugin/filter_cadvisor_health_node.rb; source/plugins/ruby/filter_cadvisor_health_node.rb; 644; root; root -/opt/microsoft/omsagent/plugin/filter_health_model_builder.rb; source/plugins/ruby/filter_health_model_builder.rb; 644; root; root -/opt/microsoft/omsagent/plugin/in_kube_health.rb; source/plugins/ruby/in_kube_health.rb; 644; root; root -/opt/microsoft/omsagent/plugin/out_health_forward.rb; source/plugins/ruby/out_health_forward.rb; 644; root; root /etc/opt/microsoft/docker-cimprov/health/healthmonitorconfig.json; build/linux/installer/conf/healthmonitorconfig.json; 644; root; root /etc/opt/microsoft/docker-cimprov/health/health_model_definition.json; build/linux/installer/conf/health_model_definition.json; 644; root; root -/opt/microsoft/omsagent/plugin/health/aggregate_monitor.rb; source/plugins/ruby/health/aggregate_monitor.rb; 644; root; root -/opt/microsoft/omsagent/plugin/health/agg_monitor_id_labels.rb; source/plugins/ruby/health/agg_monitor_id_labels.rb; 644; root; root -/opt/microsoft/omsagent/plugin/health/aggregate_monitor_state_finalizer.rb; source/plugins/ruby/health/aggregate_monitor_state_finalizer.rb; 644; root; root -/opt/microsoft/omsagent/plugin/health/cluster_health_state.rb; source/plugins/ruby/health/cluster_health_state.rb; 644; root; root -/opt/microsoft/omsagent/plugin/health/health_container_cpu_memory_aggregator.rb; source/plugins/ruby/health/health_container_cpu_memory_aggregator.rb; 644; root; root 
-/opt/microsoft/omsagent/plugin/health/health_container_cpu_memory_record_formatter.rb; source/plugins/ruby/health/health_container_cpu_memory_record_formatter.rb; 644; root; root -/opt/microsoft/omsagent/plugin/health/health_hierarchy_builder.rb; source/plugins/ruby/health/health_hierarchy_builder.rb; 644; root; root -/opt/microsoft/omsagent/plugin/health/health_kubernetes_resources.rb; source/plugins/ruby/health/health_kubernetes_resources.rb; 644; root; root -/opt/microsoft/omsagent/plugin/health/health_kube_api_down_handler.rb; source/plugins/ruby/health/health_kube_api_down_handler.rb; 644; root; root -/opt/microsoft/omsagent/plugin/health/health_missing_signal_generator.rb; source/plugins/ruby/health/health_missing_signal_generator.rb; 644; root; root -/opt/microsoft/omsagent/plugin/health/health_model_buffer.rb; source/plugins/ruby/health/health_model_buffer.rb; 644; root; root -/opt/microsoft/omsagent/plugin/health/health_model_builder.rb; source/plugins/ruby/health/health_model_builder.rb; 644; root; root -/opt/microsoft/omsagent/plugin/health/health_model_constants.rb; source/plugins/ruby/health/health_model_constants.rb; 644; root; root -/opt/microsoft/omsagent/plugin/health/parent_monitor_provider.rb; source/plugins/ruby/health/parent_monitor_provider.rb; 644; root; root -/opt/microsoft/omsagent/plugin/health/health_model_definition_parser.rb; source/plugins/ruby/health/health_model_definition_parser.rb; 644; root; root -/opt/microsoft/omsagent/plugin/health/health_monitor_helpers.rb; source/plugins/ruby/health/health_monitor_helpers.rb; 644; root; root -/opt/microsoft/omsagent/plugin/health/health_monitor_optimizer.rb; source/plugins/ruby/health/health_monitor_optimizer.rb; 644; root; root -/opt/microsoft/omsagent/plugin/health/health_monitor_provider.rb; source/plugins/ruby/health/health_monitor_provider.rb; 644; root; root -/opt/microsoft/omsagent/plugin/health/health_monitor_record.rb; source/plugins/ruby/health/health_monitor_record.rb; 644; root; root -/opt/microsoft/omsagent/plugin/health/health_monitor_state.rb; source/plugins/ruby/health/health_monitor_state.rb; 644; root; root -/opt/microsoft/omsagent/plugin/health/health_monitor_telemetry.rb; source/plugins/ruby/health/health_monitor_telemetry.rb; 644; root; root -/opt/microsoft/omsagent/plugin/health/health_monitor_utils.rb; source/plugins/ruby/health/health_monitor_utils.rb; 644; root; root -/opt/microsoft/omsagent/plugin/health/health_signal_reducer.rb; source/plugins/ruby/health/health_signal_reducer.rb; 644; root; root -/opt/microsoft/omsagent/plugin/health/monitor_factory.rb; source/plugins/ruby/health/monitor_factory.rb; 644; root; root -/opt/microsoft/omsagent/plugin/health/monitor_set.rb; source/plugins/ruby/health/monitor_set.rb; 644; root; root -/opt/microsoft/omsagent/plugin/health/unit_monitor.rb; source/plugins/ruby/health/unit_monitor.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/version.rb; source/plugins/ruby/lib/application_insights/version.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/rack/track_request.rb; source/plugins/ruby/lib/application_insights/rack/track_request.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/unhandled_exception.rb; source/plugins/ruby/lib/application_insights/unhandled_exception.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/telemetry_client.rb; source/plugins/ruby/lib/application_insights/telemetry_client.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/queue_base.rb; 
source/plugins/ruby/lib/application_insights/channel/queue_base.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/asynchronous_queue.rb; source/plugins/ruby/lib/application_insights/channel/asynchronous_queue.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/synchronous_sender.rb; source/plugins/ruby/lib/application_insights/channel/synchronous_sender.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/data_point_type.rb; source/plugins/ruby/lib/application_insights/channel/contracts/data_point_type.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/data_point.rb; source/plugins/ruby/lib/application_insights/channel/contracts/data_point.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/stack_frame.rb; source/plugins/ruby/lib/application_insights/channel/contracts/stack_frame.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/request_data.rb; source/plugins/ruby/lib/application_insights/channel/contracts/request_data.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/session.rb; source/plugins/ruby/lib/application_insights/channel/contracts/session.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/page_view_data.rb; source/plugins/ruby/lib/application_insights/channel/contracts/page_view_data.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/remote_dependency_data.rb; source/plugins/ruby/lib/application_insights/channel/contracts/remote_dependency_data.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/exception_data.rb; source/plugins/ruby/lib/application_insights/channel/contracts/exception_data.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/location.rb; source/plugins/ruby/lib/application_insights/channel/contracts/location.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/operation.rb; source/plugins/ruby/lib/application_insights/channel/contracts/operation.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/data.rb; source/plugins/ruby/lib/application_insights/channel/contracts/data.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/event_data.rb; source/plugins/ruby/lib/application_insights/channel/contracts/event_data.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/metric_data.rb; source/plugins/ruby/lib/application_insights/channel/contracts/metric_data.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/device.rb; source/plugins/ruby/lib/application_insights/channel/contracts/device.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/message_data.rb; source/plugins/ruby/lib/application_insights/channel/contracts/message_data.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/dependency_source_type.rb; source/plugins/ruby/lib/application_insights/channel/contracts/dependency_source_type.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/user.rb; source/plugins/ruby/lib/application_insights/channel/contracts/user.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/severity_level.rb; source/plugins/ruby/lib/application_insights/channel/contracts/severity_level.rb; 644; 
root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/application.rb; source/plugins/ruby/lib/application_insights/channel/contracts/application.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/dependency_kind.rb; source/plugins/ruby/lib/application_insights/channel/contracts/dependency_kind.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/cloud.rb; source/plugins/ruby/lib/application_insights/channel/contracts/cloud.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/envelope.rb; source/plugins/ruby/lib/application_insights/channel/contracts/envelope.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/json_serializable.rb; source/plugins/ruby/lib/application_insights/channel/contracts/json_serializable.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/domain.rb; source/plugins/ruby/lib/application_insights/channel/contracts/domain.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/base.rb; source/plugins/ruby/lib/application_insights/channel/contracts/base.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/reopenings.rb; source/plugins/ruby/lib/application_insights/channel/contracts/reopenings.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/page_view_perf_data.rb; source/plugins/ruby/lib/application_insights/channel/contracts/page_view_perf_data.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/internal.rb; source/plugins/ruby/lib/application_insights/channel/contracts/internal.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/availability_data.rb; source/plugins/ruby/lib/application_insights/channel/contracts/availability_data.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/contracts/exception_details.rb; source/plugins/ruby/lib/application_insights/channel/contracts/exception_details.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/synchronous_queue.rb; source/plugins/ruby/lib/application_insights/channel/synchronous_queue.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/sender_base.rb; source/plugins/ruby/lib/application_insights/channel/sender_base.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/telemetry_context.rb; source/plugins/ruby/lib/application_insights/channel/telemetry_context.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/asynchronous_sender.rb; source/plugins/ruby/lib/application_insights/channel/asynchronous_sender.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/telemetry_channel.rb; source/plugins/ruby/lib/application_insights/channel/telemetry_channel.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights/channel/event.rb; source/plugins/ruby/lib/application_insights/channel/event.rb; 644; root; root +/etc/fluent/plugin/lib/application_insights.rb; source/plugins/ruby/lib/application_insights.rb; 644; root; root + +/etc/fluent/plugin/health/aggregate_monitor.rb; source/plugins/ruby/health/aggregate_monitor.rb; 644; root; root +/etc/fluent/plugin/health/agg_monitor_id_labels.rb; source/plugins/ruby/health/agg_monitor_id_labels.rb; 644; root; root +/etc/fluent/plugin/health/aggregate_monitor_state_finalizer.rb; source/plugins/ruby/health/aggregate_monitor_state_finalizer.rb; 
644; root; root +/etc/fluent/plugin/health/cluster_health_state.rb; source/plugins/ruby/health/cluster_health_state.rb; 644; root; root +/etc/fluent/plugin/health/health_container_cpu_memory_aggregator.rb; source/plugins/ruby/health/health_container_cpu_memory_aggregator.rb; 644; root; root +/etc/fluent/plugin/health/health_container_cpu_memory_record_formatter.rb; source/plugins/ruby/health/health_container_cpu_memory_record_formatter.rb; 644; root; root +/etc/fluent/plugin/health/health_hierarchy_builder.rb; source/plugins/ruby/health/health_hierarchy_builder.rb; 644; root; root +/etc/fluent/plugin/health/health_kubernetes_resources.rb; source/plugins/ruby/health/health_kubernetes_resources.rb; 644; root; root +/etc/fluent/plugin/health/health_kube_api_down_handler.rb; source/plugins/ruby/health/health_kube_api_down_handler.rb; 644; root; root +/etc/fluent/plugin/health/health_missing_signal_generator.rb; source/plugins/ruby/health/health_missing_signal_generator.rb; 644; root; root +/etc/fluent/plugin/health/health_model_buffer.rb; source/plugins/ruby/health/health_model_buffer.rb; 644; root; root +/etc/fluent/plugin/health/health_model_builder.rb; source/plugins/ruby/health/health_model_builder.rb; 644; root; root +/etc/fluent/plugin/health/health_model_constants.rb; source/plugins/ruby/health/health_model_constants.rb; 644; root; root +/etc/fluent/plugin/health/parent_monitor_provider.rb; source/plugins/ruby/health/parent_monitor_provider.rb; 644; root; root +/etc/fluent/plugin/health/health_model_definition_parser.rb; source/plugins/ruby/health/health_model_definition_parser.rb; 644; root; root +/etc/fluent/plugin/health/health_monitor_helpers.rb; source/plugins/ruby/health/health_monitor_helpers.rb; 644; root; root +/etc/fluent/plugin/health/health_monitor_optimizer.rb; source/plugins/ruby/health/health_monitor_optimizer.rb; 644; root; root +/etc/fluent/plugin/health/health_monitor_provider.rb; source/plugins/ruby/health/health_monitor_provider.rb; 644; root; root +/etc/fluent/plugin/health/health_monitor_record.rb; source/plugins/ruby/health/health_monitor_record.rb; 644; root; root +/etc/fluent/plugin/health/health_monitor_state.rb; source/plugins/ruby/health/health_monitor_state.rb; 644; root; root +/etc/fluent/plugin/health/health_monitor_telemetry.rb; source/plugins/ruby/health/health_monitor_telemetry.rb; 644; root; root +/etc/fluent/plugin/health/health_monitor_utils.rb; source/plugins/ruby/health/health_monitor_utils.rb; 644; root; root +/etc/fluent/plugin/health/health_signal_reducer.rb; source/plugins/ruby/health/health_signal_reducer.rb; 644; root; root +/etc/fluent/plugin/health/monitor_factory.rb; source/plugins/ruby/health/monitor_factory.rb; 644; root; root +/etc/fluent/plugin/health/monitor_set.rb; source/plugins/ruby/health/monitor_set.rb; 644; root; root +/etc/fluent/plugin/health/unit_monitor.rb; source/plugins/ruby/health/unit_monitor.rb; 644; root; root + +/etc/fluent/plugin/ApplicationInsightsUtility.rb; source/plugins/ruby/ApplicationInsightsUtility.rb; 644; root; root +/etc/fluent/plugin/arc_k8s_cluster_identity.rb; source/plugins/ruby/arc_k8s_cluster_identity.rb; 644; root; root +/etc/fluent/plugin/CAdvisorMetricsAPIClient.rb; source/plugins/ruby/CAdvisorMetricsAPIClient.rb; 644; root; root +/etc/fluent/plugin/constants.rb; source/plugins/ruby/constants.rb; 644; root; root +/etc/fluent/plugin/ContainerInventoryState.rb; source/plugins/ruby/ContainerInventoryState.rb; 644; root; root +/etc/fluent/plugin/CustomMetricsUtils.rb; 
source/plugins/ruby/CustomMetricsUtils.rb; 644; root; root +/etc/fluent/plugin/DockerApiClient.rb; source/plugins/ruby/DockerApiClient.rb; 644; root; root +/etc/fluent/plugin/DockerApiRestHelper.rb; source/plugins/ruby/DockerApiRestHelper.rb; 644; root; root +/etc/fluent/plugin/kubelet_utils.rb; source/plugins/ruby/kubelet_utils.rb; 644; root; root +/etc/fluent/plugin/proxy_utils.rb; source/plugins/ruby/proxy_utils.rb; 644; root; root +/etc/fluent/plugin/kubernetes_container_inventory.rb; source/plugins/ruby/kubernetes_container_inventory.rb; 644; root; root +/etc/fluent/plugin/podinventory_to_mdm.rb; source/plugins/ruby/podinventory_to_mdm.rb; 644; root; root +/etc/fluent/plugin/MdmMetricsGenerator.rb; source/plugins/ruby/MdmMetricsGenerator.rb; 644; root; root +/etc/fluent/plugin/MdmAlertTemplates.rb; source/plugins/ruby/MdmAlertTemplates.rb; 644; root; root + +/etc/fluent/plugin/omslog.rb; source/plugins/utils/omslog.rb; 644; root; root +/etc/fluent/plugin/oms_common.rb; source/plugins/utils/oms_common.rb; 644; root; root + +/etc/fluent/kube.conf; build/linux/installer/conf/kube.conf; 644; root; root +/etc/fluent/container.conf; build/linux/installer/conf/container.conf; 644; root; root + +/etc/fluent/plugin/in_cadvisor_perf.rb; source/plugins/ruby/in_cadvisor_perf.rb; 644; root; root +/etc/fluent/plugin/in_win_cadvisor_perf.rb; source/plugins/ruby/in_win_cadvisor_perf.rb; 644; root; root +/etc/fluent/plugin/in_containerinventory.rb; source/plugins/ruby/in_containerinventory.rb; 644; root; root +/etc/fluent/plugin/in_kube_nodes.rb; source/plugins/ruby/in_kube_nodes.rb; 644; root; root +/etc/fluent/plugin/in_kube_podinventory.rb; source/plugins/ruby/in_kube_podinventory.rb; 644; root; root +/etc/fluent/plugin/KubernetesApiClient.rb; source/plugins/ruby/KubernetesApiClient.rb; 644; root; root +/etc/fluent/plugin/in_kube_events.rb; source/plugins/ruby/in_kube_events.rb; 644; root; root +/etc/fluent/plugin/in_kube_health.rb; source/plugins/ruby/in_kube_health.rb; 644; root; root +/etc/fluent/plugin/in_kube_pvinventory.rb; source/plugins/ruby/in_kube_pvinventory.rb; 644; root; root +/etc/fluent/plugin/in_kubestate_deployments.rb; source/plugins/ruby/in_kubestate_deployments.rb; 644; root; root +/etc/fluent/plugin/in_kubestate_hpa.rb; source/plugins/ruby/in_kubestate_hpa.rb; 644; root; root + +/etc/fluent/plugin/filter_cadvisor_health_container.rb; source/plugins/ruby/filter_cadvisor_health_container.rb; 644; root; root +/etc/fluent/plugin/filter_cadvisor_health_node.rb; source/plugins/ruby/filter_cadvisor_health_node.rb; 644; root; root +/etc/fluent/plugin/filter_cadvisor2mdm.rb; source/plugins/ruby/filter_cadvisor2mdm.rb; 644; root; root +/etc/fluent/plugin/filter_health_model_builder.rb; source/plugins/ruby/filter_health_model_builder.rb; 644; root; root +/etc/fluent/plugin/filter_inventory2mdm.rb; source/plugins/ruby/filter_inventory2mdm.rb; 644; root; root +/etc/fluent/plugin/filter_telegraf2mdm.rb; source/plugins/ruby/filter_telegraf2mdm.rb; 644; root; root + +/etc/fluent/plugin/out_health_forward.rb; source/plugins/ruby/out_health_forward.rb; 644; root; root +/etc/fluent/plugin/out_mdm.rb; source/plugins/ruby/out_mdm.rb; 644; root; root + + %Links -/opt/omi/lib/libcontainer.${{SHLIB_EXT}}; /opt/microsoft/docker-cimprov/lib/libcontainer.${{SHLIB_EXT}}; 644; root; root %Directories /etc; 755; root; root; sysdir @@ -179,27 +183,18 @@ MAINTAINER: 'Microsoft Corporation' /var; 755; root; root; sysdir /var/opt; 755; root; root; sysdir +/opt/fluent; 755; root; root; sysdir + 
/etc/opt/microsoft; 755; root; root; sysdir /etc/opt/microsoft/docker-cimprov; 755; root; root /etc/opt/microsoft/docker-cimprov/conf; 755; root; root /etc/opt/microsoft/docker-cimprov/health; 755; root; root -/etc/opt/omi; 755; root; root; sysdir -/etc/opt/omi/conf; 755; root; root; sysdir -/etc/opt/omi/conf/omiregister; 755; root; root; sysdir -/etc/opt/omi/conf/omiregister/root-cimv2; 755; root; root - /opt/microsoft; 755; root; root; sysdir /opt/microsoft/docker-cimprov; 755; root; root /opt/microsoft/docker-cimprov/bin; 755; root; root /opt/microsoft/docker-cimprov/lib; 755; root; root -/opt/microsoft/omsagent; 755; root; root; sysdir -/opt/microsoft/omsagent/plugin; 755; root; root; sysdir -/opt/microsoft/omsagent/plugin/health; 755; root; root; sysdir - -/opt/omi; 755; root; root; sysdir -/opt/omi/lib; 755; root; root; sysdir /var/opt/microsoft; 755; root; root; sysdir /var/opt/microsoft/docker-cimprov; 755; root; root @@ -213,11 +208,14 @@ MAINTAINER: 'Microsoft Corporation' /opt/td-agent-bit/bin; 755; root; root;sysdir /etc/telegraf; 755; root; root;sysdir -/opt/microsoft/omsagent/plugin/lib; 755; root; root; sysdir -/opt/microsoft/omsagent/plugin/lib/application_insights; 755; root; root; sysdir -/opt/microsoft/omsagent/plugin/lib/application_insights/channel; 755; root; root; sysdir -/opt/microsoft/omsagent/plugin/lib/application_insights/channel/contracts; 755; root; root; sysdir -/opt/microsoft/omsagent/plugin/lib/application_insights/rack; 755; root; root; sysdir +/etc/fluent; 755; root; root; sysdir +/etc/fluent/plugin; 755; root; root; sysdir +/etc/fluent/plugin/health; 755; root; root; sysdir +/etc/fluent/plugin/lib; 755; root; root; sysdir +/etc/fluent/plugin/lib/application_insights; 755; root; root; sysdir +/etc/fluent/plugin/lib/application_insights/channel; 755; root; root; sysdir +/etc/fluent/plugin/lib/application_insights/channel/contracts; 755; root; root; sysdir +/etc/fluent/plugin/lib/application_insights/rack; 755; root; root; sysdir /opt/tomlrb; 755; root; root; sysdir @@ -230,64 +228,61 @@ WriteInstallInfo() { } WriteInstallInfo -#Make omsagent owner for ContainerInventory directory. 
This is needed for ruby plugin to have access -chown omsagent:omsagent /var/opt/microsoft/docker-cimprov/state/ContainerInventory # Get the state file in place with proper permissions touch /var/opt/microsoft/docker-cimprov/state/LastEventQueryTime.txt chmod 644 /var/opt/microsoft/docker-cimprov/state/LastEventQueryTime.txt -chown omsagent:omsagent /var/opt/microsoft/docker-cimprov/state/LastEventQueryTime.txt touch /var/opt/microsoft/docker-cimprov/state/KubeEventQueryState.yaml chmod 644 /var/opt/microsoft/docker-cimprov/state/KubeEventQueryState.yaml -chown omsagent:omsagent /var/opt/microsoft/docker-cimprov/state/KubeEventQueryState.yaml touch /var/opt/microsoft/docker-cimprov/state/KubeLogQueryState.yaml chmod 644 /var/opt/microsoft/docker-cimprov/state/KubeLogQueryState.yaml -chown omsagent:omsagent /var/opt/microsoft/docker-cimprov/state/KubeLogQueryState.yaml + touch /var/opt/microsoft/docker-cimprov/log/kubernetes_client_log.txt chmod 666 /var/opt/microsoft/docker-cimprov/log/kubernetes_client_log.txt -chown omsagent:omiusers /var/opt/microsoft/docker-cimprov/log/kubernetes_client_log.txt + touch /var/opt/microsoft/docker-cimprov/log/kubernetes_perf_log.txt chmod 666 /var/opt/microsoft/docker-cimprov/log/kubernetes_perf_log.txt -chown omsagent:omiusers /var/opt/microsoft/docker-cimprov/log/kubernetes_perf_log.txt + touch /var/opt/microsoft/docker-cimprov/log/filter_cadvisor2mdm.log chmod 666 /var/opt/microsoft/docker-cimprov/log/filter_cadvisor2mdm.log -chown omsagent:omiusers /var/opt/microsoft/docker-cimprov/log/filter_cadvisor2mdm.log + touch /var/opt/microsoft/docker-cimprov/log/filter_telegraf2mdm.log chmod 666 /var/opt/microsoft/docker-cimprov/log/filter_telegraf2mdm.log -chown omsagent:omiusers /var/opt/microsoft/docker-cimprov/log/filter_telegraf2mdm.log + touch /var/opt/microsoft/docker-cimprov/log/filter_inventory2mdm.log chmod 666 /var/opt/microsoft/docker-cimprov/log/filter_inventory2mdm.log -chown omsagent:omiusers /var/opt/microsoft/docker-cimprov/log/filter_inventory2mdm.log + touch /var/opt/microsoft/docker-cimprov/log/mdm_metrics_generator.log chmod 666 /var/opt/microsoft/docker-cimprov/log/mdm_metrics_generator.log -chown omsagent:omiusers /var/opt/microsoft/docker-cimprov/log/mdm_metrics_generator.log + touch /var/opt/microsoft/docker-cimprov/log/health_monitors.log chmod 666 /var/opt/microsoft/docker-cimprov/log/health_monitors.log -chown omsagent:omiusers /var/opt/microsoft/docker-cimprov/log/health_monitors.log + touch /var/opt/microsoft/docker-cimprov/log/filter_health_model_builder.log chmod 666 /var/opt/microsoft/docker-cimprov/log/filter_health_model_builder.log -chown omsagent:omiusers /var/opt/microsoft/docker-cimprov/log/filter_health_model_builder.log + touch /var/opt/microsoft/docker-cimprov/log/fluent_forward_failed.log chmod 666 /var/opt/microsoft/docker-cimprov/log/fluent_forward_failed.log -chown omsagent:omiusers /var/opt/microsoft/docker-cimprov/log/fluent_forward_failed.log + touch /var/opt/microsoft/docker-cimprov/log/arc_k8s_cluster_identity.log chmod 666 /var/opt/microsoft/docker-cimprov/log/arc_k8s_cluster_identity.log -chown omsagent:omiusers /var/opt/microsoft/docker-cimprov/log/arc_k8s_cluster_identity.log -mv /etc/opt/microsoft/docker-cimprov/container.conf /etc/opt/microsoft/omsagent/sysconf/omsagent.d/container.conf -chown omsagent:omsagent /etc/opt/microsoft/omsagent/sysconf/omsagent.d/container.conf + +touch /var/opt/microsoft/docker-cimprov/log/fluentd.log +chmod 666 /var/opt/microsoft/docker-cimprov/log/fluentd.log + 
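Note: each of the converted outputs in the kube.conf hunks above follows one recurring shape: the old out_oms upload plugin is replaced by a forward output that streams to the local mdsd listener while keeping a file buffer under /var/opt/microsoft/docker-cimprov/state/, the same directory provisioned by this installer section. A minimal sketch of that block in standard fluentd v1 syntax, with an illustrative match tag and buffer path (the real tag patterns and buffer file names vary per data type), looks like:

    <match oms.containerinsights.example>     # match tag is illustrative only
      @type forward                           # stream chunks to mdsd instead of uploading via out_oms
      @log_level debug
      send_timeout 30
      connect_timeout 30
      heartbeat_type none                     # no heartbeat probes against the mdsd socket
      <server>
        host 0.0.0.0
        port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}"   # 29230, exported later in kubernetes/linux/main.sh
      </server>
      <buffer>
        @type file                            # file buffer survives fluentd restarts
        path /var/opt/microsoft/docker-cimprov/state/example*.buffer
        overflow_action drop_oldest_chunk     # favor fresh telemetry over stale backlog
        chunk_limit_size 4m
        queue_limit_length 20
        flush_interval 20s
        retry_max_times 10
        retry_wait 5s
        retry_max_interval 5m
        flush_thread_count 5
      </buffer>
      keepalive true
    </match>

Only the match tag, buffer path, and flush_thread_count differ between the kubeservices, kubenodeinventory, containernodeinventory, containerinventory, perf, and kubehealth blocks above.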
%Postuninstall_10 # If we're an upgrade, skip all of this cleanup @@ -299,7 +294,6 @@ if ${{PERFORMING_UPGRADE_NOT}}; then rm -f /var/opt/microsoft/docker-cimprov/state/KubeLogQueryState.yaml rm -f /var/opt/microsoft/docker-cimprov/log/kubernetes_client_log.txt rm -f /var/opt/microsoft/docker-cimprov/log/kubernetes_perf_log.txt - rm -f /etc/opt/microsoft/omsagent/conf/omsagent.d/container.conf rmdir /var/opt/microsoft/docker-cimprov/log 2> /dev/null rmdir /var/opt/microsoft/docker-cimprov/state/ContainerInventory 2> /dev/null rmdir /var/opt/microsoft/docker-cimprov/state/ImageInventory 2> /dev/null @@ -308,14 +302,7 @@ if ${{PERFORMING_UPGRADE_NOT}}; then rmdir /etc/opt/microsoft/docker-cimprov/conf 2> /dev/null rmdir /etc/opt/microsoft/docker-cimprov 2> /dev/null rmdir /etc/opt/microsoft 2> /dev/null - rmdir /etc/opt 2> /dev/null - #Remove sudoers file edit - if [ -s /etc/sudoers.d/omsagent ] - then - chmod +w /etc/sudoers.d/omsagent - sed -i '/docker\-provider/,+1 d' /etc/sudoers.d/omsagent - chmod 440 /etc/sudoers.d/omsagent - fi + rmdir /etc/opt 2> /dev/null fi %Preinstall_0
diff --git a/build/linux/installer/datafiles/linux.data index 604394d80..48af63a73 100644 --- a/build/linux/installer/datafiles/linux.data +++ b/build/linux/installer/datafiles/linux.data @@ -1,16 +1,11 @@ %Variables PF: 'Linux' -OMI_SERVICE: '/opt/omi/bin/service_control' -OMS_SERVICE: '/opt/microsoft/omsagent/bin/service_control' + %Postinstall_2000 -# Reload the OMI server -${{OMI_SERVICE}} reload -${{OMS_SERVICE}} reload -if ${{PERFORMING_UPGRADE_NOT}}; then - /opt/omi/bin/omicli ei root/cimv2 Container_HostInventory -fi + + %Postuninstall_1000 # Calling sequence for RPM pre/post scripts, during upgrade, is as follows: @@ -35,10 +30,5 @@ if ${{PERFORMING_UPGRADE_NOT}}; then fi %Postuninstall_1100 -# If we're called for upgrade, don't do anything -if ${{PERFORMING_UPGRADE_NOT}}; then - # Reload the OMI server - ${{OMI_SERVICE}} reload - ${{OMS_SERVICE}} reload -fi +
diff --git a/build/linux/installer/datafiles/linux_dpkg.data index a7821642d..bdf9f2354 100644 --- a/build/linux/installer/datafiles/linux_dpkg.data +++ b/build/linux/installer/datafiles/linux_dpkg.data @@ -3,5 +3,5 @@ PERFORMING_UPGRADE_NOT: '[ "$1" != "upgrade" ]' PACKAGE_TYPE: 'DPKG' %Dependencies -omi (>= 1.0.8.6) +
diff --git a/build/linux/installer/datafiles/linux_rpm.data index 1b9ba009b..d537b444d 100644 --- a/build/linux/installer/datafiles/linux_rpm.data +++ b/build/linux/installer/datafiles/linux_rpm.data @@ -3,5 +3,5 @@ PERFORMING_UPGRADE_NOT: '[ "$1" -ne 1 ]' PACKAGE_TYPE: 'RPM' %Dependencies -omi >= 1.0.8-6 +
diff --git a/build/linux/installer/scripts/livenessprobe.sh index 5e1261e7e..252f471e9 100644 --- a/build/linux/installer/scripts/livenessprobe.sh +++ b/build/linux/installer/scripts/livenessprobe.sh @@ -1,19 +1,21 @@ #!/bin/bash -#test to exit non zero value if omsagent is not running -(ps -ef | grep omsagent- | grep -v "grep") +#test to exit with a non-zero value if mdsd is not running +(ps -ef | grep "mdsd" | grep -v "grep") if [ $? -ne 0 ] then - echo " omsagent is not running" > /dev/termination-log - exit 1 + echo "mdsd is not running" > /dev/termination-log + exit 1 fi - -#optionally test to exit non zero value if oneagent is not running -if [ -e "/opt/AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE_V2" ]; then - (ps -ef | grep "mdsd" | grep -v "grep") + +#optionally test to exit with a non-zero value if fluentd is not running +#fluentd is not used in the sidecar container +if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then + (ps -ef | grep "fluentd" | grep -v "grep") if [ $? -ne 0 ] then - echo "oneagent is not running" > /dev/termination-log + echo "fluentd is not running" > /dev/termination-log exit 1 fi fi
diff --git a/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb index 5ce5d79d2..dcf179bf2 100644 --- a/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb +++ b/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb @@ -3,7 +3,7 @@ require_relative "tomlrb" require_relative "ConfigParseErrorLogger" -require_relative "microsoft/omsagent/plugin/constants" +require_relative "/etc/fluent/plugin/constants" @configMapMountPath = "/etc/config/settings/alertable-metrics-configuration-settings" @configVersion = ""
diff --git a/build/linux/installer/scripts/tomlparser-metric-collection-config.rb index 40d87b7f1..cee41312b 100644 --- a/build/linux/installer/scripts/tomlparser-metric-collection-config.rb +++ b/build/linux/installer/scripts/tomlparser-metric-collection-config.rb @@ -3,7 +3,7 @@ require_relative "tomlrb" require_relative "ConfigParseErrorLogger" -require_relative "microsoft/omsagent/plugin/constants" +require_relative "/etc/fluent/plugin/constants" @configMapMountPath = "/etc/config/settings/metric_collection_settings" @configVersion = ""
diff --git a/kubernetes/linux/envmdsd index 3f834bfb8..5a939fc3e 100644 --- a/kubernetes/linux/envmdsd +++ b/kubernetes/linux/envmdsd @@ -2,8 +2,6 @@ export MDSD_ROLE_PREFIX="/var/run/mdsd/default" #export MDSD_OPTIONS="-d -A -r ${MDSD_ROLE_PREFIX}" export MDSD_LOG="/var/opt/microsoft/linuxmonagent/log" export MDSD_SPOOL_DIRECTORY="/var/opt/microsoft/linuxmonagent" -export OMS_CERT_PATH="/etc/opt/microsoft/omsagent/certs/oms.crt" -export OMS_CERT_KEY_PATH="/etc/opt/microsoft/omsagent/certs/oms.key" #export CIWORKSPACE_id="" #export CIWORKSPACE_key="" export MDSD_OPTIONS="-A -c /etc/mdsd.d/mdsd.xml -r ${MDSD_ROLE_PREFIX} -S ${MDSD_SPOOL_DIRECTORY}/eh -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos"
diff --git a/kubernetes/linux/main.sh index c7d939034..b21ed6b96 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -38,41 +38,9 @@ waitforlisteneronTCPport() { fi } -if [ -e "/etc/config/kube.conf" ]; then - cat /etc/config/kube.conf > /etc/opt/microsoft/omsagent/sysconf/omsagent.d/container.conf -elif [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then - echo "setting omsagent conf file for prometheus sidecar" - cat /etc/opt/microsoft/docker-cimprov/prometheus-side-car.conf > /etc/opt/microsoft/omsagent/sysconf/omsagent.d/container.conf - # omsadmin.sh replaces %MONITOR_AGENT_PORT% and %SYSLOG_PORT% in the monitor.conf and syslog.conf with default ports 25324 and 25224.
- # Since we are running 2 omsagents in the same pod, we need to use a different port for the sidecar, - else we will see the Address already in use - bind(2) for 0.0.0.0:253(2)24 error. - # Look into omsadmin.sh scripts's configure_monitor_agent()/configure_syslog() and find_available_port() methods for more info. - sed -i -e 's/port %MONITOR_AGENT_PORT%/port 25326/g' /etc/opt/microsoft/omsagent/sysconf/omsagent.d/monitor.conf - sed -i -e 's/port %SYSLOG_PORT%/port 25226/g' /etc/opt/microsoft/omsagent/sysconf/omsagent.d/syslog.conf -else - echo "setting omsagent conf file for daemonset" - sed -i -e 's/bind 127.0.0.1/bind 0.0.0.0/g' /etc/opt/microsoft/omsagent/sysconf/omsagent.d/container.conf -fi -sed -i -e 's/bind 127.0.0.1/bind 0.0.0.0/g' /etc/opt/microsoft/omsagent/sysconf/omsagent.d/syslog.conf -sed -i -e 's/^exit 101$/exit 0/g' /usr/sbin/policy-rc.d - -#Using the get_hostname for hostname instead of the host field in syslog messages -sed -i.bak "s/record\[\"Host\"\] = hostname/record\[\"Host\"\] = OMS::Common.get_hostname/" /opt/microsoft/omsagent/plugin/filter_syslog.rb - #using /var/opt/microsoft/docker-cimprov/state instead of /var/opt/microsoft/omsagent/state since the latter gets deleted during onboarding mkdir -p /var/opt/microsoft/docker-cimprov/state -#if [ ! -e "/etc/config/kube.conf" ]; then - # add permissions for omsagent user to access docker.sock #sudo setfacl -m user:omsagent:rw /var/run/host/docker.sock #fi - -# add permissions for omsagent user to access azure.json. -sudo setfacl -m user:omsagent:r /etc/kubernetes/host/azure.json - -# add permission for omsagent user to log folder. We also need 'x', else log rotation is failing. TODO: Investigate why. -sudo setfacl -m user:omsagent:rwx /var/opt/microsoft/docker-cimprov/log - #Run inotify as a daemon to track changes to the mounted configmap. inotifywait /etc/config/settings --daemon --recursive --outfile "/opt/inotifyoutput.txt" --event create,delete --format '%e : %T' --timefmt '+%s' @@ -89,7 +57,7 @@ else export customResourceId=$AKS_RESOURCE_ID echo "export customResourceId=$AKS_RESOURCE_ID" >> ~/.bashrc source ~/.bashrc - echo "customResourceId:$customResourceId" + echo "customResourceId:$customResourceId" fi #set agent config schema version @@ -141,7 +109,6 @@ if [[ ( ( ! -e "/etc/config/kube.conf" ) && ( "${CONTAINER_TYPE}" == "Prometheus fi export PROXY_ENDPOINT="" - # Check for internet connectivity or workspace deletion if [ -e "/etc/omsagent-secret/WSID" ]; then workspaceId=$(cat /etc/omsagent-secret/WSID) @@ -222,6 +189,7 @@ else echo "LA Onboarding:Workspace Id not mounted, skipping the telemetry check" fi + # Set environment variable for if public cloud by checking the workspace domain. if [ -z $domain ]; then ClOUD_ENVIRONMENT="unknown" @@ -233,6 +201,12 @@ fi export CLOUD_ENVIRONMENT=$CLOUD_ENVIRONMENT echo "export CLOUD_ENVIRONMENT=$CLOUD_ENVIRONMENT" >> ~/.bashrc +#consistent naming conventions with the windows agent +export DOMAIN=$domain +echo "export DOMAIN=$DOMAIN" >> ~/.bashrc +export WSID=$workspaceId +echo "export WSID=$WSID" >> ~/.bashrc + # Check if the instrumentation key needs to be fetched from a storage account (as in airgapped clouds) if [ ${#APPLICATIONINSIGHTS_AUTH_URL} -ge 1 ]; then # (check if APPLICATIONINSIGHTS_AUTH_URL has length >=1) for BACKOFF in {1..4}; do @@ -267,7 +241,7 @@ source ~/.bashrc if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then #Parse the configmap to set the right environment variables.
- /opt/microsoft/omsagent/ruby/bin/ruby tomlparser.rb + /usr/bin/ruby2.6 tomlparser.rb cat config_env_var | while read line; do echo $line >> ~/.bashrc @@ -278,7 +252,7 @@ fi #Parse the configmap to set the right environment variables for agent config. #Note > tomlparser-agent-config.rb has to be parsed first before td-agent-bit-conf-customizer.rb for fbit agent settings if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then - /opt/microsoft/omsagent/ruby/bin/ruby tomlparser-agent-config.rb + /usr/bin/ruby2.6 tomlparser-agent-config.rb cat agent_config_env_var | while read line; do #echo $line @@ -287,7 +261,7 @@ if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then source agent_config_env_var #Parse the configmap to set the right environment variables for network policy manager (npm) integration. - /opt/microsoft/omsagent/ruby/bin/ruby tomlparser-npm-config.rb + /usr/bin/ruby2.6 tomlparser-npm-config.rb cat integration_npm_config_env_var | while read line; do #echo $line @@ -298,11 +272,11 @@ fi #Replace the placeholders in td-agent-bit.conf file for fluentbit with custom/default values in daemonset if [ ! -e "/etc/config/kube.conf" ] && [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then - /opt/microsoft/omsagent/ruby/bin/ruby td-agent-bit-conf-customizer.rb + /usr/bin/ruby2.6 td-agent-bit-conf-customizer.rb fi #Parse the prometheus configmap to create a file with new custom settings. -/opt/microsoft/omsagent/ruby/bin/ruby tomlparser-prom-customconfig.rb +/usr/bin/ruby2.6 tomlparser-prom-customconfig.rb #Setting default environment variables to be used in any case of failure in the above steps if [ ! -e "/etc/config/kube.conf" ]; then @@ -335,7 +309,7 @@ fi #Parse the configmap to set the right environment variables for MDM metrics configuration for Alerting. if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then - /opt/microsoft/omsagent/ruby/bin/ruby tomlparser-mdm-metrics-config.rb + /usr/bin/ruby2.6 tomlparser-mdm-metrics-config.rb cat config_mdm_metrics_env_var | while read line; do echo $line >> ~/.bashrc @@ -343,7 +317,7 @@ if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then source config_mdm_metrics_env_var #Parse the configmap to set the right environment variables for metric collection settings - /opt/microsoft/omsagent/ruby/bin/ruby tomlparser-metric-collection-config.rb + /usr/bin/ruby2.6 tomlparser-metric-collection-config.rb cat config_metric_collection_env_var | while read line; do echo $line >> ~/.bashrc @@ -354,7 +328,7 @@ fi # OSM scraping to be done in replicaset if sidecar car scraping is disabled and always do the scraping from the sidecar (It will always be either one of the two) if [[ ( ( ! 
-e "/etc/config/kube.conf" ) && ( "${CONTAINER_TYPE}" == "PrometheusSidecar" ) ) || ( ( -e "/etc/config/kube.conf" ) && ( "${SIDECAR_SCRAPING_ENABLED}" == "false" ) ) ]]; then - /opt/microsoft/omsagent/ruby/bin/ruby tomlparser-osm-config.rb + /usr/bin/ruby2.6 tomlparser-osm-config.rb if [ -e "integration_osm_config_env_var" ]; then cat integration_osm_config_env_var | while read line; do @@ -432,26 +406,11 @@ export KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC="kubelet_docker_operations_error if [ "$CONTAINER_RUNTIME" != "docker" ]; then # these metrics are avialble only on k8s versions <1.18 and will get deprecated from 1.18 export KUBELET_RUNTIME_OPERATIONS_METRIC="kubelet_runtime_operations" - export KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC="kubelet_runtime_operations_errors" -else - #if container run time is docker then add omsagent user to local docker group to get access to docker.sock - # docker.sock only use for the telemetry to get the docker version - DOCKER_SOCKET=/var/run/host/docker.sock - DOCKER_GROUP=docker - REGULAR_USER=omsagent - if [ -S ${DOCKER_SOCKET} ]; then - echo "getting gid for docker.sock" - DOCKER_GID=$(stat -c '%g' ${DOCKER_SOCKET}) - echo "creating a local docker group" - groupadd -for -g ${DOCKER_GID} ${DOCKER_GROUP} - echo "adding omsagent user to local docker group" - usermod -aG ${DOCKER_GROUP} ${REGULAR_USER} - fi + export KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC="kubelet_runtime_operations_errors" fi echo "set caps for ruby process to read container env from proc" -sudo setcap cap_sys_ptrace,cap_dac_read_search+ep /opt/microsoft/omsagent/ruby/bin/ruby - +sudo setcap cap_sys_ptrace,cap_dac_read_search+ep /usr/bin/ruby2.6 echo "export KUBELET_RUNTIME_OPERATIONS_METRIC="$KUBELET_RUNTIME_OPERATIONS_METRIC >> ~/.bashrc echo "export KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC="$KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC >> ~/.bashrc @@ -461,171 +420,70 @@ echo $NODE_NAME > /var/opt/microsoft/docker-cimprov/state/containerhostname #check if file was written successfully. cat /var/opt/microsoft/docker-cimprov/state/containerhostname - -#Commenting it for test. 
We do this in the installer now -#Setup sudo permission for containerlogtailfilereader -#chmod +w /etc/sudoers.d/omsagent -#echo "#run containerlogtailfilereader.rb for docker-provider" >> /etc/sudoers.d/omsagent -#echo "omsagent ALL=(ALL) NOPASSWD: /opt/microsoft/omsagent/ruby/bin/ruby /opt/microsoft/omsagent/plugin/containerlogtailfilereader.rb *" >> /etc/sudoers.d/omsagent -#chmod 440 /etc/sudoers.d/omsagent - -#Disable dsc -#/opt/microsoft/omsconfig/Scripts/OMS_MetaConfigHelper.py --disable -rm -f /etc/opt/microsoft/omsagent/conf/omsagent.d/omsconfig.consistencyinvoker.conf - -CIWORKSPACE_id="" -CIWORKSPACE_key="" - -if [ -z $INT ]; then - if [ -a /etc/omsagent-secret/PROXY ]; then - if [ -a /etc/omsagent-secret/DOMAIN ]; then - /opt/microsoft/omsagent/bin/omsadmin.sh -w `cat /etc/omsagent-secret/WSID` -s `cat /etc/omsagent-secret/KEY` -d `cat /etc/omsagent-secret/DOMAIN` -p `cat /etc/omsagent-secret/PROXY` - else - /opt/microsoft/omsagent/bin/omsadmin.sh -w `cat /etc/omsagent-secret/WSID` -s `cat /etc/omsagent-secret/KEY` -p `cat /etc/omsagent-secret/PROXY` - fi - CIWORKSPACE_id="$(cat /etc/omsagent-secret/WSID)" - CIWORKSPACE_key="$(cat /etc/omsagent-secret/KEY)" - elif [ -a /etc/omsagent-secret/DOMAIN ]; then - /opt/microsoft/omsagent/bin/omsadmin.sh -w `cat /etc/omsagent-secret/WSID` -s `cat /etc/omsagent-secret/KEY` -d `cat /etc/omsagent-secret/DOMAIN` - CIWORKSPACE_id="$(cat /etc/omsagent-secret/WSID)" - CIWORKSPACE_key="$(cat /etc/omsagent-secret/KEY)" - elif [ -a /etc/omsagent-secret/WSID ]; then - /opt/microsoft/omsagent/bin/omsadmin.sh -w `cat /etc/omsagent-secret/WSID` -s `cat /etc/omsagent-secret/KEY` - CIWORKSPACE_id="$(cat /etc/omsagent-secret/WSID)" - CIWORKSPACE_key="$(cat /etc/omsagent-secret/KEY)" - elif [ -a /run/secrets/DOMAIN ]; then - /opt/microsoft/omsagent/bin/omsadmin.sh -w `cat /run/secrets/WSID` -s `cat /run/secrets/KEY` -d `cat /run/secrets/DOMAIN` - CIWORKSPACE_id="$(cat /run/secrets/WSID)" - CIWORKSPACE_key="$(cat /run/secrets/KEY)" - elif [ -a /run/secrets/WSID ]; then - /opt/microsoft/omsagent/bin/omsadmin.sh -w `cat /run/secrets/WSID` -s `cat /run/secrets/KEY` - CIWORKSPACE_id="$(cat /run/secrets/WSID)" - CIWORKSPACE_key="$(cat /run/secrets/KEY)" - elif [ -z $DOMAIN ]; then - /opt/microsoft/omsagent/bin/omsadmin.sh -w $WSID -s $KEY - CIWORKSPACE_id="$(cat /etc/omsagent-secret/WSID)" - CIWORKSPACE_key="$(cat /etc/omsagent-secret/KEY)" - else - /opt/microsoft/omsagent/bin/omsadmin.sh -w $WSID -s $KEY -d $DOMAIN - CIWORKSPACE_id="$WSID" - CIWORKSPACE_key="$KEY" - fi -else -#To onboard to INT workspace - workspace-id (WSID-not base64 encoded), workspace-key (KEY-not base64 encoded), Domain(DOMAIN-int2.microsoftatlanta-int.com) -#need to be added to omsagent.yaml. 
- echo WORKSPACE_ID=$WSID > /etc/omsagent-onboard.conf - echo SHARED_KEY=$KEY >> /etc/omsagent-onboard.conf - echo URL_TLD=$DOMAIN >> /etc/omsagent-onboard.conf - /opt/microsoft/omsagent/bin/omsadmin.sh - CIWORKSPACE_id="$WSID" - CIWORKSPACE_key="$KEY" -fi - #start cron daemon for logrotate service cron start +#get docker-provider versions -#check if agent onboarded successfully -/opt/microsoft/omsagent/bin/omsadmin.sh -l - -#get omsagent and docker-provider versions -dpkg -l | grep omsagent | awk '{print $2 " " $3}' dpkg -l | grep docker-cimprov | awk '{print $2 " " $3}' DOCKER_CIMPROV_VERSION=$(dpkg -l | grep docker-cimprov | awk '{print $3}') echo "DOCKER_CIMPROV_VERSION=$DOCKER_CIMPROV_VERSION" export DOCKER_CIMPROV_VERSION=$DOCKER_CIMPROV_VERSION echo "export DOCKER_CIMPROV_VERSION=$DOCKER_CIMPROV_VERSION" >> ~/.bashrc +echo "*** activating oneagent in legacy auth mode ***" +CIWORKSPACE_id="$(cat /etc/omsagent-secret/WSID)" +#use the file path as its secure than env +CIWORKSPACE_keyFile="/etc/omsagent-secret/KEY" +cat /etc/mdsd.d/envmdsd | while read line; do + echo $line >> ~/.bashrc +done +source /etc/mdsd.d/envmdsd +echo "setting mdsd workspaceid & key for workspace:$CIWORKSPACE_id" +export CIWORKSPACE_id=$CIWORKSPACE_id +echo "export CIWORKSPACE_id=$CIWORKSPACE_id" >> ~/.bashrc +export CIWORKSPACE_keyFile=$CIWORKSPACE_keyFile +echo "export CIWORKSPACE_keyFile=$CIWORKSPACE_keyFile" >> ~/.bashrc +export OMS_TLD=$domain +echo "export OMS_TLD=$OMS_TLD" >> ~/.bashrc +export MDSD_FLUENT_SOCKET_PORT="29230" +echo "export MDSD_FLUENT_SOCKET_PORT=$MDSD_FLUENT_SOCKET_PORT" >> ~/.bashrc + +#skip imds lookup since not used in legacy auth path +export SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH="true" +echo "export SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH=$SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH" >> ~/.bashrc -#region check to auto-activate oneagent, to route container logs, -#Intent is to activate one agent routing for all managed clusters with region in the regionllist, unless overridden by configmap -# AZMON_CONTAINER_LOGS_ROUTE will have route (if any) specified in the config map -# AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE will have the final route that we compute & set, based on our region list logic -echo "************start oneagent log routing checks************" -# by default, use configmap route for safer side -AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE=$AZMON_CONTAINER_LOGS_ROUTE - -#trim region list -oneagentregions="$(echo $AZMON_CONTAINERLOGS_ONEAGENT_REGIONS | xargs)" -#lowercase region list -typeset -l oneagentregions=$oneagentregions -echo "oneagent regions: $oneagentregions" -#trim current region -currentregion="$(echo $AKS_REGION | xargs)" -#lowercase current region -typeset -l currentregion=$currentregion -echo "current region: $currentregion" - -#initilze isoneagentregion as false -isoneagentregion=false - -#set isoneagentregion as true if matching region is found -if [ ! -z $oneagentregions ] && [ ! -z $currentregion ]; then - for rgn in $(echo $oneagentregions | sed "s/,/ /g"); do - if [ "$rgn" == "$currentregion" ]; then - isoneagentregion=true - echo "current region is in oneagent regions..." - break - fi - done -else - echo "current region is not in oneagent regions..." -fi +source ~/.bashrc -if [ "$isoneagentregion" = true ]; then - #if configmap has a routing for logs, but current region is in the oneagent region list, take the configmap route - if [ ! 
-z $AZMON_CONTAINER_LOGS_ROUTE ]; then - AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE=$AZMON_CONTAINER_LOGS_ROUTE - echo "oneagent region is true for current region:$currentregion and config map logs route is not empty. so using config map logs route as effective route:$AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE" - else #there is no configmap route, so route thru oneagent - AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE="v2" - echo "oneagent region is true for current region:$currentregion and config map logs route is empty. so using oneagent as effective route:$AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE" - fi -else - echo "oneagent region is false for current region:$currentregion" +dpkg -l | grep mdsd | awk '{print $2 " " $3}' + +if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then + echo "starting mdsd with mdsd-port=26130, fluentport=26230 and influxport=26330 in legacy auth mode in sidecar container..." + #use tenant name to avoid unix socket conflicts, and different ports to avoid port conflicts + #roleprefix to use container specific mdsd socket + export TENANT_NAME="${CONTAINER_TYPE}" + echo "export TENANT_NAME=$TENANT_NAME" >> ~/.bashrc + export MDSD_ROLE_PREFIX=/var/run/mdsd-${CONTAINER_TYPE}/default + echo "export MDSD_ROLE_PREFIX=$MDSD_ROLE_PREFIX" >> ~/.bashrc + source ~/.bashrc + mkdir /var/run/mdsd-${CONTAINER_TYPE} + # add -T 0xFFFF for full traces + mdsd -r ${MDSD_ROLE_PREFIX} -p 26130 -f 26230 -i 26330 -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos & +else + echo "starting mdsd in legacy auth mode in main container..." + # add -T 0xFFFF for full traces + mdsd -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos & fi - -#start oneagent -if [ ! -e "/etc/config/kube.conf" ] && [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then - if [ ! -z $AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE ]; then - echo "container logs configmap route is $AZMON_CONTAINER_LOGS_ROUTE" - echo "container logs effective route is $AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE" - #trim - containerlogsroute="$(echo $AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE | xargs)" - # convert to lowercase - typeset -l containerlogsroute=$containerlogsroute - - echo "setting AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE as :$containerlogsroute" - export AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE=$containerlogsroute - echo "export AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE=$containerlogsroute" >> ~/.bashrc - source ~/.bashrc - - if [ "$containerlogsroute" == "v2" ]; then - echo "activating oneagent..." - echo "configuring mdsd..." - cat /etc/mdsd.d/envmdsd | while read line; do - echo $line >> ~/.bashrc - done - source /etc/mdsd.d/envmdsd - - echo "setting mdsd workspaceid & key for workspace:$CIWORKSPACE_id" - export CIWORKSPACE_id=$CIWORKSPACE_id - echo "export CIWORKSPACE_id=$CIWORKSPACE_id" >> ~/.bashrc - export CIWORKSPACE_key=$CIWORKSPACE_key - echo "export CIWORKSPACE_key=$CIWORKSPACE_key" >> ~/.bashrc - - source ~/.bashrc - - dpkg -l | grep mdsd | awk '{print $2 " " $3}' - - echo "starting mdsd ..." - mdsd -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos & - - touch /opt/AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE_V2 - fi - fi -fi -echo "************end oneagent log routing checks************" +# no dependency on fluentd for prometheus sidecar container +if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then + if [ !
-e "/etc/config/kube.conf" ]; then + echo "*** starting fluentd v1 in daemonset" + fluentd -c /etc/fluent/container.conf -o /var/opt/microsoft/docker-cimprov/log/fluentd.log & + else + echo "*** starting fluentd v1 in replicaset" + fluentd -c /etc/fluent/kube.conf -o /var/opt/microsoft/docker-cimprov/log/fluentd.log & + fi +fi #If config parsing was successful, a copy of the conf file with replaced custom settings file is created if [ ! -e "/etc/config/kube.conf" ]; then @@ -749,12 +607,9 @@ dpkg -l | grep td-agent-bit | awk '{print $2 " " $3}' #dpkg -l | grep telegraf | awk '{print $2 " " $3}' - - # Write messages from the liveness probe to stdout (so telemetry picks it up) touch /dev/write-to-traces - echo "stopping rsyslog..." service rsyslog stop @@ -762,7 +617,7 @@ echo "getting rsyslog status..." service rsyslog status shutdown() { - /opt/microsoft/omsagent/bin/service_control stop + pkill -f mdsd } trap "shutdown" SIGTERM diff --git a/kubernetes/linux/mdsd.xml b/kubernetes/linux/mdsd.xml index 49d329791..de14240aa 100644 --- a/kubernetes/linux/mdsd.xml +++ b/kubernetes/linux/mdsd.xml @@ -47,6 +47,149 @@ Each column has a name, an augmented JSON source type, and a target MDS type. --> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -68,14 +211,33 @@ + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - ]]> @@ -143,11 +360,95 @@ - - ]]> + + + + + ]]> + + + + + + + + ]]> + + + + + + + + ]]> + + + + + + + + ]]> + + + + + + + + ]]> + + + + + + + + ]]> + + + + + + + + ]]> + + + + + + + + ]]> + + + + + + + + ]]> + + + + + + + ]]> + + + + + + + ]]> + + + diff --git a/kubernetes/linux/setup.sh b/kubernetes/linux/setup.sh index f065cc165..3d00e4c57 100644 --- a/kubernetes/linux/setup.sh +++ b/kubernetes/linux/setup.sh @@ -9,37 +9,13 @@ sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen && \ dpkg-reconfigure --frontend=noninteractive locales && \ update-locale LANG=en_US.UTF-8 -wget https://github.com/Microsoft/OMS-Agent-for-Linux/releases/download/OMSAgent_v1.10.0-1/omsagent-1.10.0-1.universal.x64.sh +#install oneagent - Official bits (05/17/2021) +wget https://github.com/microsoft/Docker-Provider/releases/download/05172021-oneagent/azure-mdsd_1.10.1-build.master.213_x86_64.deb -#create file to disable omi service startup script -touch /etc/.omi_disable_service_control - -chmod 775 $TMPDIR/*.sh - -#Extract omsbundle -$TMPDIR/omsagent-*.universal.x64.sh --extract -mv $TMPDIR/omsbundle* $TMPDIR/omsbundle -#Install omi -/usr/bin/dpkg -i $TMPDIR/omsbundle/110/omi*.deb - -#Install scx -/usr/bin/dpkg -i $TMPDIR/omsbundle/110/scx*.deb -#$TMPDIR/omsbundle/bundles/scx-1.6.*-*.universal.x64.sh --install - -#Install omsagent - -/usr/bin/dpkg -i $TMPDIR/omsbundle/110/omsagent*.deb -#/usr/bin/dpkg -i $TMPDIR/omsbundle/100/omsconfig*.deb - -#install oneagent - Official bits (05/2021) -wget https://github.com/microsoft/Docker-Provider/releases/download/05112021-oneagent/azure-mdsd_1.8.0-build.master.189_x86_64.deb /usr/bin/dpkg -i $TMPDIR/azure-mdsd*.deb cp -f $TMPDIR/mdsd.xml /etc/mdsd.d cp -f $TMPDIR/envmdsd /etc/mdsd.d -#Assign permissions to omsagent user to access docker.sock -sudo apt-get install acl 
- #download inotify tools for watching configmap changes sudo apt-get update sudo apt-get install inotify-tools -y @@ -49,18 +25,7 @@ sudo apt-get install inotify-tools -y sudo apt-get install jq=1.5+dfsg-2 -y #used to setcaps for ruby process to read /proc/env -echo "installing libcap2-bin" sudo apt-get install libcap2-bin -y -#/$TMPDIR/omsbundle/oss-kits/docker-cimprov-1.0.0-*.x86_64.sh --install -#Use downloaded docker-provider instead of the bundled one - -#download and install telegraf -#wget https://dl.influxdata.com/telegraf/releases/telegraf_1.10.1-1_amd64.deb -#sudo dpkg -i telegraf_1.10.1-1_amd64.deb - -#service telegraf stop - -#wget https://github.com/microsoft/Docker-Provider/releases/download/5.0.0.0/telegraf #1.18 pre-release wget https://dl.influxdata.com/telegraf/releases/telegraf-1.18.0_linux_amd64.tar.gz @@ -79,8 +44,17 @@ sudo echo "deb https://packages.fluentbit.io/ubuntu/xenial xenial main" >> /etc/ sudo apt-get update sudo apt-get install td-agent-bit=1.6.8 -y -rm -rf $TMPDIR/omsbundle -rm -f $TMPDIR/omsagent*.sh +# install ruby2.6 +sudo apt-get install software-properties-common -y +sudo apt-add-repository ppa:brightbox/ruby-ng -y +sudo apt-get update +sudo apt-get install ruby2.6 ruby2.6-dev gcc make -y +# fluentd v1 gem +gem install fluentd -v "1.12.2" --no-document +fluentd --setup ./fluent +gem install gyoku iso8601 --no-doc + + rm -f $TMPDIR/docker-cimprov*.sh rm -f $TMPDIR/azure-mdsd*.deb rm -f $TMPDIR/mdsd.xml diff --git a/source/plugins/go/src/oms.go b/source/plugins/go/src/oms.go index d35acad3d..25f364c55 100644 --- a/source/plugins/go/src/oms.go +++ b/source/plugins/go/src/oms.go @@ -92,15 +92,24 @@ const kubeMonAgentConfigEventFlushInterval = 60 //Eventsource name in mdsd const MdsdContainerLogSourceName = "ContainerLogSource" const MdsdContainerLogV2SourceName = "ContainerLogV2Source" +const MdsdKubeMonAgentEventsSourceName = "KubeMonAgentEventsSource" +const MdsdInsightsMetricsSourceName = "InsightsMetricsSource" -//container logs route (v2=flush to oneagent, adx= flush to adx ingestion, anything else flush to ODS[default]) +//container logs route (v2=flush to oneagent, adx= flush to adx ingestion, v1 for ODS Direct) const ContainerLogsV2Route = "v2" const ContainerLogsADXRoute = "adx" +//fallback option v1 route i.e. ODS direct if required in any case +const ContainerLogsV1Route = "v1" + //container logs schema (v2=ContainerLogsV2 table in LA, anything else ContainerLogs table in LA. 
This is applicable only if Container logs route is NOT ADX) const ContainerLogV2SchemaVersion = "v2" + +//env variable to container type +const ContainerTypeEnv = "CONTAINER_TYPE" + var ( // PluginConfiguration the plugins configuration PluginConfiguration map[string]string @@ -108,6 +117,10 @@ var ( HTTPClient http.Client // Client for MDSD msgp Unix socket MdsdMsgpUnixSocketClient net.Conn + // Client for MDSD msgp Unix socket for KubeMon Agent events + MdsdKubeMonMsgpUnixSocketClient net.Conn + // Client for MDSD msgp Unix socket for Insights Metrics + MdsdInsightsMetricsMsgpUnixSocketClient net.Conn // Ingestor for ADX ADXIngestor *ingest.Ingestion // OMSEndpoint ingestion endpoint @@ -116,6 +129,8 @@ var ( Computer string // WorkspaceID log analytics workspace id WorkspaceID string + // LogAnalyticsWorkspaceDomain log analytics workspace domain + LogAnalyticsWorkspaceDomain string // ResourceID for resource-centric log analytics data ResourceID string // Resource-centric flag (will be true if we determine if above ResourceID is non-empty - default is false) @@ -143,7 +158,17 @@ var ( // ADX tenantID AdxTenantID string //ADX client secret - AdxClientSecret string + AdxClientSecret string + // container log or container log v2 tag name for oneagent route + MdsdContainerLogTagName string + // kubemonagent events tag name for oneagent route + MdsdKubeMonAgentEventsTagName string + // InsightsMetrics tag name for oneagent route + MdsdInsightsMetricsTagName string + // flag to check if it's Windows OS + IsWindows bool + // container type + ContainerType string ) var ( @@ -314,6 +339,15 @@ const ( PromScrapingError ) +// DataType to be used as enum per data type socket client creation +type DataType int +const ( + // DataType to be used as enum per data type socket client creation + ContainerLogV2 DataType = iota + KubeMonAgentEvents + InsightsMetrics +) + func createLogger() *log.Logger { var logfile *os.File @@ -532,6 +566,7 @@ func flushKubeMonAgentEventRecords() { start := time.Now() var elapsed time.Duration var laKubeMonAgentEventsRecords []laKubeMonAgentEvents + var msgPackEntries []MsgPackEntry telemetryDimensions := make(map[string]string) telemetryDimensions["ConfigErrorEventCount"] = strconv.Itoa(len(ConfigErrorEvent)) @@ -558,7 +593,25 @@ func flushKubeMonAgentEventRecords() { Message: k, Tags: fmt.Sprintf("%s", tagJson), } - laKubeMonAgentEventsRecords = append(laKubeMonAgentEventsRecords, laKubeMonAgentEventsRecord) + laKubeMonAgentEventsRecords = append(laKubeMonAgentEventsRecords, laKubeMonAgentEventsRecord) + var stringMap map[string]string + jsonBytes, err := json.Marshal(&laKubeMonAgentEventsRecord) + if err != nil { + message := fmt.Sprintf("Error while Marshalling laKubeMonAgentEventsRecord to json bytes: %s", err.Error()) + Log(message) + SendException(message) + } else { + if err := json.Unmarshal(jsonBytes, &stringMap); err != nil { + message := fmt.Sprintf("Error while UnMarshalling json bytes to stringmap: %s", err.Error()) + Log(message) + SendException(message) + } else { + msgPackEntry := MsgPackEntry{ + Record: stringMap, + } + msgPackEntries = append(msgPackEntries, msgPackEntry) + } + } } } @@ -579,7 +632,25 @@ func flushKubeMonAgentEventRecords() { Message: k, Tags: fmt.Sprintf("%s", tagJson), } - laKubeMonAgentEventsRecords = append(laKubeMonAgentEventsRecords, laKubeMonAgentEventsRecord) + laKubeMonAgentEventsRecords = append(laKubeMonAgentEventsRecords, laKubeMonAgentEventsRecord) + var stringMap map[string]string + jsonBytes, err := 
json.Marshal(&laKubeMonAgentEventsRecord) + if err != nil { + message := fmt.Sprintf("Error while Marshalling laKubeMonAgentEventsRecord to json bytes: %s", err.Error()) + Log(message) + SendException(message) + } else { + if err := json.Unmarshal(jsonBytes, &stringMap); err != nil { + message := fmt.Sprintf("Error while UnMarshalling json bytes to stringmap: %s", err.Error()) + Log(message) + SendException(message) + } else { + msgPackEntry := MsgPackEntry{ + Record: stringMap, + } + msgPackEntries = append(msgPackEntries, msgPackEntry) + } + } } } @@ -610,11 +681,63 @@ func flushKubeMonAgentEventRecords() { Message: "No errors", Tags: fmt.Sprintf("%s", tagJson), } - laKubeMonAgentEventsRecords = append(laKubeMonAgentEventsRecords, laKubeMonAgentEventsRecord) + laKubeMonAgentEventsRecords = append(laKubeMonAgentEventsRecords, laKubeMonAgentEventsRecord) + var stringMap map[string]string + jsonBytes, err := json.Marshal(&laKubeMonAgentEventsRecord) + if err != nil { + message := fmt.Sprintf("Error while Marshalling laKubeMonAgentEventsRecord to json bytes: %s", err.Error()) + Log(message) + SendException(message) + } else { + if err := json.Unmarshal(jsonBytes, &stringMap); err != nil { + message := fmt.Sprintf("Error while UnMarshalling json bytes to stringmap: %s", err.Error()) + Log(message) + SendException(message) + } else { + msgPackEntry := MsgPackEntry{ + Record: stringMap, + } + msgPackEntries = append(msgPackEntries, msgPackEntry) + } + } } } - - if len(laKubeMonAgentEventsRecords) > 0 { + if (IsWindows == false && len(msgPackEntries) > 0) { //for linux, mdsd route + Log("Info::mdsd:: using mdsdsource name for KubeMonAgentEvents: %s", MdsdKubeMonAgentEventsTagName) + msgpBytes := convertMsgPackEntriesToMsgpBytes(MdsdKubeMonAgentEventsTagName, msgPackEntries) + if MdsdKubeMonMsgpUnixSocketClient == nil { + Log("Error::mdsd::mdsd connection for KubeMonAgentEvents does not exist. re-connecting ...") + CreateMDSDClient(KubeMonAgentEvents, ContainerType) + if MdsdKubeMonMsgpUnixSocketClient == nil { + Log("Error::mdsd::Unable to create mdsd client for KubeMonAgentEvents. Please check error log.") + ContainerLogTelemetryMutex.Lock() + defer ContainerLogTelemetryMutex.Unlock() + KubeMonEventsMDSDClientCreateErrors += 1 + } + } + if MdsdKubeMonMsgpUnixSocketClient != nil { + deadline := 10 * time.Second + MdsdKubeMonMsgpUnixSocketClient.SetWriteDeadline(time.Now().Add(deadline)) //this is based off clock time, so cannot reuse + bts, er := MdsdKubeMonMsgpUnixSocketClient.Write(msgpBytes) + elapsed = time.Since(start) + if er != nil { + message := fmt.Sprintf("Error::mdsd::Failed to write to kubemonagent mdsd %d records after %s. Will retry ... error : %s", len(msgPackEntries), elapsed, er.Error()) + Log(message) + if MdsdKubeMonMsgpUnixSocketClient != nil { + MdsdKubeMonMsgpUnixSocketClient.Close() + MdsdKubeMonMsgpUnixSocketClient = nil + } + SendException(message) + } else { + numRecords := len(msgPackEntries) + Log("FlushKubeMonAgentEventRecords::Info::Successfully flushed %d records that was %d bytes in %s", numRecords, bts, elapsed) + // Send telemetry to AppInsights resource + SendEvent(KubeMonAgentEventsFlushedEvent, telemetryDimensions) + } + } else { + Log("Error::mdsd::Unable to create mdsd client for KubeMonAgentEvents. 
Please check error log.") + } + } else if len(laKubeMonAgentEventsRecords) > 0 { //for windows, ODS direct kubeMonAgentEventEntry := KubeMonAgentEventBlob{ DataType: KubeMonAgentEventDataType, IPName: IPName, @@ -746,70 +869,144 @@ func PostTelegrafMetricsToLA(telegrafRecords []map[interface{}]interface{}) int message := fmt.Sprintf("PostTelegrafMetricsToLA::Info:derived %v metrics from %v timeseries", len(laMetrics), len(telegrafRecords)) Log(message) } + + if IsWindows == false { //for linux, mdsd route + var msgPackEntries []MsgPackEntry + var i int + start := time.Now() + var elapsed time.Duration + + for i = 0; i < len(laMetrics); i++ { + var interfaceMap map[string]interface{} + stringMap := make(map[string]string) + jsonBytes, err := json.Marshal(*laMetrics[i]) + if err != nil { + message := fmt.Sprintf("PostTelegrafMetricsToLA::Error:when marshalling json %q", err) + Log(message) + SendException(message) + return output.FLB_OK + } else { + if err := json.Unmarshal(jsonBytes, &interfaceMap); err != nil { + message := fmt.Sprintf("Error while UnMarshalling json bytes to interfaceMap: %s", err.Error()) + Log(message) + SendException(message) + return output.FLB_OK + } else { + for key, value := range interfaceMap { + strKey := fmt.Sprintf("%v", key) + strValue := fmt.Sprintf("%v", value) + stringMap[strKey] = strValue + } + msgPackEntry := MsgPackEntry{ + Record: stringMap, + } + msgPackEntries = append(msgPackEntries, msgPackEntry) + } + } + } + if (len(msgPackEntries) > 0) { + msgpBytes := convertMsgPackEntriesToMsgpBytes(MdsdInsightsMetricsTagName, msgPackEntries) + if MdsdInsightsMetricsMsgpUnixSocketClient == nil { + Log("Error::mdsd::mdsd connection does not exist. re-connecting ...") + CreateMDSDClient(InsightsMetrics, ContainerType) + if MdsdInsightsMetricsMsgpUnixSocketClient == nil { + Log("Error::mdsd::Unable to create mdsd client for insights metrics. Please check error log.") + ContainerLogTelemetryMutex.Lock() + defer ContainerLogTelemetryMutex.Unlock() + InsightsMetricsMDSDClientCreateErrors += 1 + return output.FLB_RETRY + } + } - var metrics []laTelegrafMetric - var i int + deadline := 10 * time.Second + MdsdInsightsMetricsMsgpUnixSocketClient.SetWriteDeadline(time.Now().Add(deadline)) //this is based of clock time, so cannot reuse + bts, er := MdsdInsightsMetricsMsgpUnixSocketClient.Write(msgpBytes) - for i = 0; i < len(laMetrics); i++ { - metrics = append(metrics, *laMetrics[i]) - } + elapsed = time.Since(start) - laTelegrafMetrics := InsightsMetricsBlob{ - DataType: InsightsMetricsDataType, - IPName: IPName, - DataItems: metrics} + if er != nil { + Log("Error::mdsd::Failed to write to mdsd %d records after %s. Will retry ... 
error : %s", len(msgPackEntries), elapsed, er.Error()) + if MdsdInsightsMetricsMsgpUnixSocketClient != nil { + MdsdInsightsMetricsMsgpUnixSocketClient.Close() + MdsdInsightsMetricsMsgpUnixSocketClient = nil + } - jsonBytes, err := json.Marshal(laTelegrafMetrics) + ContainerLogTelemetryMutex.Lock() + defer ContainerLogTelemetryMutex.Unlock() + InsightsMetricsMDSDClientCreateErrors += 1 + return output.FLB_RETRY + } else { + numTelegrafMetricsRecords := len(msgPackEntries) + Log("Success::mdsd::Successfully flushed %d telegraf metrics records that was %d bytes to mdsd in %s ", numTelegrafMetricsRecords, bts, elapsed) + } + } + + } else { // for windows, ODS direct - if err != nil { - message := fmt.Sprintf("PostTelegrafMetricsToLA::Error:when marshalling json %q", err) - Log(message) - SendException(message) - return output.FLB_OK - } + var metrics []laTelegrafMetric + var i int - //Post metrics data to LA - req, _ := http.NewRequest("POST", OMSEndpoint, bytes.NewBuffer(jsonBytes)) + for i = 0; i < len(laMetrics); i++ { + metrics = append(metrics, *laMetrics[i]) + } - //req.URL.Query().Add("api-version","2016-04-01") + laTelegrafMetrics := InsightsMetricsBlob{ + DataType: InsightsMetricsDataType, + IPName: IPName, + DataItems: metrics} - //set headers - req.Header.Set("x-ms-date", time.Now().Format(time.RFC3339)) - req.Header.Set("User-Agent", userAgent) - reqID := uuid.New().String() - req.Header.Set("X-Request-ID", reqID) + jsonBytes, err := json.Marshal(laTelegrafMetrics) - //expensive to do string len for every request, so use a flag - if ResourceCentric == true { - req.Header.Set("x-ms-AzureResourceId", ResourceID) - } + if err != nil { + message := fmt.Sprintf("PostTelegrafMetricsToLA::Error:when marshalling json %q", err) + Log(message) + SendException(message) + return output.FLB_OK + } - start := time.Now() - resp, err := HTTPClient.Do(req) - elapsed := time.Since(start) + //Post metrics data to LA + req, _ := http.NewRequest("POST", OMSEndpoint, bytes.NewBuffer(jsonBytes)) - if err != nil { - message := fmt.Sprintf("PostTelegrafMetricsToLA::Error:(retriable) when sending %v metrics. duration:%v err:%q \n", len(laMetrics), elapsed, err.Error()) - Log(message) - UpdateNumTelegrafMetricsSentTelemetry(0, 1, 0) - return output.FLB_RETRY - } + //req.URL.Query().Add("api-version","2016-04-01") - if resp == nil || resp.StatusCode != 200 { - if resp != nil { - Log("PostTelegrafMetricsToLA::Error:(retriable) RequestID %s Response Status %v Status Code %v", reqID, resp.Status, resp.StatusCode) + //set headers + req.Header.Set("x-ms-date", time.Now().Format(time.RFC3339)) + req.Header.Set("User-Agent", userAgent) + reqID := uuid.New().String() + req.Header.Set("X-Request-ID", reqID) + + //expensive to do string len for every request, so use a flag + if ResourceCentric == true { + req.Header.Set("x-ms-AzureResourceId", ResourceID) } - if resp != nil && resp.StatusCode == 429 { - UpdateNumTelegrafMetricsSentTelemetry(0, 1, 1) + + start := time.Now() + resp, err := HTTPClient.Do(req) + elapsed := time.Since(start) + + if err != nil { + message := fmt.Sprintf("PostTelegrafMetricsToLA::Error:(retriable) when sending %v metrics. 
duration:%v err:%q \n", len(laMetrics), elapsed, err.Error()) + Log(message) + UpdateNumTelegrafMetricsSentTelemetry(0, 1, 0) + return output.FLB_RETRY + } + + if resp == nil || resp.StatusCode != 200 { + if resp != nil { + Log("PostTelegrafMetricsToLA::Error:(retriable) RequestID %s Response Status %v Status Code %v", reqID, resp.Status, resp.StatusCode) + } + if resp != nil && resp.StatusCode == 429 { + UpdateNumTelegrafMetricsSentTelemetry(0, 1, 1) + } + return output.FLB_RETRY } - return output.FLB_RETRY - } - defer resp.Body.Close() + defer resp.Body.Close() - numMetrics := len(laMetrics) - UpdateNumTelegrafMetricsSentTelemetry(numMetrics, 0, 0) - Log("PostTelegrafMetricsToLA::Info:Successfully flushed %v records in %v", numMetrics, elapsed) + numMetrics := len(laMetrics) + UpdateNumTelegrafMetricsSentTelemetry(numMetrics, 0, 0) + Log("PostTelegrafMetricsToLA::Info:Successfully flushed %v records in %v", numMetrics, elapsed) + } return output.FLB_OK } @@ -986,13 +1183,9 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { numContainerLogRecords := 0 if len(msgPackEntries) > 0 && ContainerLogsRouteV2 == true { - //flush to mdsd - mdsdSourceName := MdsdContainerLogSourceName - if (ContainerLogSchemaV2 == true) { - mdsdSourceName = MdsdContainerLogV2SourceName - } + //flush to mdsd fluentForward := MsgPackForward{ - Tag: mdsdSourceName, + Tag: MdsdContainerLogTagName, Entries: msgPackEntries, } @@ -1019,7 +1212,7 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { if MdsdMsgpUnixSocketClient == nil { Log("Error::mdsd::mdsd connection does not exist. re-connecting ...") - CreateMDSDClient() + CreateMDSDClient(ContainerLogV2, ContainerType) if MdsdMsgpUnixSocketClient == nil { Log("Error::mdsd::Unable to create mdsd client. Please check error log.") @@ -1286,21 +1479,31 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { log.Fatalln(message) } - osType := os.Getenv("OS_TYPE") + ContainerType = os.Getenv(ContainerTypeEnv) + Log("Container Type %s", ContainerType) + osType := os.Getenv("OS_TYPE") + IsWindows = false // Linux if strings.Compare(strings.ToLower(osType), "windows") != 0 { Log("Reading configuration for Linux from %s", pluginConfPath) - omsadminConf, err := ReadConfiguration(pluginConfig["omsadmin_conf_path"]) - if err != nil { - message := fmt.Sprintf("Error Reading omsadmin configuration %s\n", err.Error()) + WorkspaceID = os.Getenv("WSID") + if WorkspaceID == "" { + message := fmt.Sprintf("WorkspaceID shouldnt be empty") Log(message) SendException(message) time.Sleep(30 * time.Second) log.Fatalln(message) } - OMSEndpoint = omsadminConf["OMS_ENDPOINT"] - WorkspaceID = omsadminConf["WORKSPACE_ID"] + LogAnalyticsWorkspaceDomain = os.Getenv("DOMAIN") + if LogAnalyticsWorkspaceDomain == "" { + message := fmt.Sprintf("Workspace DOMAIN shouldnt be empty") + Log(message) + SendException(message) + time.Sleep(30 * time.Second) + log.Fatalln(message) + } + OMSEndpoint = "https://" + WorkspaceID + ".ods." 
+ LogAnalyticsWorkspaceDomain + "/OperationalData.svc/PostJsonDataItems" // Populate Computer field containerHostName, err1 := ioutil.ReadFile(pluginConfig["container_host_file_path"]) if err1 != nil { @@ -1329,6 +1532,7 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { } } else { // windows + IsWindows = true Computer = os.Getenv("HOSTNAME") WorkspaceID = os.Getenv("WSID") logAnalyticsDomain := os.Getenv("DOMAIN") @@ -1410,21 +1614,15 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { Log(message) } - PluginConfiguration = pluginConfig - - CreateHTTPClient() + PluginConfiguration = pluginConfig - ContainerLogsRoute := strings.TrimSpace(strings.ToLower(os.Getenv("AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE"))) - Log("AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE:%s", ContainerLogsRoute) + ContainerLogsRoute := strings.TrimSpace(strings.ToLower(os.Getenv("AZMON_CONTAINER_LOGS_ROUTE"))) + Log("AZMON_CONTAINER_LOGS_ROUTE:%s", ContainerLogsRoute) - ContainerLogsRouteV2 = false //default is ODS - ContainerLogsRouteADX = false //default is LA + ContainerLogsRouteV2 = false + ContainerLogsRouteADX = false - if strings.Compare(ContainerLogsRoute, ContainerLogsV2Route) == 0 && strings.Compare(strings.ToLower(osType), "windows") != 0 { - ContainerLogsRouteV2 = true - Log("Routing container logs thru %s route...", ContainerLogsV2Route) - fmt.Fprintf(os.Stdout, "Routing container logs thru %s route... \n", ContainerLogsV2Route) - } else if strings.Compare(ContainerLogsRoute, ContainerLogsADXRoute) == 0 { + if strings.Compare(ContainerLogsRoute, ContainerLogsADXRoute) == 0 { //check if adx clusteruri, clientid & secret are set var err error AdxClusterUri, err = ReadFileContents(PluginConfiguration["adx_cluster_uri_path"]) @@ -1455,14 +1653,30 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { Log("Routing container logs thru %s route...", ContainerLogsADXRoute) fmt.Fprintf(os.Stdout, "Routing container logs thru %s route...\n", ContainerLogsADXRoute) } - } + } else if strings.Compare(strings.ToLower(osType), "windows") != 0 { //for linux, oneagent will be default route + ContainerLogsRouteV2 = true //default is mdsd route + if strings.Compare(ContainerLogsRoute, ContainerLogsV1Route) == 0 { + ContainerLogsRouteV2 = false //fallback option when the hidden setting is set + } + Log("Routing container logs thru %s route...", ContainerLogsRoute) + fmt.Fprintf(os.Stdout, "Routing container logs thru %s route... \n", ContainerLogsRoute) + } if ContainerLogsRouteV2 == true { - CreateMDSDClient() + CreateMDSDClient(ContainerLogV2, ContainerType) } else if ContainerLogsRouteADX == true { CreateADXClient() + } else { // v1 or windows + Log("Creating HTTP Client since either the OS platform is Windows or the configmap is configured with the fallback option for ODS direct") + CreateHTTPClient() } + if IsWindows == false { // mdsd linux specific + Log("Creating MDSD clients for KubeMonAgentEvents & InsightsMetrics") + CreateMDSDClient(KubeMonAgentEvents, ContainerType) + CreateMDSDClient(InsightsMetrics, ContainerType) + } + ContainerLogSchemaVersion := strings.TrimSpace(strings.ToLower(os.Getenv("AZMON_CONTAINER_LOG_SCHEMA_VERSION"))) Log("AZMON_CONTAINER_LOG_SCHEMA_VERSION:%s", ContainerLogSchemaVersion) @@ -1491,4 +1705,12 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { Log("Running in replicaset. 
Disabling container enrichment caching & updates \n") } + if ContainerLogSchemaV2 == true { + MdsdContainerLogTagName = MdsdContainerLogV2SourceName + } else { + MdsdContainerLogTagName = MdsdContainerLogSourceName + } + + MdsdInsightsMetricsTagName = MdsdInsightsMetricsSourceName + MdsdKubeMonAgentEventsTagName = MdsdKubeMonAgentEventsSourceName } \ No newline at end of file diff --git a/source/plugins/go/src/telemetry.go b/source/plugins/go/src/telemetry.go index 461fdea96..4750b4624 100644 --- a/source/plugins/go/src/telemetry.go +++ b/source/plugins/go/src/telemetry.go @@ -42,6 +42,10 @@ var ( ContainerLogsSendErrorsToMDSDFromFluent float64 //Tracks the number of mdsd client create errors for containerlogs (uses ContainerLogTelemetryTicker) ContainerLogsMDSDClientCreateErrors float64 + //Tracks the number of mdsd client create errors for insightsmetrics (uses ContainerLogTelemetryTicker) + InsightsMetricsMDSDClientCreateErrors float64 + //Tracks the number of mdsd client create errors for kubemonevents (uses ContainerLogTelemetryTicker) + KubeMonEventsMDSDClientCreateErrors float64 //Tracks the number of write/send errors to ADX for containerlogs (uses ContainerLogTelemetryTicker) ContainerLogsSendErrorsToADXFromFluent float64 //Tracks the number of ADX client create errors for containerlogs (uses ContainerLogTelemetryTicker) @@ -74,6 +78,8 @@ const ( metricNameNumberofSend429ErrorsTelegrafMetrics = "TelegrafMetricsSend429ErrorCount" metricNameErrorCountContainerLogsSendErrorsToMDSDFromFluent = "ContainerLogs2MdsdSendErrorCount" metricNameErrorCountContainerLogsMDSDClientCreateError = "ContainerLogsMdsdClientCreateErrorCount" + metricNameErrorCountInsightsMetricsMDSDClientCreateError = "InsightsMetricsMDSDClientCreateErrorsCount" + metricNameErrorCountKubeMonEventsMDSDClientCreateError = "KubeMonEventsMDSDClientCreateErrorsCount" metricNameErrorCountContainerLogsSendErrorsToADXFromFluent = "ContainerLogs2ADXSendErrorCount" metricNameErrorCountContainerLogsADXClientCreateError = "ContainerLogsADXClientCreateErrorCount" @@ -112,6 +118,8 @@ func SendContainerLogPluginMetrics(telemetryPushIntervalProperty string) { containerLogsMDSDClientCreateErrors := ContainerLogsMDSDClientCreateErrors containerLogsSendErrorsToADXFromFluent := ContainerLogsSendErrorsToADXFromFluent containerLogsADXClientCreateErrors := ContainerLogsADXClientCreateErrors + insightsMetricsMDSDClientCreateErrors := InsightsMetricsMDSDClientCreateErrors + kubeMonEventsMDSDClientCreateErrors := KubeMonEventsMDSDClientCreateErrors osmNamespaceCount := OSMNamespaceCount promMonitorPods := PromMonitorPods promMonitorPodsNamespaceLength := PromMonitorPodsNamespaceLength @@ -132,6 +140,8 @@ func SendContainerLogPluginMetrics(telemetryPushIntervalProperty string) { ContainerLogsMDSDClientCreateErrors = 0.0 ContainerLogsSendErrorsToADXFromFluent = 0.0 ContainerLogsADXClientCreateErrors = 0.0 + InsightsMetricsMDSDClientCreateErrors = 0.0 + KubeMonEventsMDSDClientCreateErrors = 0.0 ContainerLogTelemetryMutex.Unlock() if strings.Compare(strings.ToLower(os.Getenv("CONTROLLER_TYPE")), "daemonset") == 0 { @@ -186,6 +196,13 @@ func SendContainerLogPluginMetrics(telemetryPushIntervalProperty string) { if containerLogsADXClientCreateErrors > 0.0 { TelemetryClient.Track(appinsights.NewMetricTelemetry(metricNameErrorCountContainerLogsADXClientCreateError, containerLogsADXClientCreateErrors)) } + if insightsMetricsMDSDClientCreateErrors > 0.0 { + 
TelemetryClient.Track(appinsights.NewMetricTelemetry(metricNameErrorCountInsightsMetricsMDSDClientCreateError, insightsMetricsMDSDClientCreateErrors)) + } + if kubeMonEventsMDSDClientCreateErrors > 0.0 { + TelemetryClient.Track(appinsights.NewMetricTelemetry(metricNameErrorCountKubeMonEventsMDSDClientCreateError, kubeMonEventsMDSDClientCreateErrors)) + } + start = time.Now() } } diff --git a/source/plugins/go/src/utils.go b/source/plugins/go/src/utils.go index 61d047e52..3fe5c6d0e 100644 --- a/source/plugins/go/src/utils.go +++ b/source/plugins/go/src/utils.go @@ -12,11 +12,12 @@ import ( "net/url" "os" "strings" - "time" - + "time" + "github.com/Azure/azure-kusto-go/kusto" "github.com/Azure/azure-kusto-go/kusto/ingest" "github.com/Azure/go-autorest/autorest/azure/auth" + "github.com/tinylib/msgp/msgp" ) // ReadConfiguration reads a property file @@ -62,7 +63,13 @@ func ReadConfiguration(filename string) (map[string]string, error) { // CreateHTTPClient used to create the client for sending post requests to OMSEndpoint func CreateHTTPClient() { - cert, err := tls.LoadX509KeyPair(PluginConfiguration["cert_file_path"], PluginConfiguration["key_file_path"]) + certFilePath := PluginConfiguration["cert_file_path"] + keyFilePath := PluginConfiguration["key_file_path"] + if IsWindows == false { + certFilePath = fmt.Sprintf(certFilePath, WorkspaceID) + keyFilePath = fmt.Sprintf(keyFilePath, WorkspaceID) + } + cert, err := tls.LoadX509KeyPair(certFilePath, keyFilePath) if err != nil { message := fmt.Sprintf("Error when loading cert %s", err.Error()) SendException(message) @@ -93,7 +100,7 @@ func CreateHTTPClient() { HTTPClient = http.Client{ Transport: transport, Timeout: 30 * time.Second, - } + } Log("Successfully created HTTP Client") } @@ -110,23 +117,58 @@ func ToString(s interface{}) string { } //mdsdSocketClient to write msgp messages -func CreateMDSDClient() { - if MdsdMsgpUnixSocketClient != nil { - MdsdMsgpUnixSocketClient.Close() - MdsdMsgpUnixSocketClient = nil - } - /*conn, err := fluent.New(fluent.Config{FluentNetwork:"unix", - FluentSocketPath:"/var/run/mdsd/default_fluent.socket", - WriteTimeout: 5 * time.Second, - RequestAck: true}) */ - conn, err := net.DialTimeout("unix", - "/var/run/mdsd/default_fluent.socket", 10*time.Second) - if err != nil { - Log("Error::mdsd::Unable to open MDSD msgp socket connection %s", err.Error()) - //log.Fatalf("Unable to open MDSD msgp socket connection %s", err.Error()) - } else { - Log("Successfully created MDSD msgp socket connection") - MdsdMsgpUnixSocketClient = conn +func CreateMDSDClient(dataType DataType, containerType string) { + mdsdfluentSocket := "/var/run/mdsd/default_fluent.socket" + if containerType != "" && strings.Compare(strings.ToLower(containerType), "prometheussidecar") == 0 { + mdsdfluentSocket = fmt.Sprintf("/var/run/mdsd-%s/default_fluent.socket", containerType) + } + switch dataType { + case ContainerLogV2: + if MdsdMsgpUnixSocketClient != nil { + MdsdMsgpUnixSocketClient.Close() + MdsdMsgpUnixSocketClient = nil + } + /*conn, err := fluent.New(fluent.Config{FluentNetwork:"unix", + FluentSocketPath:"/var/run/mdsd/default_fluent.socket", + WriteTimeout: 5 * time.Second, + RequestAck: true}) */ + conn, err := net.DialTimeout("unix", + mdsdfluentSocket, 10*time.Second) + if err != nil { + Log("Error::mdsd::Unable to open MDSD msgp socket connection for ContainerLogV2 %s", err.Error()) + //log.Fatalf("Unable to open MDSD msgp socket connection %s", err.Error()) + } else { + Log("Successfully created MDSD msgp socket connection for 
ContainerLogV2: %s", mdsdfluentSocket) + MdsdMsgpUnixSocketClient = conn + } + case KubeMonAgentEvents: + if MdsdKubeMonMsgpUnixSocketClient != nil { + MdsdKubeMonMsgpUnixSocketClient.Close() + MdsdKubeMonMsgpUnixSocketClient = nil + } + conn, err := net.DialTimeout("unix", + mdsdfluentSocket, 10*time.Second) + if err != nil { + Log("Error::mdsd::Unable to open MDSD msgp socket connection for KubeMon events %s", err.Error()) + //log.Fatalf("Unable to open MDSD msgp socket connection %s", err.Error()) + } else { + Log("Successfully created MDSD msgp socket connection for KubeMon events:%s", mdsdfluentSocket) + MdsdKubeMonMsgpUnixSocketClient = conn + } + case InsightsMetrics: + if MdsdInsightsMetricsMsgpUnixSocketClient != nil { + MdsdInsightsMetricsMsgpUnixSocketClient.Close() + MdsdInsightsMetricsMsgpUnixSocketClient = nil + } + conn, err := net.DialTimeout("unix", + mdsdfluentSocket, 10*time.Second) + if err != nil { + Log("Error::mdsd::Unable to open MDSD msgp socket connection for insights metrics %s", err.Error()) + //log.Fatalf("Unable to open MDSD msgp socket connection %s", err.Error()) + } else { + Log("Successfully created MDSD msgp socket connection for Insights metrics %s", mdsdfluentSocket) + MdsdInsightsMetricsMsgpUnixSocketClient = conn + } } } @@ -178,3 +220,33 @@ func isValidUrl(uri string) bool { } return true } + +func convertMsgPackEntriesToMsgpBytes(fluentForwardTag string, msgPackEntries []MsgPackEntry) []byte { + var msgpBytes []byte + + fluentForward := MsgPackForward{ + Tag: fluentForwardTag, + Entries: msgPackEntries, + } + //determine the size of msgp message + msgpSize := 1 + msgp.StringPrefixSize + len(fluentForward.Tag) + msgp.ArrayHeaderSize + for i := range fluentForward.Entries { + msgpSize += 1 + msgp.Int64Size + msgp.GuessSize(fluentForward.Entries[i].Record) + } + + //allocate buffer for msgp message + msgpBytes = msgp.Require(nil, msgpSize) + + //construct the stream + msgpBytes = append(msgpBytes, 0x92) + msgpBytes = msgp.AppendString(msgpBytes, fluentForward.Tag) + msgpBytes = msgp.AppendArrayHeader(msgpBytes, uint32(len(fluentForward.Entries))) + batchTime := time.Now().Unix() + for entry := range fluentForward.Entries { + msgpBytes = append(msgpBytes, 0x92) + msgpBytes = msgp.AppendInt64(msgpBytes, batchTime) + msgpBytes = msgp.AppendMapStrStr(msgpBytes, fluentForward.Entries[entry].Record) + } + + return msgpBytes +} diff --git a/source/plugins/ruby/ApplicationInsightsUtility.rb b/source/plugins/ruby/ApplicationInsightsUtility.rb index b118cc646..6ae567337 100644 --- a/source/plugins/ruby/ApplicationInsightsUtility.rb +++ b/source/plugins/ruby/ApplicationInsightsUtility.rb @@ -14,7 +14,6 @@ class ApplicationInsightsUtility @@Exception = "ExceptionEvent" @@AcsClusterType = "ACS" @@AksClusterType = "AKS" - @OmsAdminFilePath = "/etc/opt/microsoft/omsagent/conf/omsadmin.conf" @@EnvAcsResourceName = "ACS_RESOURCE_NAME" @@EnvAksRegion = "AKS_REGION" @@EnvAgentVersion = "AGENT_VERSION" @@ -263,14 +262,11 @@ def sendMetricTelemetry(metricName, metricValue, properties) end def getWorkspaceId() - begin - adminConf = {} - confFile = File.open(@OmsAdminFilePath, "r") - confFile.each_line do |line| - splitStrings = line.split("=") - adminConf[splitStrings[0]] = splitStrings[1] + begin + workspaceId = ENV["WSID"] + if workspaceId.nil? || workspaceId.empty? 
+ $log.warn("Exception in AppInsightsUtility: getWorkspaceId - WorkspaceID either nil or empty") end - workspaceId = adminConf["WORKSPACE_ID"] return workspaceId rescue => errorStr $log.warn("Exception in AppInsightsUtility: getWorkspaceId - error: #{errorStr}") @@ -278,14 +274,8 @@ def getWorkspaceId() end def getWorkspaceCloud() - begin - adminConf = {} - confFile = File.open(@OmsAdminFilePath, "r") - confFile.each_line do |line| - splitStrings = line.split("=") - adminConf[splitStrings[0]] = splitStrings[1] - end - workspaceDomain = adminConf["URL_TLD"].strip + begin + workspaceDomain = ENV["DOMAIN"] workspaceCloud = "AzureCloud" if workspaceDomain.casecmp("opinsights.azure.com") == 0 workspaceCloud = "AzureCloud" diff --git a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb index 8cb6f603e..f02459aef 100644 --- a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb +++ b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb @@ -203,23 +203,25 @@ def getContainerCpuMetricItems(metricJSON, hostName, cpuMetricNameToCollect, met containerName = container["name"] metricValue = container["cpu"][cpuMetricNameToCollect] metricTime = metricPollTime #container["cpu"]["time"] - metricItem = {} - metricItem["DataItems"] = [] - - metricProps = {} - metricProps["Timestamp"] = metricTime - metricProps["Host"] = hostName - metricProps["ObjectName"] = Constants::OBJECT_NAME_K8S_CONTAINER - metricProps["InstanceName"] = clusterId + "/" + podUid + "/" + containerName - - metricProps["Collections"] = [] - metricCollections = {} - metricCollections["CounterName"] = metricNametoReturn - metricCollections["Value"] = metricValue + - metricProps["Collections"].push(metricCollections) - metricItem["DataItems"].push(metricProps) - metricItems.push(metricItem) + metricItem = {} + metricItem["Timestamp"] = metricTime + metricItem["Host"] = hostName + metricItem["ObjectName"] = Constants::OBJECT_NAME_K8S_CONTAINER + metricItem["InstanceName"] = clusterId + "/" + podUid + "/" + containerName + + + metricCollection = {} + metricCollection["CounterName"] = metricNametoReturn + metricCollection["Value"] = metricValue + + metricItem["json_Collections"] = [] + metricCollections = [] + metricCollections.push(metricCollection) + metricItem["json_Collections"] = metricCollections.to_json + metricItems.push(metricItem) + #Telemetry about agent performance begin # we can only do this much now. Ideally would like to use the docker image repository to find our pods/containers @@ -250,11 +252,8 @@ def getContainerCpuMetricItems(metricJSON, hostName, cpuMetricNameToCollect, met telemetryProps["dsPromUrl"] = @dsPromUrlCount end #telemetry about containerlog Routing for daemonset - if File.exist?(Constants::AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE_V2_FILENAME) - telemetryProps["containerLogsRoute"] = "v2" - elsif (!@containerLogsRoute.nil? && !@containerLogsRoute.empty?) - telemetryProps["containerLogsRoute"] = @containerLogsRoute - end + telemetryProps["containerLogsRoute"] = @containerLogsRoute + #telemetry about health model if (!@hmEnabled.nil? && !@hmEnabled.empty?) 
telemetryProps["hmEnabled"] = @hmEnabled @@ -503,18 +502,16 @@ def getContainerCpuMetricItemRate(metricJSON, hostName, cpuMetricNameToCollect, containerName = container["name"] metricValue = container["cpu"][cpuMetricNameToCollect] metricTime = metricPollTime #container["cpu"]["time"] + metricItem = {} - metricItem["DataItems"] = [] - - metricProps = {} - metricProps["Timestamp"] = metricTime - metricProps["Host"] = hostName - metricProps["ObjectName"] = Constants::OBJECT_NAME_K8S_CONTAINER - metricProps["InstanceName"] = clusterId + "/" + podUid + "/" + containerName - - metricProps["Collections"] = [] - metricCollections = {} - metricCollections["CounterName"] = metricNametoReturn + metricItem["Timestamp"] = metricTime + metricItem["Host"] = hostName + metricItem["ObjectName"] = Constants::OBJECT_NAME_K8S_CONTAINER + metricItem["InstanceName"] = clusterId + "/" + podUid + "/" + containerName + + metricItem["json_Collections"] = [] + metricCollection = {} + metricCollection["CounterName"] = metricNametoReturn containerId = podUid + "/" + containerName # Adding the containers to the winContainerIdCache so that it can be used by the cleanup routine @@ -545,9 +542,11 @@ def getContainerCpuMetricItemRate(metricJSON, hostName, cpuMetricNameToCollect, @@winContainerPrevMetricRate[containerId] = metricRateValue end - metricCollections["Value"] = metricValue - metricProps["Collections"].push(metricCollections) - metricItem["DataItems"].push(metricProps) + metricCollection["Value"] = metricValue + + metricCollections = [] + metricCollections.push(metricCollection) + metricItem["json_Collections"] = metricCollections.to_json metricItems.push(metricItem) #Telemetry about agent performance begin @@ -629,22 +628,21 @@ def getContainerMemoryMetricItems(metricJSON, hostName, memoryMetricNameToCollec metricTime = metricPollTime #container["memory"]["time"] metricItem = {} - metricItem["DataItems"] = [] - - metricProps = {} - metricProps["Timestamp"] = metricTime - metricProps["Host"] = hostName - metricProps["ObjectName"] = Constants::OBJECT_NAME_K8S_CONTAINER - metricProps["InstanceName"] = clusterId + "/" + podUid + "/" + containerName + metricItem["Timestamp"] = metricTime + metricItem["Host"] = hostName + metricItem["ObjectName"] = Constants::OBJECT_NAME_K8S_CONTAINER + metricItem["InstanceName"] = clusterId + "/" + podUid + "/" + containerName + + metricCollection = {} + metricCollection["CounterName"] = metricNametoReturn + metricCollection["Value"] = metricValue + + metricItem["json_Collections"] = [] + metricCollections = [] + metricCollections.push(metricCollection) + metricItem["json_Collections"] = metricCollections.to_json + metricItems.push(metricItem) - metricProps["Collections"] = [] - metricCollections = {} - metricCollections["CounterName"] = metricNametoReturn - metricCollections["Value"] = metricValue - - metricProps["Collections"].push(metricCollections) - metricItem["DataItems"].push(metricProps) - metricItems.push(metricItem) #Telemetry about agent performance begin # we can only do this much now. Ideally would like to use the docker image repository to find our pods/containers @@ -687,22 +685,21 @@ def getNodeMetricItem(metricJSON, hostName, metricCategory, metricNameToCollect, if !node[metricCategory].nil? 
metricValue = node[metricCategory][metricNameToCollect] metricTime = metricPollTime #node[metricCategory]["time"] - - metricItem["DataItems"] = [] - - metricProps = {} - metricProps["Timestamp"] = metricTime - metricProps["Host"] = hostName - metricProps["ObjectName"] = Constants::OBJECT_NAME_K8S_NODE - metricProps["InstanceName"] = clusterId + "/" + nodeName - - metricProps["Collections"] = [] - metricCollections = {} - metricCollections["CounterName"] = metricNametoReturn - metricCollections["Value"] = metricValue - - metricProps["Collections"].push(metricCollections) - metricItem["DataItems"].push(metricProps) + + metricItem["Timestamp"] = metricTime + metricItem["Host"] = hostName + metricItem["ObjectName"] = Constants::OBJECT_NAME_K8S_NODE + metricItem["InstanceName"] = clusterId + "/" + nodeName + + + metricCollection = {} + metricCollection["CounterName"] = metricNametoReturn + metricCollection["Value"] = metricValue + + metricItem["json_Collections"] = [] + metricCollections = [] + metricCollections.push(metricCollection) + metricItem["json_Collections"] = metricCollections.to_json end rescue => error @Log.warn("getNodeMetricItem failed: #{error} for metric #{metricNameToCollect}") @@ -805,21 +802,20 @@ def getNodeMetricItemRate(metricJSON, hostName, metricCategory, metricNameToColl end end end - metricItem["DataItems"] = [] - - metricProps = {} - metricProps["Timestamp"] = metricTime - metricProps["Host"] = hostName - metricProps["ObjectName"] = Constants::OBJECT_NAME_K8S_NODE - metricProps["InstanceName"] = clusterId + "/" + nodeName - - metricProps["Collections"] = [] - metricCollections = {} - metricCollections["CounterName"] = metricNametoReturn - metricCollections["Value"] = metricValue - - metricProps["Collections"].push(metricCollections) - metricItem["DataItems"].push(metricProps) + + metricItem["Timestamp"] = metricTime + metricItem["Host"] = hostName + metricItem["ObjectName"] = Constants::OBJECT_NAME_K8S_NODE + metricItem["InstanceName"] = clusterId + "/" + nodeName + + metricCollection = {} + metricCollection["CounterName"] = metricNametoReturn + metricCollection["Value"] = metricValue + + metricItem["json_Collections"] = [] + metricCollections = [] + metricCollections.push(metricCollection) + metricItem["json_Collections"] = metricCollections.to_json end rescue => error @Log.warn("getNodeMetricItemRate failed: #{error} for metric #{metricNameToCollect}") @@ -841,22 +837,22 @@ def getNodeLastRebootTimeMetric(metricJSON, hostName, metricNametoReturn, metric metricValue = node["startTime"] metricTime = metricPollTime #Time.now.utc.iso8601 #2018-01-30T19:36:14Z - metricItem["DataItems"] = [] - - metricProps = {} - metricProps["Timestamp"] = metricTime - metricProps["Host"] = hostName - metricProps["ObjectName"] = Constants::OBJECT_NAME_K8S_NODE - metricProps["InstanceName"] = clusterId + "/" + nodeName + + metricItem["Timestamp"] = metricTime + metricItem["Host"] = hostName + metricItem["ObjectName"] = Constants::OBJECT_NAME_K8S_NODE + metricItem["InstanceName"] = clusterId + "/" + nodeName - metricProps["Collections"] = [] - metricCollections = {} - metricCollections["CounterName"] = metricNametoReturn + + metricCollection = {} + metricCollection["CounterName"] = metricNametoReturn #Read it from /proc/uptime - metricCollections["Value"] = DateTime.parse(metricTime).to_time.to_i - IO.read("/proc/uptime").split[0].to_f + metricCollection["Value"] = DateTime.parse(metricTime).to_time.to_i - IO.read("/proc/uptime").split[0].to_f - 
metricProps["Collections"].push(metricCollections) - metricItem["DataItems"].push(metricProps) + metricItem["json_Collections"] = [] + metricCollections = [] + metricCollections.push(metricCollection) + metricItem["json_Collections"] = metricCollections.to_json rescue => error @Log.warn("getNodeLastRebootTimeMetric failed: #{error} ") @Log.warn metricJSON @@ -880,21 +876,19 @@ def getContainerStartTimeMetricItems(metricJSON, hostName, metricNametoReturn, m metricTime = metricPollTime #currentTime metricItem = {} - metricItem["DataItems"] = [] - - metricProps = {} - metricProps["Timestamp"] = metricTime - metricProps["Host"] = hostName - metricProps["ObjectName"] = Constants::OBJECT_NAME_K8S_CONTAINER - metricProps["InstanceName"] = clusterId + "/" + podUid + "/" + containerName - - metricProps["Collections"] = [] - metricCollections = {} - metricCollections["CounterName"] = metricNametoReturn - metricCollections["Value"] = DateTime.parse(metricValue).to_time.to_i - - metricProps["Collections"].push(metricCollections) - metricItem["DataItems"].push(metricProps) + metricItem["Timestamp"] = metricTime + metricItem["Host"] = hostName + metricItem["ObjectName"] = Constants::OBJECT_NAME_K8S_CONTAINER + metricItem["InstanceName"] = clusterId + "/" + podUid + "/" + containerName + + metricCollection = {} + metricCollection["CounterName"] = metricNametoReturn + metricCollection["Value"] = DateTime.parse(metricValue).to_time.to_i + + metricItem["json_Collections"] = [] + metricCollections = [] + metricCollections.push(metricCollection) + metricItem["json_Collections"] = metricCollections.to_json metricItems.push(metricItem) end end diff --git a/source/plugins/ruby/DockerApiClient.rb b/source/plugins/ruby/DockerApiClient.rb index f2828b357..53dd1f39f 100644 --- a/source/plugins/ruby/DockerApiClient.rb +++ b/source/plugins/ruby/DockerApiClient.rb @@ -29,7 +29,7 @@ def getResponse(request, isMultiJson, isVersion) loop do begin responseChunk = "" - timeout(@@TimeoutInSeconds) do + Timeout.timeout(@@TimeoutInSeconds) do responseChunk = socket.recv(@@ChunkSize) end dockerResponse += responseChunk diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index 98347d272..3720bf6dc 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -405,12 +405,9 @@ def getPodUid(podNameSpace, podMetadata) def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601) metricItems = [] - timeDifference = (DateTime.now.to_time.to_i - @@telemetryTimeTracker).abs - timeDifferenceInMinutes = timeDifference / 60 begin clusterId = getClusterId podNameSpace = pod["metadata"]["namespace"] - podName = pod["metadata"]["name"] podUid = getPodUid(podNameSpace, pod["metadata"]) if podUid.nil? return metricItems @@ -442,9 +439,6 @@ def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToColle if (!container["resources"].nil? && !container["resources"].empty? && !container["resources"][metricCategory].nil? && !container["resources"][metricCategory][metricNameToCollect].nil?) 
metricValue = getMetricNumericValue(metricNameToCollect, container["resources"][metricCategory][metricNameToCollect]) - metricItem = {} - metricItem["DataItems"] = [] - metricProps = {} metricProps["Timestamp"] = metricTime metricProps["Host"] = nodeName @@ -453,50 +447,22 @@ def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToColle metricProps["ObjectName"] = "K8SContainer" metricProps["InstanceName"] = clusterId + "/" + podUid + "/" + containerName - metricProps["Collections"] = [] - metricCollections = {} - metricCollections["CounterName"] = metricNametoReturn - metricCollections["Value"] = metricValue - - metricProps["Collections"].push(metricCollections) - metricItem["DataItems"].push(metricProps) - metricItems.push(metricItem) - #Telemetry about omsagent requests and limits - begin - if (podName.downcase.start_with?("omsagent-") && podNameSpace.eql?("kube-system") && containerName.downcase.start_with?("omsagent")) - nodePodContainerKey = [nodeName, podName, containerName, metricNametoReturn].join("~~") - @@resourceLimitsTelemetryHash[nodePodContainerKey] = metricValue - end - if (timeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) - @@resourceLimitsTelemetryHash.each { |key, value| - keyElements = key.split("~~") - if keyElements.length != 4 - next - end - - # get dimension values by key - telemetryProps = {} - telemetryProps["Computer"] = keyElements[0] - telemetryProps["PodName"] = keyElements[1] - telemetryProps["ContainerName"] = keyElements[2] - metricNameFromKey = keyElements[3] - ApplicationInsightsUtility.sendMetricTelemetry(metricNameFromKey, value, telemetryProps) - } - @@telemetryTimeTracker = DateTime.now.to_time.to_i - @@resourceLimitsTelemetryHash = {} - end - rescue => errorStr - $log.warn("Exception while generating Telemetry from getContainerResourceRequestsAndLimits failed: #{errorStr} for metric #{metricNameToCollect}") - end + metricCollection = {} + metricCollection["CounterName"] = metricNametoReturn + metricCollection["Value"] = metricValue + + metricProps["json_Collections"] = [] + metricCollections = [] + metricCollections.push(metricCollection) + metricProps["json_Collections"] = metricCollections.to_json + metricItems.push(metricProps) #No container level limit for the given metric, so default to node level limit else nodeMetricsHashKey = clusterId + "/" + nodeName + "_" + "allocatable" + "_" + metricNameToCollect if (metricCategory == "limits" && @@NodeMetrics.has_key?(nodeMetricsHashKey)) metricValue = @@NodeMetrics[nodeMetricsHashKey] #@Log.info("Limits not set for container #{clusterId + "/" + podUid + "/" + containerName} using node level limits: #{nodeMetricsHashKey}=#{metricValue} ") - metricItem = {} - metricItem["DataItems"] = [] - + metricProps = {} metricProps["Timestamp"] = metricTime metricProps["Host"] = nodeName @@ -505,14 +471,14 @@ def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToColle metricProps["ObjectName"] = "K8SContainer" metricProps["InstanceName"] = clusterId + "/" + podUid + "/" + containerName - metricProps["Collections"] = [] - metricCollections = {} - metricCollections["CounterName"] = metricNametoReturn - metricCollections["Value"] = metricValue - - metricProps["Collections"].push(metricCollections) - metricItem["DataItems"].push(metricProps) - metricItems.push(metricItem) + metricCollection = {} + metricCollection["CounterName"] = metricNametoReturn + metricCollection["Value"] = metricValue + metricProps["json_Collections"] = [] + metricCollections = [] + 
metricCollections.push(metricCollection) + metricProps["json_Collections"] = metricCollections.to_json + metricItems.push(metricProps) end end end @@ -632,22 +598,22 @@ def parseNodeLimitsFromNodeItem(node, metricCategory, metricNameToCollect, metri # metricCategory can be "capacity" or "allocatable" and metricNameToCollect can be "cpu" or "memory" metricValue = getMetricNumericValue(metricNameToCollect, node["status"][metricCategory][metricNameToCollect]) - metricItem["DataItems"] = [] - metricProps = {} - metricProps["Timestamp"] = metricTime - metricProps["Host"] = node["metadata"]["name"] + metricItem["Timestamp"] = metricTime + metricItem["Host"] = node["metadata"]["name"] # Adding this so that it is not set by base omsagent since it was not set earlier and being set by base omsagent - metricProps["Computer"] = node["metadata"]["name"] - metricProps["ObjectName"] = "K8SNode" - metricProps["InstanceName"] = clusterId + "/" + node["metadata"]["name"] - metricProps["Collections"] = [] - metricCollections = {} - metricCollections["CounterName"] = metricNametoReturn - metricCollections["Value"] = metricValue - - metricProps["Collections"].push(metricCollections) - metricItem["DataItems"].push(metricProps) - + metricItem["Computer"] = node["metadata"]["name"] + metricItem["ObjectName"] = "K8SNode" + metricItem["InstanceName"] = clusterId + "/" + node["metadata"]["name"] + + metricCollection = {} + metricCollection["CounterName"] = metricNametoReturn + metricCollection["Value"] = metricValue + metricCollections = [] + metricCollections.push(metricCollection) + + metricItem["json_Collections"] = [] + metricItem["json_Collections"] = metricCollections.to_json + #push node level metrics to a inmem hash so that we can use it looking up at container level. #Currently if container level cpu & memory limits are not defined we default to node level limits @@NodeMetrics[clusterId + "/" + node["metadata"]["name"] + "_" + metricCategory + "_" + metricNameToCollect] = metricValue diff --git a/source/plugins/ruby/MdmMetricsGenerator.rb b/source/plugins/ruby/MdmMetricsGenerator.rb index 6641456af..a809087dc 100644 --- a/source/plugins/ruby/MdmMetricsGenerator.rb +++ b/source/plugins/ruby/MdmMetricsGenerator.rb @@ -525,11 +525,11 @@ def getNodeResourceMetricRecords(record, metric_name, metric_value, percentage_m records = [] begin custommetricrecord = MdmAlertTemplates::Node_resource_metrics_template % { - timestamp: record["DataItems"][0]["Timestamp"], + timestamp: record["Timestamp"], metricName: metric_name, - hostvalue: record["DataItems"][0]["Host"], - objectnamevalue: record["DataItems"][0]["ObjectName"], - instancenamevalue: record["DataItems"][0]["InstanceName"], + hostvalue: record["Host"], + objectnamevalue: record["ObjectName"], + instancenamevalue: record["InstanceName"], metricminvalue: metric_value, metricmaxvalue: metric_value, metricsumvalue: metric_value, @@ -538,11 +538,11 @@ def getNodeResourceMetricRecords(record, metric_name, metric_value, percentage_m if !percentage_metric_value.nil? 
additional_record = MdmAlertTemplates::Node_resource_metrics_template % { - timestamp: record["DataItems"][0]["Timestamp"], + timestamp: record["Timestamp"], metricName: @@node_metric_name_metric_percentage_name_hash[metric_name], - hostvalue: record["DataItems"][0]["Host"], - objectnamevalue: record["DataItems"][0]["ObjectName"], - instancenamevalue: record["DataItems"][0]["InstanceName"], + hostvalue: record["Host"], + objectnamevalue: record["ObjectName"], + instancenamevalue: record["InstanceName"], metricminvalue: percentage_metric_value, metricmaxvalue: percentage_metric_value, metricsumvalue: percentage_metric_value, diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index 906019b95..c037c99f6 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -103,5 +103,5 @@ class Constants #Pod Statuses POD_STATUS_TERMINATING = "Terminating" - AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE_V2_FILENAME = "/opt/AZMON_CONTAINER_LOGS_EFFECTIVE_ROUTE_V2" + end diff --git a/source/plugins/ruby/filter_cadvisor2mdm.rb b/source/plugins/ruby/filter_cadvisor2mdm.rb index 659e3000c..62dcf31dc 100644 --- a/source/plugins/ruby/filter_cadvisor2mdm.rb +++ b/source/plugins/ruby/filter_cadvisor2mdm.rb @@ -2,7 +2,9 @@ # frozen_string_literal: true -module Fluent +require 'fluent/plugin/filter' + +module Fluent::Plugin require "logger" require "yajl/json_gem" require_relative "oms_common" @@ -12,7 +14,7 @@ module Fluent require_relative "in_kube_nodes" class CAdvisor2MdmFilter < Filter - Fluent::Plugin.register_filter("filter_cadvisor2mdm", self) + Fluent::Plugin.register_filter("cadvisor2mdm", self) config_param :enable_log, :integer, :default => 0 config_param :log_path, :string, :default => "/var/opt/microsoft/docker-cimprov/log/filter_cadvisor2mdm.log" @@ -65,7 +67,7 @@ def start @containerResourceDimensionHash = {} @pvUsageHash = {} @@metric_threshold_hash = MdmMetricsGenerator.getContainerResourceUtilizationThresholds - @NodeCache = Fluent::NodeStatsCache.new() + @NodeCache = Fluent::Plugin::NodeStatsCache.new() end rescue => e @log.info "Error initializing plugin #{e}" @@ -148,16 +150,16 @@ def filter(tag, time, record) begin if @process_incoming_stream - # Check if insights metrics for PV metrics - data_type = record["DataType"] - if data_type == "INSIGHTS_METRICS_BLOB" + # Check if insights metrics for PV metrics + if record["Name"] == Constants::PV_USED_BYTES return filterPVInsightsMetrics(record) end - object_name = record["DataItems"][0]["ObjectName"] - counter_name = record["DataItems"][0]["Collections"][0]["CounterName"] + object_name = record["ObjectName"] + counter_name = JSON.parse(record["json_Collections"])[0]["CounterName"] + percentage_metric_value = 0.0 - metric_value = record["DataItems"][0]["Collections"][0]["Value"] + metric_value = JSON.parse(record["json_Collections"])[0]["Value"] if object_name == Constants::OBJECT_NAME_K8S_NODE && @metrics_to_collect_hash.key?(counter_name.downcase) # Compute and send % CPU and Memory @@ -165,7 +167,7 @@ def filter(tag, time, record) metric_name = Constants::CPU_USAGE_MILLI_CORES metric_value /= 1000000 #cadvisor record is in nanocores. 
Convert to mc if @@controller_type.downcase == "replicaset" - target_node_cpu_capacity_mc = @NodeCache.cpu.get_capacity(record["DataItems"][0]["Host"]) / 1000000 + target_node_cpu_capacity_mc = @NodeCache.cpu.get_capacity(record["Host"]) / 1000000 else target_node_cpu_capacity_mc = @cpu_capacity end @@ -178,7 +180,7 @@ def filter(tag, time, record) if counter_name.start_with?("memory") metric_name = counter_name if @@controller_type.downcase == "replicaset" - target_node_mem_capacity = @NodeCache.mem.get_capacity(record["DataItems"][0]["Host"]) + target_node_mem_capacity = @NodeCache.mem.get_capacity(record["Host"]) else target_node_mem_capacity = @memory_capacity end @@ -187,12 +189,12 @@ def filter(tag, time, record) percentage_metric_value = metric_value * 100 / target_node_mem_capacity end end - @log.info "percentage_metric_value for metric: #{metric_name} for instance: #{record["DataItems"][0]["Host"]} percentage: #{percentage_metric_value}" + @log.info "percentage_metric_value for metric: #{metric_name} for instance: #{record["Host"]} percentage: #{percentage_metric_value}" # do some sanity checking. Do we want this? if percentage_metric_value > 100.0 or percentage_metric_value < 0.0 telemetryProperties = {} - telemetryProperties["Computer"] = record["DataItems"][0]["Host"] + telemetryProperties["Computer"] = record["Host"] telemetryProperties["MetricName"] = metric_name telemetryProperties["MetricPercentageValue"] = percentage_metric_value ApplicationInsightsUtility.sendCustomEvent("ErrorPercentageOutOfBounds", telemetryProperties) @@ -200,7 +202,7 @@ def filter(tag, time, record) return MdmMetricsGenerator.getNodeResourceMetricRecords(record, metric_name, metric_value, percentage_metric_value) elsif object_name == Constants::OBJECT_NAME_K8S_CONTAINER && @metrics_to_collect_hash.key?(counter_name.downcase) - instanceName = record["DataItems"][0]["InstanceName"] + instanceName = record["InstanceName"] metricName = counter_name # Using node cpu capacity in the absence of container cpu capacity since the container will end up using the # node's capacity in this case. 
Converting this to nanocores for computation purposes, since this is in millicores @@ -235,7 +237,7 @@ def filter(tag, time, record) flushMetricTelemetry if percentage_metric_value >= thresholdPercentage setThresholdExceededTelemetry(metricName) - return MdmMetricsGenerator.getContainerResourceUtilMetricRecords(record["DataItems"][0]["Timestamp"], + return MdmMetricsGenerator.getContainerResourceUtilMetricRecords(record["Timestamp"], metricName, percentage_metric_value, @containerResourceDimensionHash[instanceName], @@ -256,39 +258,36 @@ def filter(tag, time, record) end end - def filterPVInsightsMetrics(record) + def filterPVInsightsMetrics(record) begin mdmMetrics = [] - record["DataItems"].each do |dataItem| - - if dataItem["Name"] == Constants::PV_USED_BYTES && @metrics_to_collect_hash.key?(dataItem["Name"].downcase) - metricName = dataItem["Name"] - usage = dataItem["Value"] - capacity = dataItem["Tags"][Constants::INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES] - if capacity != 0 - percentage_metric_value = (usage * 100.0) / capacity - end - @log.info "percentage_metric_value for metric: #{metricName} percentage: #{percentage_metric_value}" - @log.info "@@metric_threshold_hash for #{metricName}: #{@@metric_threshold_hash[metricName]}" - - computer = dataItem["Computer"] - resourceDimensions = dataItem["Tags"] - thresholdPercentage = @@metric_threshold_hash[metricName] - - flushMetricTelemetry - if percentage_metric_value >= thresholdPercentage - setThresholdExceededTelemetry(metricName) - return MdmMetricsGenerator.getPVResourceUtilMetricRecords(dataItem["CollectionTime"], - metricName, - computer, - percentage_metric_value, - resourceDimensions, - thresholdPercentage) - else - return [] - end # end if block for percentage metric > configured threshold % check - end # end if block for dataItem name check - end # end for block of looping through data items + if record["Name"] == Constants::PV_USED_BYTES && @metrics_to_collect_hash.key?(record["Name"].downcase) + metricName = record["Name"] + usage = record["Value"] + capacity = record["Tags"][Constants::INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES] + if capacity != 0 + percentage_metric_value = (usage * 100.0) / capacity + end + @log.info "percentage_metric_value for metric: #{metricName} percentage: #{percentage_metric_value}" + @log.info "@@metric_threshold_hash for #{metricName}: #{@@metric_threshold_hash[metricName]}" + + computer = record["Computer"] + resourceDimensions = record["Tags"] + thresholdPercentage = @@metric_threshold_hash[metricName] + + flushMetricTelemetry + if percentage_metric_value >= thresholdPercentage + setThresholdExceededTelemetry(metricName) + return MdmMetricsGenerator.getPVResourceUtilMetricRecords(record["CollectionTime"], + metricName, + computer, + percentage_metric_value, + resourceDimensions, + thresholdPercentage) + else + return [] + end # end if block for percentage metric > configured threshold % check + end # end if block for dataItem name check return [] rescue Exception => e @log.info "Error processing cadvisor insights metrics record Exception: #{e.class} Message: #{e.message}" @@ -316,16 +315,22 @@ def ensure_cpu_memory_capacity_set end if !nodeInventory.nil? cpu_capacity_json = KubernetesApiClient.parseNodeLimits(nodeInventory, "capacity", "cpu", "cpuCapacityNanoCores") - if !cpu_capacity_json.nil? && !cpu_capacity_json[0]["DataItems"][0]["Collections"][0]["Value"].to_s.nil? 
-          @cpu_capacity = cpu_capacity_json[0]["DataItems"][0]["Collections"][0]["Value"]
-          @log.info "CPU Limit #{@cpu_capacity}"
+        if !cpu_capacity_json.nil?
+          metricVal = JSON.parse(cpu_capacity_json[0]["json_Collections"])[0]["Value"]
+          if !metricVal.to_s.nil?
+            @cpu_capacity = metricVal
+            @log.info "CPU Limit #{@cpu_capacity}"
+          end
         else
           @log.info "Error getting cpu_capacity"
         end
         memory_capacity_json = KubernetesApiClient.parseNodeLimits(nodeInventory, "capacity", "memory", "memoryCapacityBytes")
-        if !memory_capacity_json.nil? && !memory_capacity_json[0]["DataItems"][0]["Collections"][0]["Value"].to_s.nil?
-          @memory_capacity = memory_capacity_json[0]["DataItems"][0]["Collections"][0]["Value"]
-          @log.info "Memory Limit #{@memory_capacity}"
+        if !memory_capacity_json.nil?
+          metricVal = JSON.parse(memory_capacity_json[0]["json_Collections"])[0]["Value"]
+          if !metricVal.to_s.nil?
+            @memory_capacity = metricVal
+            @log.info "Memory Limit #{@memory_capacity}"
+          end
         else
           @log.info "Error getting memory_capacity"
         end
@@ -346,7 +351,7 @@ def ensure_cpu_memory_capacity_set
     end

     def filter_stream(tag, es)
-      new_es = MultiEventStream.new
+      new_es = Fluent::MultiEventStream.new
       begin
         ensure_cpu_memory_capacity_set
         # Getting container limits hash
diff --git a/source/plugins/ruby/filter_cadvisor_health_container.rb b/source/plugins/ruby/filter_cadvisor_health_container.rb
index 870fcd6d6..ab64b6e61 100644
--- a/source/plugins/ruby/filter_cadvisor_health_container.rb
+++ b/source/plugins/ruby/filter_cadvisor_health_container.rb
@@ -1,7 +1,9 @@
 #!/usr/local/bin/ruby
 # frozen_string_literal: true

-module Fluent
+require 'fluent/plugin/filter'
+
+module Fluent::Plugin
   require 'logger'
   require 'yajl/json_gem'
   require_relative 'oms_common'
@@ -11,7 +13,7 @@ module Fluent
   class CAdvisor2ContainerHealthFilter < Filter
     include HealthModel

-    Fluent::Plugin.register_filter('filter_cadvisor_health_container', self)
+    Fluent::Plugin.register_filter('cadvisor_health_container', self)

     config_param :log_path, :string, :default => '/var/opt/microsoft/docker-cimprov/log/health_monitors.log'
     config_param :metrics_to_collect, :string, :default => 'cpuUsageNanoCores,memoryRssBytes'
@@ -58,9 +60,9 @@ def start
     def filter_stream(tag, es)
       if !@@cluster_health_model_enabled
         @log.info "Cluster Health Model disabled in filter_cadvisor_health_container"
-        return MultiEventStream.new
+        return Fluent::MultiEventStream.new
       end
-      new_es = MultiEventStream.new
+      new_es = Fluent::MultiEventStream.new
       records_count = 0
       es.each { |time, record|
         begin
@@ -83,8 +85,9 @@ def filter(tag, time, record)
         if record.key?("MonitorLabels")
           return record
         end
-        object_name = record['DataItems'][0]['ObjectName']
-        counter_name = record['DataItems'][0]['Collections'][0]['CounterName'].downcase
+
+        object_name = record['ObjectName']
+        counter_name = JSON.parse(record['json_Collections'])[0]['CounterName'].downcase
         if @metrics_to_collect_hash.key?(counter_name)
           if object_name == @@object_name_k8s_container
             return @formatter.get_record_from_cadvisor_record(record)
diff --git a/source/plugins/ruby/filter_cadvisor_health_node.rb b/source/plugins/ruby/filter_cadvisor_health_node.rb
index 27e5bc255..ddbb871e8 100644
--- a/source/plugins/ruby/filter_cadvisor_health_node.rb
+++ b/source/plugins/ruby/filter_cadvisor_health_node.rb
@@ -1,7 +1,9 @@
 #!/usr/local/bin/ruby
 # frozen_string_literal: true

-module Fluent
+require 'fluent/plugin/filter'
+
+module Fluent::Plugin
   require 'logger'
   require 'yajl/json_gem'
   require_relative 'oms_common'
@@ -11,8 +13,8 @@ module Fluent
class CAdvisor2NodeHealthFilter < Filter include HealthModel - Fluent::Plugin.register_filter('filter_cadvisor_health_node', self) - + Fluent::Plugin.register_filter('cadvisor_health_node', self) + attr_accessor :provider, :resources config_param :metrics_to_collect, :string, :default => 'cpuUsageNanoCores,memoryRssBytes' @@ -75,13 +77,13 @@ def start def filter_stream(tag, es) if !@@cluster_health_model_enabled @log.info "Cluster Health Model disabled in filter_cadvisor_health_node" - return MultiEventStream.new + return Fluent::MultiEventStream.new end begin node_capacity = HealthMonitorUtils.ensure_cpu_memory_capacity_set(@@hm_log, @cpu_capacity, @memory_capacity, @@hostName) @cpu_capacity = node_capacity[0] @memory_capacity = node_capacity[1] - new_es = MultiEventStream.new + new_es = Fluent::MultiEventStream.new records_count = 0 es.each { |time, record| filtered_record = filter(tag, time, record) @@ -95,7 +97,7 @@ def filter_stream(tag, es) rescue => e @log.info "Error in filter_cadvisor_health_node filter_stream #{e.backtrace}" ApplicationInsightsUtility.sendExceptionTelemetry(e, {"FeatureArea" => "Health"}) - return MultiEventStream.new + return Fluent::MultiEventStream.new end end @@ -105,10 +107,10 @@ def filter(tag, time, record) return record end - object_name = record['DataItems'][0]['ObjectName'] - counter_name = record['DataItems'][0]['Collections'][0]['CounterName'].downcase + object_name = record['ObjectName'] + counter_name = JSON.parse(record['json_Collections'])[0]['CounterName'].downcase if @metrics_to_collect_hash.key?(counter_name.downcase) - metric_value = record['DataItems'][0]['Collections'][0]['Value'] + metric_value = JSON.parse(record['json_Collections'])[0]['Value'] case object_name when @@object_name_k8s_node case counter_name.downcase @@ -134,14 +136,14 @@ def process_node_cpu_record(record, metric_value) if record.nil? return nil else - instance_name = record['DataItems'][0]['InstanceName'] + instance_name = record['InstanceName'] #@log.info "CPU capacity #{@cpu_capacity}" metric_value /= 1000000 percent = (metric_value.to_f/@cpu_capacity*100).round(2) #@log.debug "Percentage of CPU limit: #{percent}" state = HealthMonitorUtils.compute_percentage_state(percent, @provider.get_config(MonitorId::NODE_CPU_MONITOR_ID)) #@log.debug "Computed State : #{state}" - timestamp = record['DataItems'][0]['Timestamp'] + timestamp = record['Timestamp'] health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"cpuUsageMillicores" => metric_value, "cpuUtilizationPercentage" => percent}} monitor_instance_id = HealthMonitorUtils.get_monitor_instance_id(monitor_id, [@@clusterId, @@hostName]) @@ -166,14 +168,14 @@ def process_node_memory_record(record, metric_value) if record.nil? 
return nil else - instance_name = record['DataItems'][0]['InstanceName'] + instance_name = record['InstanceName'] #@log.info "Memory capacity #{@memory_capacity}" percent = (metric_value.to_f/@memory_capacity*100).round(2) #@log.debug "Percentage of Memory limit: #{percent}" state = HealthMonitorUtils.compute_percentage_state(percent, @provider.get_config(MonitorId::NODE_MEMORY_MONITOR_ID)) #@log.debug "Computed State : #{state}" - timestamp = record['DataItems'][0]['Timestamp'] + timestamp = record['Timestamp'] health_monitor_record = {"timestamp" => timestamp, "state" => state, "details" => {"memoryRssBytes" => metric_value.to_f, "memoryUtilizationPercentage" => percent}} #@log.info health_monitor_record diff --git a/source/plugins/ruby/filter_container.rb b/source/plugins/ruby/filter_container.rb deleted file mode 100644 index b72e82dbc..000000000 --- a/source/plugins/ruby/filter_container.rb +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. - -# frozen_string_literal: true - -module Fluent - require 'logger' - - class ContainerFilter < Filter - Fluent::Plugin.register_filter('filter_container', self) - - config_param :enable_log, :integer, :default => 0 - config_param :log_path, :string, :default => '/var/opt/microsoft/omsagent/log/filter_container.log' - - def initialize - super - end - - def configure(conf) - super - @log = nil - - if @enable_log - @log = Logger.new(@log_path, 'weekly') - @log.debug {'Starting filter_container plugin'} - end - end - - def start - super - end - - def shutdown - super - end - - def filter(tag, time, record) - dataType = nil - - record.each do |r| - if dataType == nil - dataType = case r["ClassName"] - when "Container_ImageInventory" then "CONTAINER_IMAGE_INVENTORY_BLOB" - when "Container_ContainerInventory" then "CONTAINER_INVENTORY_BLOB" - when "Container_DaemonEvent" then "CONTAINER_SERVICE_LOG_BLOB" - when "Container_ContainerLog" then "CONTAINER_LOG_BLOB" - end - end - end - - wrapper = { - "DataType"=>dataType, - "IPName"=>"Containers", - "DataItems"=>record - } - - wrapper - end - end -end diff --git a/source/plugins/ruby/filter_docker_log.rb b/source/plugins/ruby/filter_docker_log.rb deleted file mode 100644 index b80f4c204..000000000 --- a/source/plugins/ruby/filter_docker_log.rb +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. - -# frozen_string_literal: true - -module Fluent - require 'logger' - require 'socket' - require 'yajl/json_gem' - - class DockerLogFilter < Filter - Plugin.register_filter('filter_docker_log', self) - - # Set to 1 in config file to enable logging - config_param :enable_log, :integer, :default => 0 - config_param :log_path, :string, :default => '/var/opt/microsoft/omsagent/log/filter_docker_log.txt' - - # This method is called before starting. - def configure(conf) - super - @hostname = Socket.gethostname - # in case get full name, extract up to '.' - dotpos = @hostname.index('.') - if dotpos != nil - @hostname = @hostname[0..dotpos-1] - end - - # Cache the image name and ID of each container so we don't have to inspect each time - @containerCache = Hash.new - - @log = nil - - if @enable_log - @log = Logger.new(@log_path, 'weekly') - @log.debug {'Starting filter_docker_log plugin on ' + @hostname} - end - end - - def filter(tag, time, record) - if @log != nil - @log.debug {'Accepted a log from container ' + record['container_id']} - end - - wrapper = Hash.new - - if record['log'].empty? 
- if @log != nil - @log.debug {'Log from container ' + record['container_id'] + ' had length 0 and will be discarded'} - end - else - # Need to query image information from ID - containerId = record['container_id'] - - unless @containerCache.has_key?(containerId) - if @log != nil - @log.debug {'Container ' + containerId + ' information is not in the cache, inspecting'} - end - - # Value not in cache, use inspect - @containerCache[containerId] = Hash.new - details = '' - - begin - details = JSON.parse(`sudo docker inspect #{containerId}`) - rescue => e - if @log != nil - @log.error {'sudo docker inspect ' + containerId + ' failed'} - end - end - - if details.empty? - # This should not occur - @containerCache[containerId]['Image'] = 'Unknown' - @containerCache[containerId]['ImageName'] = 'Unknown' - - if @log != nil - @log.warn {'The image ID of container ' + containerId + ' could not be determined'} - end - else - @containerCache[containerId]['Image'] = details[0]['Config']['Image'] - @containerCache[containerId]['ImageName'] = details[0]['Config']['Image'] - end - end - - newRecord = @containerCache[containerId] - - # No query is required - newRecord['Id'] = containerId - newRecord['Name'] = record['container_name'][0] == "/" ? record['container_name'][1..-1] : record['container_name'] - newRecord['LogEntrySource'] = record['source'] - newRecord['LogEntry'] = record['log'] - newRecord['Computer'] = @hostname - - wrapper = { - "DataType"=>"CONTAINER_LOG_BLOB", - "IPName"=>"Containers", - "DataItems"=>[newRecord] - } - end - - wrapper - end - end -end diff --git a/source/plugins/ruby/filter_health_model_builder.rb b/source/plugins/ruby/filter_health_model_builder.rb index 36e4801d7..d491f17c2 100644 --- a/source/plugins/ruby/filter_health_model_builder.rb +++ b/source/plugins/ruby/filter_health_model_builder.rb @@ -2,15 +2,17 @@ # frozen_string_literal: true -module Fluent +require 'fluent/plugin/filter' + +module Fluent::Plugin require 'logger' require 'yajl/json_gem' Dir[File.join(__dir__, './health', '*.rb')].each { |file| require file } - + class FilterHealthModelBuilder < Filter include HealthModel - Fluent::Plugin.register_filter('filter_health_model_builder', self) + Fluent::Plugin.register_filter('health_model_builder', self) config_param :enable_log, :integer, :default => 0 config_param :log_path, :string, :default => '/var/opt/microsoft/docker-cimprov/log/filter_health_model_builder.log' @@ -20,7 +22,7 @@ class FilterHealthModelBuilder < Filter attr_reader :buffer, :model_builder, :health_model_definition, :monitor_factory, :state_finalizers, :monitor_set, :model_builder, :hierarchy_builder, :resources, :kube_api_down_handler, :provider, :reducer, :state, :generator, :telemetry - @@rewrite_tag = 'kubehealth.Signals' + @@cluster_id = KubernetesApiClient.getClusterId @@token_file_path = "/var/run/secrets/kubernetes.io/serviceaccount/token" @@cert_file_path = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" @@ -29,6 +31,7 @@ class FilterHealthModelBuilder < Filter def initialize begin super + @rewrite_tag = 'oneagent.containerInsights.KUBE_HEALTH_BLOB' @buffer = HealthModel::HealthModelBuffer.new @cluster_health_state = ClusterHealthState.new(@@token_file_path, @@cert_file_path) @health_model_definition = HealthModel::ParentMonitorProvider.new(HealthModel::HealthModelDefinitionParser.new(@model_definition_path).parse_file) @@ -53,6 +56,7 @@ def initialize deserialized_state_info = @cluster_health_state.get_state @state.initialize_state(deserialized_state_info) end + rescue => e 
ApplicationInsightsUtility.sendExceptionTelemetry(e, {"FeatureArea" => "Health"}) end @@ -82,11 +86,11 @@ def shutdown def filter_stream(tag, es) if !@@cluster_health_model_enabled @log.info "Cluster Health Model disabled in filter_health_model_builder" - return MultiEventStream.new + return Fluent::MultiEventStream.new end begin - new_es = MultiEventStream.new - time = Time.now + new_es = Fluent::MultiEventStream.new + time = Time.now if tag.start_with?("kubehealth.DaemonSet.Node") node_records = [] @@ -96,7 +100,7 @@ def filter_stream(tag, es) } @buffer.add_to_buffer(node_records) end - return MultiEventStream.new + return Fluent::MultiEventStream.new elsif tag.start_with?("kubehealth.DaemonSet.Container") container_records = [] if !es.nil? @@ -110,7 +114,7 @@ def filter_stream(tag, es) @container_cpu_memory_records = [] #in some clusters, this is null, so initialize it again. end @container_cpu_memory_records.push(*container_records) # push the records for aggregation later - return MultiEventStream.new + return Fluent::MultiEventStream.new elsif tag.start_with?("kubehealth.ReplicaSet") records = [] es.each{|time, record| @@ -218,11 +222,11 @@ def filter_stream(tag, es) @log.info "after optimizing health signals all_monitors.size #{all_monitors.size}" - current_time = Time.now - emit_time = current_time.to_f + # for each key in monitor.keys, # get the state from health_monitor_state # generate the record to send + emit_time = Fluent::Engine.now all_monitors.keys.each{|key| record = @provider.get_record(all_monitors[key], state) if record[HealthMonitorRecordFields::MONITOR_ID] == MonitorId::CLUSTER @@ -241,17 +245,12 @@ def filter_stream(tag, es) @cluster_new_state = new_state end end - end - record_wrapper = { - "DataType" => "KUBE_HEALTH_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [record.each { |k, v| record[k] = v }], - } - new_es.add(emit_time, record_wrapper) + end + new_es.add(emit_time, record) } #emit the stream - router.emit_stream(@@rewrite_tag, new_es) + router.emit_stream(@rewrite_tag, new_es) #initialize monitor_set and model_builder @monitor_set = HealthModel::MonitorSet.new @@ -261,8 +260,8 @@ def filter_stream(tag, es) @cluster_health_state.update_state(@state.to_h) @telemetry.send # return an empty event stream, else the match will throw a NoMethodError - return MultiEventStream.new - elsif tag.start_with?("kubehealth.Signals") + return Fluent::MultiEventStream.new + elsif tag.start_with?(@rewrite_tag) # this filter also acts as a pass through as we are rewriting the tag and emitting to the fluent stream es else @@ -274,6 +273,6 @@ def filter_stream(tag, es) @log.warn "Message: #{e.message} Backtrace: #{e.backtrace}" return nil end - end + end end end diff --git a/source/plugins/ruby/filter_inventory2mdm.rb b/source/plugins/ruby/filter_inventory2mdm.rb index 38ccab885..509ac608e 100644 --- a/source/plugins/ruby/filter_inventory2mdm.rb +++ b/source/plugins/ruby/filter_inventory2mdm.rb @@ -2,14 +2,16 @@ # frozen_string_literal: true -module Fluent +require 'fluent/plugin/filter' + +module Fluent::Plugin require 'logger' require 'yajl/json_gem' require_relative 'oms_common' require_relative 'CustomMetricsUtils' class Inventory2MdmFilter < Filter - Fluent::Plugin.register_filter('filter_inventory2mdm', self) + Fluent::Plugin.register_filter('inventory2mdm', self) config_param :enable_log, :integer, :default => 0 config_param :log_path, :string, :default => '/var/opt/microsoft/docker-cimprov/log/filter_inventory2mdm.log' @@ -115,8 +117,8 @@ def 
process_node_inventory_records(es) es.each{|time,record| begin - timestamp = record['DataItems'][0]['CollectionTime'] - node_status = record['DataItems'][0]['Status'] + timestamp = record['CollectionTime'] + node_status = record['Status'] if node_status.downcase.split(",").include? @@node_status_ready.downcase node_ready_count = node_ready_count+1 else @@ -161,8 +163,8 @@ def process_pod_inventory_records(es) records = [] es.each{|time,record| record_count += 1 - timestamp = record['DataItems'][0]['CollectionTime'] - podUid = record['DataItems'][0]['PodUid'] + timestamp = record['CollectionTime'] + podUid = record['PodUid'] if podUids.key?(podUid) #@log.info "pod with #{podUid} already counted" @@ -170,10 +172,10 @@ def process_pod_inventory_records(es) end podUids[podUid] = true - podPhaseDimValue = record['DataItems'][0]['PodStatus'] - podNamespaceDimValue = record['DataItems'][0]['Namespace'] - podControllerNameDimValue = record['DataItems'][0]['ControllerName'] - podNodeDimValue = record['DataItems'][0]['Computer'] + podPhaseDimValue = record['PodStatus'] + podNamespaceDimValue = record['Namespace'] + podControllerNameDimValue = record['ControllerName'] + podNodeDimValue = record['Computer'] if podControllerNameDimValue.nil? || podControllerNameDimValue.empty? podControllerNameDimValue = 'No Controller' @@ -263,7 +265,7 @@ def process_pod_inventory_records(es) end def filter_stream(tag, es) - new_es = MultiEventStream.new + new_es = Fluent::MultiEventStream.new filtered_records = [] time = DateTime.now begin diff --git a/source/plugins/ruby/filter_telegraf2mdm.rb b/source/plugins/ruby/filter_telegraf2mdm.rb index 88ae428d1..fd71f1682 100644 --- a/source/plugins/ruby/filter_telegraf2mdm.rb +++ b/source/plugins/ruby/filter_telegraf2mdm.rb @@ -2,7 +2,9 @@ # frozen_string_literal: true -module Fluent +require 'fluent/plugin/filter' + +module Fluent::Plugin require "logger" require "yajl/json_gem" require_relative "oms_common" @@ -11,7 +13,7 @@ module Fluent require_relative "constants" class Telegraf2MdmFilter < Filter - Fluent::Plugin.register_filter("filter_telegraf2mdm", self) + Fluent::Plugin.register_filter("telegraf2mdm", self) config_param :enable_log, :integer, :default => 0 config_param :log_path, :string, :default => "/var/opt/microsoft/docker-cimprov/log/filter_telegraf2mdm.log" @@ -64,7 +66,7 @@ def filter(tag, time, record) end def filter_stream(tag, es) - new_es = MultiEventStream.new + new_es = Fluent::MultiEventStream.new begin es.each { |time, record| filtered_records = filter(tag, time, record) diff --git a/source/plugins/ruby/health/health_container_cpu_memory_record_formatter.rb b/source/plugins/ruby/health/health_container_cpu_memory_record_formatter.rb index 12c72a120..ebf3abd7e 100644 --- a/source/plugins/ruby/health/health_container_cpu_memory_record_formatter.rb +++ b/source/plugins/ruby/health/health_container_cpu_memory_record_formatter.rb @@ -17,10 +17,10 @@ def initialize def get_record_from_cadvisor_record(cadvisor_record) begin - instance_name = cadvisor_record['DataItems'][0]['InstanceName'] - counter_name = cadvisor_record['DataItems'][0]['Collections'][0]['CounterName'] - metric_value = cadvisor_record['DataItems'][0]['Collections'][0]['Value'] - timestamp = cadvisor_record['DataItems'][0]['Timestamp'] + instance_name = cadvisor_record['InstanceName'] + counter_name = JSON.parse(cadvisor_record['json_Collections'])[0]['CounterName'] + metric_value = JSON.parse(cadvisor_record['json_Collections'])[0]['Value'] + timestamp = cadvisor_record['Timestamp'] 
health_container_cpu_memory_record = @@health_container_cpu_memory_record_template % { instance_name: instance_name, diff --git a/source/plugins/ruby/health/health_monitor_utils.rb b/source/plugins/ruby/health/health_monitor_utils.rb index c23d8824a..58f2ecc36 100644 --- a/source/plugins/ruby/health/health_monitor_utils.rb +++ b/source/plugins/ruby/health/health_monitor_utils.rb @@ -171,8 +171,9 @@ def get_cluster_cpu_memory_capacity(log, node_inventory: nil) cpu_capacity_json = KubernetesApiClient.parseNodeLimits(node_inventory, "capacity", "cpu", "cpuCapacityNanoCores") if !cpu_capacity_json.nil? cpu_capacity_json.each do |cpu_capacity_node| - if !cpu_capacity_node['DataItems'][0]['Collections'][0]['Value'].to_s.nil? - cluster_cpu_capacity += cpu_capacity_node['DataItems'][0]['Collections'][0]['Value'] + metricVal = JSON.parse(cpu_capacity_node['json_Collections'])[0]['Value'] + if !metricVal.to_s.nil? + cluster_cpu_capacity += metricVal end end else @@ -181,8 +182,9 @@ def get_cluster_cpu_memory_capacity(log, node_inventory: nil) memory_capacity_json = KubernetesApiClient.parseNodeLimits(node_inventory, "capacity", "memory", "memoryCapacityBytes") if !memory_capacity_json.nil? memory_capacity_json.each do |memory_capacity_node| - if !memory_capacity_node['DataItems'][0]['Collections'][0]['Value'].to_s.nil? - cluster_memory_capacity += memory_capacity_node['DataItems'][0]['Collections'][0]['Value'] + metricVal = JSON.parse(memory_capacity_node['json_Collections'])[0]['Value'] + if !metricVal.to_s.nil? + cluster_memory_capacity += metricVal end end else @@ -284,7 +286,7 @@ def build_metrics_hash(metrics_to_collect) def get_health_monitor_config health_monitor_config = {} begin - file = File.open('/opt/microsoft/omsagent/plugin/healthmonitorconfig.json', "r") + file = File.open('/etc/opt/microsoft/docker-cimprov/health/healthmonitorconfig.json', "r") if !file.nil? 
fileContents = file.read health_monitor_config = JSON.parse(fileContents) diff --git a/source/plugins/ruby/in_cadvisor_perf.rb b/source/plugins/ruby/in_cadvisor_perf.rb index b706ff00a..781042cea 100644 --- a/source/plugins/ruby/in_cadvisor_perf.rb +++ b/source/plugins/ruby/in_cadvisor_perf.rb @@ -1,10 +1,11 @@ #!/usr/local/bin/ruby # frozen_string_literal: true +require 'fluent/plugin/input' -module Fluent +module Fluent::Plugin class CAdvisor_Perf_Input < Input - Plugin.register_input("cadvisorperf", self) + Fluent::Plugin.register_input("cadvisor_perf", self) def initialize super @@ -15,14 +16,15 @@ def initialize require_relative "CAdvisorMetricsAPIClient" require_relative "oms_common" require_relative "omslog" - require_relative "constants" + require_relative "constants" end config_param :run_interval, :time, :default => 60 - config_param :tag, :string, :default => "oms.api.cadvisorperf" + config_param :tag, :string, :default => "oneagent.containerInsights.LINUX_PERF_BLOB" config_param :mdmtag, :string, :default => "mdm.cadvisorperf" config_param :nodehealthtag, :string, :default => "kubehealth.DaemonSet.Node" config_param :containerhealthtag, :string, :default => "kubehealth.DaemonSet.Container" + config_param :insightsmetricstag, :string, :default => "oneagent.containerInsights.INSIGHTS_METRICS_BLOB" def configure(conf) super @@ -30,6 +32,7 @@ def configure(conf) def start if @run_interval + super @finished = false @condition = ConditionVariable.new @mutex = Mutex.new @@ -44,24 +47,23 @@ def shutdown @condition.signal } @thread.join + super # This super must be at the end of shutdown method end end def enumerate() currentTime = Time.now - time = currentTime.to_f + time = Fluent::Engine.now batchTime = currentTime.utc.iso8601 @@istestvar = ENV["ISTEST"] begin - eventStream = MultiEventStream.new - insightsMetricsEventStream = MultiEventStream.new + eventStream = Fluent::MultiEventStream.new + insightsMetricsEventStream = Fluent::MultiEventStream.new metricData = CAdvisorMetricsAPIClient.getMetrics(winNode: nil, metricTime: batchTime ) - metricData.each do |record| - record["DataType"] = "LINUX_PERF_BLOB" - record["IPName"] = "LogManagement" - eventStream.add(time, record) if record - end - + metricData.each do |record| + eventStream.add(time, record) if record + end + router.emit_stream(@tag, eventStream) if eventStream router.emit_stream(@mdmtag, eventStream) if eventStream router.emit_stream(@containerhealthtag, eventStream) if eventStream @@ -75,19 +77,13 @@ def enumerate() #start GPU InsightsMetrics items begin containerGPUusageInsightsMetricsDataItems = [] - containerGPUusageInsightsMetricsDataItems.concat(CAdvisorMetricsAPIClient.getInsightsMetrics(winNode: nil, metricTime: batchTime)) - + containerGPUusageInsightsMetricsDataItems.concat(CAdvisorMetricsAPIClient.getInsightsMetrics(winNode: nil, metricTime: batchTime)) containerGPUusageInsightsMetricsDataItems.each do |insightsMetricsRecord| - wrapper = { - "DataType" => "INSIGHTS_METRICS_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [insightsMetricsRecord.each { |k, v| insightsMetricsRecord[k] = v }], - } - insightsMetricsEventStream.add(time, wrapper) if wrapper + insightsMetricsEventStream.add(time, insightsMetricsRecord) if insightsMetricsRecord end - router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream + router.emit_stream(@insightsmetricstag, insightsMetricsEventStream) if insightsMetricsEventStream router.emit_stream(@mdmtag, 
insightsMetricsEventStream) if insightsMetricsEventStream if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0) @@ -135,6 +131,6 @@ def run_periodic @mutex.lock end @mutex.unlock - end + end end # CAdvisor_Perf_Input end # module diff --git a/source/plugins/ruby/in_containerinventory.rb b/source/plugins/ruby/in_containerinventory.rb index c1126aa4e..eebf422d6 100644 --- a/source/plugins/ruby/in_containerinventory.rb +++ b/source/plugins/ruby/in_containerinventory.rb @@ -1,9 +1,11 @@ #!/usr/local/bin/ruby # frozen_string_literal: true -module Fluent +require 'fluent/plugin/input' + +module Fluent::Plugin class Container_Inventory_Input < Input - Plugin.register_input("containerinventory", self) + Fluent::Plugin.register_input("containerinventory", self) @@PluginName = "ContainerInventory" @@ -19,7 +21,7 @@ def initialize end config_param :run_interval, :time, :default => 60 - config_param :tag, :string, :default => "oms.containerinsights.containerinventory" + config_param :tag, :string, :default => "oneagent.containerInsights.CONTAINER_INVENTORY_BLOB" def configure(conf) super @@ -27,6 +29,7 @@ def configure(conf) def start if @run_interval + super @finished = false @condition = ConditionVariable.new @mutex = Mutex.new @@ -42,17 +45,18 @@ def shutdown @condition.signal } @thread.join + super # This super must be at the end of shutdown method end end def enumerate - currentTime = Time.now - emitTime = currentTime.to_f + currentTime = Time.now batchTime = currentTime.utc.iso8601 + emitTime = Fluent::Engine.now containerInventory = Array.new - eventStream = MultiEventStream.new + eventStream = Fluent::MultiEventStream.new hostName = "" - $log.info("in_container_inventory::enumerate : Begin processing @ #{Time.now.utc.iso8601}") + $log.info("in_container_inventory::enumerate : Begin processing @ #{Time.now.utc.iso8601}") begin containerRuntimeEnv = ENV["CONTAINER_RUNTIME"] $log.info("in_container_inventory::enumerate : container runtime : #{containerRuntimeEnv}") @@ -89,13 +93,8 @@ def enumerate end end end - containerInventory.each do |record| - wrapper = { - "DataType" => "CONTAINER_INVENTORY_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [record.each { |k, v| record[k] = v }], - } - eventStream.add(emitTime, wrapper) if wrapper + containerInventory.each do |record| + eventStream.add(emitTime, record) if record end router.emit_stream(@tag, eventStream) if eventStream @@istestvar = ENV["ISTEST"] @@ -149,6 +148,6 @@ def run_periodic @mutex.lock end @mutex.unlock - end + end end # Container_Inventory_Input end # module diff --git a/source/plugins/ruby/in_kube_events.rb b/source/plugins/ruby/in_kube_events.rb index f50019a01..6f65dab92 100644 --- a/source/plugins/ruby/in_kube_events.rb +++ b/source/plugins/ruby/in_kube_events.rb @@ -1,9 +1,11 @@ #!/usr/local/bin/ruby # frozen_string_literal: true -module Fluent +require 'fluent/plugin/input' + +module Fluent::Plugin class Kube_Event_Input < Input - Plugin.register_input("kubeevents", self) + Fluent::Plugin.register_input("kube_events", self) @@KubeEventsStateFile = "/var/opt/microsoft/docker-cimprov/state/KubeEventQueryState.yaml" def initialize @@ -29,14 +31,15 @@ def initialize end config_param :run_interval, :time, :default => 60 - config_param :tag, :string, :default => "oms.containerinsights.KubeEvents" + config_param :tag, :string, :default => "oneagent.containerInsights.KUBE_EVENTS_BLOB" def configure(conf) super end - def start + def start if @run_interval + 
super if !ENV["EVENTS_CHUNK_SIZE"].nil? && !ENV["EVENTS_CHUNK_SIZE"].empty? && ENV["EVENTS_CHUNK_SIZE"].to_i > 0 @EVENTS_CHUNK_SIZE = ENV["EVENTS_CHUNK_SIZE"].to_i else @@ -70,6 +73,7 @@ def shutdown @condition.signal } @thread.join + super end end @@ -80,8 +84,8 @@ def enumerate batchTime = currentTime.utc.iso8601 eventQueryState = getEventQueryState newEventQueryState = [] - @eventsCount = 0 - + @eventsCount = 0 + # Initializing continuation token to nil continuationToken = nil $log.info("in_kube_events::enumerate : Getting events from Kube API @ #{Time.now.utc.iso8601}") @@ -127,11 +131,11 @@ def enumerate end # end enumerate def parse_and_emit_records(events, eventQueryState, newEventQueryState, batchTime = Time.utc.iso8601) - currentTime = Time.now - emitTime = currentTime.to_f + currentTime = Time.now + emitTime = Fluent::Engine.now @@istestvar = ENV["ISTEST"] begin - eventStream = MultiEventStream.new + eventStream = Fluent::MultiEventStream.new events["items"].each do |items| record = {} # - Not sure if ingestion has the below mapping for this custom type. Fix it as part of fixed type conversion @@ -162,13 +166,8 @@ def parse_and_emit_records(events, eventQueryState, newEventQueryState, batchTim record["Count"] = items["count"] record["Computer"] = nodeName record["ClusterName"] = KubernetesApiClient.getClusterName - record["ClusterId"] = KubernetesApiClient.getClusterId - wrapper = { - "DataType" => "KUBE_EVENTS_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [record.each { |k, v| record[k] = v }], - } - eventStream.add(emitTime, wrapper) if wrapper + record["ClusterId"] = KubernetesApiClient.getClusterId + eventStream.add(emitTime, record) if record @eventsCount += 1 end router.emit_stream(@tag, eventStream) if eventStream diff --git a/source/plugins/ruby/in_kube_health.rb b/source/plugins/ruby/in_kube_health.rb index 874be26f6..db981c53e 100644 --- a/source/plugins/ruby/in_kube_health.rb +++ b/source/plugins/ruby/in_kube_health.rb @@ -1,17 +1,19 @@ #!/usr/local/bin/ruby # frozen_string_literal: true +require 'fluent/plugin/input' + require_relative "KubernetesApiClient" require_relative "oms_common" require_relative "omslog" require_relative "ApplicationInsightsUtility" -module Fluent +module Fluent::Plugin Dir[File.join(__dir__, "./health", "*.rb")].each { |file| require file } class KubeHealthInput < Input include HealthModel - Plugin.register_input("kubehealth", self) + Fluent::Plugin.register_input("kube_health", self) config_param :health_monitor_config_path, :default => "/etc/opt/microsoft/docker-cimprov/health/healthmonitorconfig.json" @@ -46,6 +48,7 @@ def configure(conf) def start begin + super if @run_interval @finished = false @condition = ConditionVariable.new @@ -76,20 +79,21 @@ def shutdown @condition.signal } @thread.join + super # This super must be at the end of shutdown method end end def enumerate if !@@cluster_health_model_enabled @@hmlog.info "Cluster Health Model disabled in in_kube_health" - return MultiEventStream.new + return Fluent::MultiEventStream.new end begin - currentTime = Time.now - emitTime = currentTime.to_f + currentTime = Time.now + emitTime = Fluent::Engine.now batchTime = currentTime.utc.iso8601 health_monitor_records = [] - eventStream = MultiEventStream.new + eventStream = Fluent::MultiEventStream.new #HealthMonitorUtils.refresh_kubernetes_api_data(@@hmlog, nil) # we do this so that if the call fails, we get a response code/header etc. 
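Note on the record shape change repeated across these plugins: the old code wrapped every record in a DataType/IPName envelope and nested perf counters under DataItems/Collections, while the new code emits the inner record directly and carries the counter array as a JSON string in a json_Collections field. A minimal sketch of the two shapes follows; the timestamp, host, and instance values are placeholders, not taken from this patch:

    require "json"

    # Old shape: envelope consumed downstream via
    # record["DataItems"][0]["Collections"][0]["Value"]
    old_record = {
      "DataType" => "LINUX_PERF_BLOB",
      "IPName" => "LogManagement",
      "DataItems" => [{
        "Timestamp" => "2020-08-05T00:00:00Z",      # placeholder
        "Host" => "example-node",                   # placeholder
        "ObjectName" => "K8SNode",
        "InstanceName" => "clusterId/example-node", # placeholder
        "Collections" => [{ "CounterName" => "cpuCapacityNanoCores", "Value" => 2000000000 }],
      }],
    }

    # New shape: flat record; the counters travel as serialized JSON, so consumers
    # re-parse the string, exactly as the filters above do.
    new_record = {
      "Timestamp" => "2020-08-05T00:00:00Z",        # placeholder
      "Host" => "example-node",                     # placeholder
      "ObjectName" => "K8SNode",
      "InstanceName" => "clusterId/example-node",   # placeholder
      "json_Collections" => [{ "CounterName" => "cpuCapacityNanoCores", "Value" => 2000000000 }].to_json,
    }

    value = JSON.parse(new_record["json_Collections"])[0]["Value"] # => 2000000000

With the envelope gone, the DataType/IPName routing moves into the tag itself: streams are tagged, for example, oneagent.containerInsights.LINUX_PERF_BLOB instead of being tagged oms.api.KubePerf with LINUX_PERF_BLOB stamped on each record.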
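These files also migrate from the older flat Fluent namespace to the Fluentd v1 plugin API: each plugin requires fluent/plugin/input (or fluent/plugin/filter), lives under Fluent::Plugin, calls super in start and at the end of shutdown, stamps events with Fluent::Engine.now, and batches them into Fluent::MultiEventStream. A minimal sketch of the pattern, using a hypothetical plugin name and tag that are not part of this patch:

    require "fluent/plugin/input"

    module Fluent::Plugin
      class Example_Input < Input
        # Hypothetical registration; the real plugins register names such as
        # "kube_nodes" and "kube_podinventory".
        Fluent::Plugin.register_input("example_inventory", self)

        config_param :run_interval, :time, :default => 60
        config_param :tag, :string, :default => "oneagent.containerInsights.EXAMPLE_BLOB" # hypothetical tag

        def start
          super # v1 API: call super before creating plugin-owned threads
          @finished = false
          @thread = Thread.new(&method(:run_periodic))
        end

        def shutdown
          @finished = true
          @thread.join if @thread
          super # v1 API: super goes at the end of shutdown
        end

        def enumerate
          es = Fluent::MultiEventStream.new
          es.add(Fluent::Engine.now, { "Message" => "sample record" }) # placeholder record
          router.emit_stream(@tag, es)
        end

        def run_periodic
          until @finished
            enumerate
            sleep @run_interval
          end
        end
      end
    end

The same skeleton applies to the filter plugins in this patch, with Filter in place of Input and register_filter in place of register_input.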
diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index 99e804302..ffc11de55 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -1,17 +1,17 @@ #!/usr/local/bin/ruby # frozen_string_literal: true -module Fluent - class Kube_nodeInventory_Input < Input - Plugin.register_input("kubenodeinventory", self) +require 'fluent/plugin/input' - @@ContainerNodeInventoryTag = "oms.containerinsights.ContainerNodeInventory" - @@MDMKubeNodeInventoryTag = "mdm.kubenodeinventory" +module Fluent::Plugin + class Kube_nodeInventory_Input < Input + Fluent::Plugin.register_input("kube_nodes", self) + @@configMapMountPath = "/etc/config/settings/log-data-collection-settings" @@promConfigMountPath = "/etc/config/settings/prometheus-data-collection-settings" @@osmConfigMountPath = "/etc/config/osm-settings/osm-metric-collection-configuration" @@AzStackCloudFileName = "/etc/kubernetes/host/azurestackcloud.json" - @@kubeperfTag = "oms.api.KubePerf" + @@rsPromInterval = ENV["TELEMETRY_RS_PROM_INTERVAL"] @@rsPromFieldPassCount = ENV["TELEMETRY_RS_PROM_FIELDPASS_LENGTH"] @@ -35,7 +35,13 @@ def initialize require_relative "KubernetesApiClient" require_relative "ApplicationInsightsUtility" require_relative "oms_common" - require_relative "omslog" + require_relative "omslog" + + @ContainerNodeInventoryTag = "oneagent.containerInsights.CONTAINER_NODE_INVENTORY_BLOB" + @insightsMetricsTag = "oneagent.containerInsights.INSIGHTS_METRICS_BLOB" + @MDMKubeNodeInventoryTag = "mdm.kubenodeinventory" + @kubeperfTag = "oneagent.containerInsights.LINUX_PERF_BLOB" + # refer tomlparser-agent-config for the defaults @NODES_CHUNK_SIZE = 0 @NODES_EMIT_STREAM_BATCH_SIZE = 0 @@ -48,14 +54,15 @@ def initialize end config_param :run_interval, :time, :default => 60 - config_param :tag, :string, :default => "oms.containerinsights.KubeNodeInventory" + config_param :tag, :string, :default => "oneagent.containerInsights.KUBE_NODE_INVENTORY_BLOB" def configure(conf) super end - def start + def start if @run_interval + super if !ENV["NODES_CHUNK_SIZE"].nil? && !ENV["NODES_CHUNK_SIZE"].empty? 
&& ENV["NODES_CHUNK_SIZE"].to_i > 0 @NODES_CHUNK_SIZE = ENV["NODES_CHUNK_SIZE"].to_i else @@ -90,6 +97,7 @@ def shutdown @condition.signal } @thread.join + super # This super must be at the end of shutdown method end end @@ -101,8 +109,10 @@ def enumerate @nodesAPIE2ELatencyMs = 0 @nodeInventoryE2EProcessingLatencyMs = 0 - nodeInventoryStartTime = (Time.now.to_f * 1000).to_i + nodeInventoryStartTime = (Time.now.to_f * 1000).to_i + nodesAPIChunkStartTime = (Time.now.to_f * 1000).to_i + # Initializing continuation token to nil continuationToken = nil $log.info("in_kube_nodes::enumerate : Getting nodes from Kube API @ #{Time.now.utc.iso8601}") @@ -151,49 +161,38 @@ def enumerate def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) begin - currentTime = Time.now - emitTime = currentTime.to_f + currentTime = Time.now + emitTime = Fluent::Engine.now telemetrySent = false - eventStream = MultiEventStream.new - containerNodeInventoryEventStream = MultiEventStream.new - insightsMetricsEventStream = MultiEventStream.new - kubePerfEventStream = MultiEventStream.new + eventStream = Fluent::MultiEventStream.new + containerNodeInventoryEventStream = Fluent::MultiEventStream.new + insightsMetricsEventStream = Fluent::MultiEventStream.new + kubePerfEventStream = Fluent::MultiEventStream.new @@istestvar = ENV["ISTEST"] #get node inventory nodeInventory["items"].each do |item| # node inventory nodeInventoryRecord = getNodeInventoryRecord(item, batchTime) - wrapper = { - "DataType" => "KUBE_NODE_INVENTORY_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [nodeInventoryRecord.each { |k, v| nodeInventoryRecord[k] = v }], - } - eventStream.add(emitTime, wrapper) if wrapper + eventStream.add(emitTime, nodeInventoryRecord) if nodeInventoryRecord if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && eventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE $log.info("in_kube_node::parse_and_emit_records: number of node inventory records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") router.emit_stream(@tag, eventStream) if eventStream $log.info("in_kube_node::parse_and_emit_records: number of mdm node inventory records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") - router.emit_stream(@@MDMKubeNodeInventoryTag, eventStream) if eventStream - + router.emit_stream(@MDMKubeNodeInventoryTag, eventStream) if eventStream if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) $log.info("kubeNodeInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") end - eventStream = MultiEventStream.new + eventStream = Fluent::MultiEventStream.new end # container node inventory - containerNodeInventoryRecord = getContainerNodeInventoryRecord(item, batchTime) - containerNodeInventoryWrapper = { - "DataType" => "CONTAINER_NODE_INVENTORY_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [containerNodeInventoryRecord.each { |k, v| containerNodeInventoryRecord[k] = v }], - } - containerNodeInventoryEventStream.add(emitTime, containerNodeInventoryWrapper) if containerNodeInventoryWrapper + containerNodeInventoryRecord = getContainerNodeInventoryRecord(item, batchTime) + containerNodeInventoryEventStream.add(emitTime, containerNodeInventoryRecord) if containerNodeInventoryRecord if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && containerNodeInventoryEventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE $log.info("in_kube_node::parse_and_emit_records: number of container node inventory records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") - router.emit_stream(@@ContainerNodeInventoryTag, containerNodeInventoryEventStream) if containerNodeInventoryEventStream - containerNodeInventoryEventStream = MultiEventStream.new + router.emit_stream(@ContainerNodeInventoryTag, containerNodeInventoryEventStream) if containerNodeInventoryEventStream + containerNodeInventoryEventStream = Fluent::MultiEventStream.new if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) $log.info("containerNodeInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") end @@ -223,7 +222,8 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) nodeMetricRecords.push(nodeMetricRecord) # add data to the cache so filter_cadvisor2mdm.rb can use it if is_windows_node - @NodeCache.cpu.set_capacity(nodeMetricRecord["DataItems"][0]["Host"], nodeMetricRecord["DataItems"][0]["Collections"][0]["Value"]) + metricVal = JSON.parse(nodeMetricRecord["json_Collections"])[0]["Value"] + @NodeCache.cpu.set_capacity(nodeMetricRecord["Host"], metricVal) end end nodeMetricRecord = KubernetesApiClient.parseNodeLimitsFromNodeItem(item, "capacity", "memory", "memoryCapacityBytes", batchTime) @@ -231,18 +231,17 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) nodeMetricRecords.push(nodeMetricRecord) # add data to the cache so filter_cadvisor2mdm.rb can use it if is_windows_node - @NodeCache.mem.set_capacity(nodeMetricRecord["DataItems"][0]["Host"], nodeMetricRecord["DataItems"][0]["Collections"][0]["Value"]) + metricVal = JSON.parse(nodeMetricRecord["json_Collections"])[0]["Value"] + @NodeCache.mem.set_capacity(nodeMetricRecord["Host"], metricVal) end end - nodeMetricRecords.each do |metricRecord| - metricRecord["DataType"] = "LINUX_PERF_BLOB" - metricRecord["IPName"] = "LogManagement" + nodeMetricRecords.each do |metricRecord| kubePerfEventStream.add(emitTime, metricRecord) if metricRecord end if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && kubePerfEventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE $log.info("in_kube_nodes::parse_and_emit_records: number of node perf metric records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") - router.emit_stream(@@kubeperfTag, kubePerfEventStream) if kubePerfEventStream - kubePerfEventStream = MultiEventStream.new + router.emit_stream(@kubeperfTag, kubePerfEventStream) if kubePerfEventStream + kubePerfEventStream = Fluent::MultiEventStream.new if (!@@istestvar.nil? 
&& !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) $log.info("kubeNodePerfEmitStreamSuccess @ #{Time.now.utc.iso8601}") end @@ -266,18 +265,13 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) if !insightsMetricsRecord.nil? && !insightsMetricsRecord.empty? nodeGPUInsightsMetricsRecords.push(insightsMetricsRecord) end - nodeGPUInsightsMetricsRecords.each do |insightsMetricsRecord| - wrapper = { - "DataType" => "INSIGHTS_METRICS_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [insightsMetricsRecord.each { |k, v| insightsMetricsRecord[k] = v }], - } - insightsMetricsEventStream.add(emitTime, wrapper) if wrapper + nodeGPUInsightsMetricsRecords.each do |insightsMetricsRecord| + insightsMetricsEventStream.add(emitTime, insightsMetricsRecord) if insightsMetricsRecord end if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && insightsMetricsEventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE $log.info("in_kube_nodes::parse_and_emit_records: number of GPU node perf metric records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") - router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream - insightsMetricsEventStream = MultiEventStream.new + router.emit_stream(@insightsMetricsTag, insightsMetricsEventStream) if insightsMetricsEventStream + insightsMetricsEventStream = Fluent::MultiEventStream.new if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) $log.info("kubeNodeInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") end @@ -337,15 +331,15 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) $log.info("in_kube_node::parse_and_emit_records: number of node inventory records emitted #{eventStream.count} @ #{Time.now.utc.iso8601}") router.emit_stream(@tag, eventStream) if eventStream $log.info("in_kube_node::parse_and_emit_records: number of mdm node inventory records emitted #{eventStream.count} @ #{Time.now.utc.iso8601}") - router.emit_stream(@@MDMKubeNodeInventoryTag, eventStream) if eventStream + router.emit_stream(@MDMKubeNodeInventoryTag, eventStream) if eventStream if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) $log.info("kubeNodeInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") end - eventStream = nil + eventStream = nil end if containerNodeInventoryEventStream.count > 0 $log.info("in_kube_node::parse_and_emit_records: number of container node inventory records emitted #{containerNodeInventoryEventStream.count} @ #{Time.now.utc.iso8601}") - router.emit_stream(@@ContainerNodeInventoryTag, containerNodeInventoryEventStream) if containerNodeInventoryEventStream + router.emit_stream(@ContainerNodeInventoryTag, containerNodeInventoryEventStream) if containerNodeInventoryEventStream containerNodeInventoryEventStream = nil if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) $log.info("containerNodeInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") @@ -354,7 +348,7 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) if kubePerfEventStream.count > 0 $log.info("in_kube_nodes::parse_and_emit_records: number of node perf metric records emitted #{kubePerfEventStream.count} @ #{Time.now.utc.iso8601}") - router.emit_stream(@@kubeperfTag, kubePerfEventStream) if kubePerfEventStream + router.emit_stream(@kubeperfTag, kubePerfEventStream) if kubePerfEventStream kubePerfEventStream = nil if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) $log.info("kubeNodePerfInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") @@ -362,7 +356,7 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) end if insightsMetricsEventStream.count > 0 $log.info("in_kube_nodes::parse_and_emit_records: number of GPU node perf metric records emitted #{insightsMetricsEventStream.count} @ #{Time.now.utc.iso8601}") - router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream + router.emit_stream(@insightsMetricsTag, insightsMetricsEventStream) if insightsMetricsEventStream insightsMetricsEventStream = nil if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) $log.info("kubeNodeInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") @@ -513,10 +507,8 @@ def getNodeTelemetryProps(item) $log.warn "in_kube_nodes::getContainerNodeIngetNodeTelemetryPropsventoryRecord:Failed: #{errorStr}" end return properties - end + end end # Kube_Node_Input - - class NodeStatsCache # inner class for caching implementation (CPU and memory caching is handled the exact same way, so logic to do so is moved to a private inner class) # (to reduce code duplication) @@ -586,6 +578,5 @@ def cpu() def mem() return @@memCache end - end - + end end # module diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 5256eb159..5598602cd 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -1,16 +1,17 @@ #!/usr/local/bin/ruby # frozen_string_literal: true -module Fluent +require 'fluent/plugin/input' + +module Fluent::Plugin require_relative "podinventory_to_mdm" class Kube_PodInventory_Input < Input - Plugin.register_input("kubepodinventory", self) + Fluent::Plugin.register_input("kube_podinventory", self) @@MDMKubePodInventoryTag = "mdm.kubepodinventory" @@hostName = (OMS::Common.get_hostname) - @@kubeperfTag = "oms.api.KubePerf" - @@kubeservicesTag = "oms.containerinsights.KubeServices" + def initialize super @@ -38,19 +39,25 @@ def initialize @winContainerCount = 0 @controllerData = {} @podInventoryE2EProcessingLatencyMs = 0 - @podsAPIE2ELatencyMs = 0 + @podsAPIE2ELatencyMs = 0 + + @kubeperfTag = "oneagent.containerInsights.LINUX_PERF_BLOB" + @kubeservicesTag = "oneagent.containerInsights.KUBE_SERVICES_BLOB" + @containerInventoryTag = "oneagent.containerInsights.CONTAINER_INVENTORY_BLOB" + @insightsMetricsTag = "oneagent.containerInsights.INSIGHTS_METRICS_BLOB" end config_param :run_interval, :time, :default => 60 - config_param :tag, :string, :default => "oms.containerinsights.KubePodInventory" + config_param :tag, :string, :default => "oneagent.containerInsights.KUBE_POD_INVENTORY_BLOB" def configure(conf) super @inventoryToMdmConvertor = Inventory2MdmConvertor.new() end - def start + def start if @run_interval + super if !ENV["PODS_CHUNK_SIZE"].nil? && !ENV["PODS_CHUNK_SIZE"].empty? && ENV["PODS_CHUNK_SIZE"].to_i > 0 @PODS_CHUNK_SIZE = ENV["PODS_CHUNK_SIZE"].to_i else @@ -58,7 +65,7 @@ def start $log.warn("in_kube_podinventory::start: setting to default value since got PODS_CHUNK_SIZE nil or empty") @PODS_CHUNK_SIZE = 1000 end - $log.info("in_kube_podinventory::start : PODS_CHUNK_SIZE @ #{@PODS_CHUNK_SIZE}") + $log.info("in_kube_podinventory::start: PODS_CHUNK_SIZE @ #{@PODS_CHUNK_SIZE}") if !ENV["PODS_EMIT_STREAM_BATCH_SIZE"].nil? && !ENV["PODS_EMIT_STREAM_BATCH_SIZE"].empty? 
&& ENV["PODS_EMIT_STREAM_BATCH_SIZE"].to_i > 0 @PODS_EMIT_STREAM_BATCH_SIZE = ENV["PODS_EMIT_STREAM_BATCH_SIZE"].to_i @@ -67,8 +74,7 @@ def start $log.warn("in_kube_podinventory::start: setting to default value since got PODS_EMIT_STREAM_BATCH_SIZE nil or empty") @PODS_EMIT_STREAM_BATCH_SIZE = 200 end - $log.info("in_kube_podinventory::start : PODS_EMIT_STREAM_BATCH_SIZE @ #{@PODS_EMIT_STREAM_BATCH_SIZE}") - + $log.info("in_kube_podinventory::start: PODS_EMIT_STREAM_BATCH_SIZE @ #{@PODS_EMIT_STREAM_BATCH_SIZE}") @finished = false @condition = ConditionVariable.new @mutex = Mutex.new @@ -84,6 +90,7 @@ def shutdown @condition.signal } @thread.join + super # This super must be at the end of shutdown method end end @@ -100,7 +107,8 @@ def enumerate(podList = nil) batchTime = currentTime.utc.iso8601 serviceRecords = [] @podInventoryE2EProcessingLatencyMs = 0 - podInventoryStartTime = (Time.now.to_f * 1000).to_i + podInventoryStartTime = (Time.now.to_f * 1000).to_i + # Get services first so that we dont need to make a call for very chunk $log.info("in_kube_podinventory::enumerate : Getting services from Kube API @ #{Time.now.utc.iso8601}") serviceInfo = KubernetesApiClient.getKubeResourceInfo("services") @@ -189,12 +197,13 @@ def enumerate(podList = nil) end def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batchTime = Time.utc.iso8601) - currentTime = Time.now - emitTime = currentTime.to_f + currentTime = Time.now + emitTime = Fluent::Engine.now #batchTime = currentTime.utc.iso8601 - eventStream = MultiEventStream.new - kubePerfEventStream = MultiEventStream.new - insightsMetricsEventStream = MultiEventStream.new + eventStream = Fluent::MultiEventStream.new + containerInventoryStream = Fluent::MultiEventStream.new + kubePerfEventStream = Fluent::MultiEventStream.new + insightsMetricsEventStream = Fluent::MultiEventStream.new @@istestvar = ENV["ISTEST"] begin #begin block start @@ -205,13 +214,8 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc podInventoryRecords = getPodInventoryRecords(item, serviceRecords, batchTime) podInventoryRecords.each do |record| if !record.nil? - wrapper = { - "DataType" => "KUBE_POD_INVENTORY_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [record.each { |k, v| record[k] = v }], - } - eventStream.add(emitTime, wrapper) if wrapper - @inventoryToMdmConvertor.process_pod_inventory_record(wrapper) + eventStream.add(emitTime, record) if record + @inventoryToMdmConvertor.process_pod_inventory_record(record) end end # Setting this flag to true so that we can send ContainerInventory records for containers @@ -228,13 +232,8 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc # Send container inventory records for containers on windows nodes @winContainerCount += containerInventoryRecords.length containerInventoryRecords.each do |cirecord| - if !cirecord.nil? - ciwrapper = { - "DataType" => "CONTAINER_INVENTORY_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [cirecord.each { |k, v| cirecord[k] = v }], - } - eventStream.add(emitTime, ciwrapper) if ciwrapper + if !cirecord.nil? 
+ containerInventoryStream.add(emitTime, cirecord) if cirecord end end end @@ -246,7 +245,7 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc $log.info("kubePodInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") end router.emit_stream(@tag, eventStream) if eventStream - eventStream = MultiEventStream.new + eventStream = Fluent::MultiEventStream.new end #container perf records @@ -256,19 +255,17 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "limits", "cpu", "cpuLimitNanoCores", batchTime)) containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "limits", "memory", "memoryLimitBytes", batchTime)) - containerMetricDataItems.each do |record| - record["DataType"] = "LINUX_PERF_BLOB" - record["IPName"] = "LogManagement" + containerMetricDataItems.each do |record| kubePerfEventStream.add(emitTime, record) if record end if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && kubePerfEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE $log.info("in_kube_podinventory::parse_and_emit_records: number of container perf records emitted #{@PODS_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") - router.emit_stream(@@kubeperfTag, kubePerfEventStream) if kubePerfEventStream + router.emit_stream(@kubeperfTag, kubePerfEventStream) if kubePerfEventStream if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) $log.info("kubeContainerPerfEventEmitStreamSuccess @ #{Time.now.utc.iso8601}") end - kubePerfEventStream = MultiEventStream.new + kubePerfEventStream = Fluent::MultiEventStream.new end # container GPU records @@ -277,13 +274,8 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "limits", "nvidia.com/gpu", "containerGpuLimits", batchTime)) containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "requests", "amd.com/gpu", "containerGpuRequests", batchTime)) containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "limits", "amd.com/gpu", "containerGpuLimits", batchTime)) - containerGPUInsightsMetricsDataItems.each do |insightsMetricsRecord| - wrapper = { - "DataType" => "INSIGHTS_METRICS_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [insightsMetricsRecord.each { |k, v| insightsMetricsRecord[k] = v }], - } - insightsMetricsEventStream.add(emitTime, wrapper) if wrapper + containerGPUInsightsMetricsDataItems.each do |insightsMetricsRecord| + insightsMetricsEventStream.add(emitTime, insightsMetricsRecord) if insightsMetricsRecord end if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && insightsMetricsEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE @@ -291,8 +283,8 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) $log.info("kubePodInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") end - router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream - insightsMetricsEventStream = MultiEventStream.new + router.emit_stream(@insightsMetricsTag, insightsMetricsEventStream) if insightsMetricsEventStream + insightsMetricsEventStream = Fluent::MultiEventStream.new end end #podInventory block end @@ -305,9 +297,18 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc eventStream = nil end + if containerInventoryStream.count > 0 + $log.info("in_kube_podinventory::parse_and_emit_records: number of windows container inventory records emitted #{containerInventoryStream.count} @ #{Time.now.utc.iso8601}") + router.emit_stream(@containerInventoryTag, containerInventoryStream) if containerInventoryStream + if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) + $log.info("kubeWindowsContainerInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end + containerInventoryStream = nil + end + if kubePerfEventStream.count > 0 $log.info("in_kube_podinventory::parse_and_emit_records: number of perf records emitted #{kubePerfEventStream.count} @ #{Time.now.utc.iso8601}") - router.emit_stream(@@kubeperfTag, kubePerfEventStream) if kubePerfEventStream + router.emit_stream(@kubeperfTag, kubePerfEventStream) if kubePerfEventStream kubePerfEventStream = nil if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) $log.info("kubeContainerPerfEventEmitStreamSuccess @ #{Time.now.utc.iso8601}") @@ -316,7 +317,7 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc if insightsMetricsEventStream.count > 0 $log.info("in_kube_podinventory::parse_and_emit_records: number of insights metrics records emitted #{insightsMetricsEventStream.count} @ #{Time.now.utc.iso8601}") - router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream + router.emit_stream(@insightsMetricsTag, insightsMetricsEventStream) if insightsMetricsEventStream if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) $log.info("kubePodInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") end @@ -327,7 +328,7 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc @log.info "Sending pod inventory mdm records to out_mdm" pod_inventory_mdm_records = @inventoryToMdmConvertor.get_pod_inventory_mdm_records(batchTime) @log.info "pod_inventory_mdm_records.size #{pod_inventory_mdm_records.size}" - mdm_pod_inventory_es = MultiEventStream.new + mdm_pod_inventory_es = Fluent::MultiEventStream.new pod_inventory_mdm_records.each { |pod_inventory_mdm_record| mdm_pod_inventory_es.add(batchTime, pod_inventory_mdm_record) if pod_inventory_mdm_record } if pod_inventory_mdm_records @@ -335,22 +336,17 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc end if continuationToken.nil? # sending kube services inventory records - kubeServicesEventStream = MultiEventStream.new + kubeServicesEventStream = Fluent::MultiEventStream.new serviceRecords.each do |kubeServiceRecord| if !kubeServiceRecord.nil? 
# adding before emit to reduce memory foot print kubeServiceRecord["ClusterId"] = KubernetesApiClient.getClusterId - kubeServiceRecord["ClusterName"] = KubernetesApiClient.getClusterName - kubeServicewrapper = { - "DataType" => "KUBE_SERVICES_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [kubeServiceRecord.each { |k, v| kubeServiceRecord[k] = v }], - } - kubeServicesEventStream.add(emitTime, kubeServicewrapper) if kubeServicewrapper + kubeServiceRecord["ClusterName"] = KubernetesApiClient.getClusterName + kubeServicesEventStream.add(emitTime, kubeServiceRecord) if kubeServiceRecord if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && kubeServicesEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE $log.info("in_kube_podinventory::parse_and_emit_records: number of service records emitted #{@PODS_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") - router.emit_stream(@@kubeservicesTag, kubeServicesEventStream) if kubeServicesEventStream - kubeServicesEventStream = MultiEventStream.new + router.emit_stream(@kubeservicesTag, kubeServicesEventStream) if kubeServicesEventStream + kubeServicesEventStream = Fluent::MultiEventStream.new if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) $log.info("kubeServicesEventEmitStreamSuccess @ #{Time.now.utc.iso8601}") end @@ -360,7 +356,7 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc if kubeServicesEventStream.count > 0 $log.info("in_kube_podinventory::parse_and_emit_records : number of service records emitted #{kubeServicesEventStream.count} @ #{Time.now.utc.iso8601}") - router.emit_stream(@@kubeservicesTag, kubeServicesEventStream) if kubeServicesEventStream + router.emit_stream(@kubeservicesTag, kubeServicesEventStream) if kubeServicesEventStream if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) $log.info("kubeServicesEventEmitStreamSuccess @ #{Time.now.utc.iso8601}") end @@ -652,6 +648,6 @@ def getServiceNameFromLabels(namespace, labels, serviceRecords) ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) end return serviceName - end + end end # Kube_Pod_Input end # module diff --git a/source/plugins/ruby/in_kube_pvinventory.rb b/source/plugins/ruby/in_kube_pvinventory.rb index 4efe86f61..40eebac8a 100644 --- a/source/plugins/ruby/in_kube_pvinventory.rb +++ b/source/plugins/ruby/in_kube_pvinventory.rb @@ -1,6 +1,11 @@ -module Fluent +#!/usr/local/bin/ruby +# frozen_string_literal: true + +require 'fluent/plugin/input' + +module Fluent::Plugin class Kube_PVInventory_Input < Input - Plugin.register_input("kubepvinventory", self) + Fluent::Plugin.register_input("kube_pvinventory", self) @@hostName = (OMS::Common.get_hostname) @@ -22,14 +27,15 @@ def initialize end config_param :run_interval, :time, :default => 60 - config_param :tag, :string, :default => "oms.containerinsights.KubePVInventory" + config_param :tag, :string, :default => "oneagent.containerInsights.KUBE_PV_INVENTORY_BLOB" def configure(conf) super end - def start + def start if @run_interval + super @finished = false @condition = ConditionVariable.new @mutex = Mutex.new @@ -45,6 +51,7 @@ def shutdown @condition.signal } @thread.join + super end end @@ -54,7 +61,7 @@ def enumerate telemetryFlush = false @pvTypeToCountHash = {} currentTime = Time.now - batchTime = currentTime.utc.iso8601 + batchTime = currentTime.utc.iso8601 continuationToken = nil $log.info("in_kube_pvinventory::enumerate : Getting PVs from Kube API @ #{Time.now.utc.iso8601}") @@ -103,9 +110,9 @@ def enumerate end # end enumerate def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) - currentTime = Time.now - emitTime = currentTime.to_f - eventStream = MultiEventStream.new + currentTime = Time.now + emitTime = Fluent::Engine.now + eventStream = Fluent::MultiEventStream.new @@istestvar = ENV["ISTEST"] begin records = [] @@ -145,13 +152,8 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) end records.each do |record| - if !record.nil? - wrapper = { - "DataType" => "KUBE_PV_INVENTORY_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [record.each { |k, v| record[k] = v }], - } - eventStream.add(emitTime, wrapper) if wrapper + if !record.nil? 
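
Each input plugin in this series follows the same fluentd v1 migration recipe visible here: require 'fluent/plugin/input', move the class under Fluent::Plugin, register a snake_case plugin name, call super first in start and last in shutdown, and switch to the namespaced Fluent::MultiEventStream and Fluent::Engine.now. A minimal skeleton of the migrated shape, using hypothetical names (dummy_inventory, fetch_records) rather than any real plugin:

    #!/usr/local/bin/ruby
    # frozen_string_literal: true

    require "fluent/plugin/input"

    module Fluent::Plugin
      class DummyInventoryInput < Input
        # v1 API: register under Fluent::Plugin with a snake_case name
        Fluent::Plugin.register_input("dummy_inventory", self)

        config_param :run_interval, :time, :default => 60
        config_param :tag, :string, :default => "oneagent.containerInsights.EXAMPLE_BLOB"

        def start
          super # v1 plugins must call super before spawning worker threads
          @finished = false
          @thread = Thread.new(&method(:run_periodic))
        end

        def shutdown
          @finished = true
          @thread.join
          super # and super last in shutdown
        end

        def run_periodic
          until @finished
            es = Fluent::MultiEventStream.new # namespaced in the v1 API
            emit_time = Fluent::Engine.now    # event time, not Time.now.to_f
            fetch_records.each { |r| es.add(emit_time, r) if r }
            router.emit_stream(@tag, es)
            sleep @run_interval
          end
        end

        def fetch_records
          [{ "Computer" => "example-node" }] # stand-in for the real Kube API call
        end
      end
    end
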
+ eventStream.add(emitTime, record) end end @@ -250,7 +252,6 @@ def run_periodic @mutex.lock end @mutex.unlock - end - + end end # Kube_PVInventory_Input -end # module \ No newline at end of file +end # module diff --git a/source/plugins/ruby/in_kubestate_deployments.rb b/source/plugins/ruby/in_kubestate_deployments.rb index 27e4709a2..182c3ffc1 100644 --- a/source/plugins/ruby/in_kubestate_deployments.rb +++ b/source/plugins/ruby/in_kubestate_deployments.rb @@ -1,9 +1,11 @@ #!/usr/local/bin/ruby # frozen_string_literal: true -module Fluent +require 'fluent/plugin/input' + +module Fluent::Plugin class Kube_Kubestate_Deployments_Input < Input - Plugin.register_input("kubestatedeployments", self) + Fluent::Plugin.register_input("kubestate_deployments", self) @@istestvar = ENV["ISTEST"] # telemetry - To keep telemetry cost reasonable, we keep track of the max deployments over a period of 15m @@deploymentsCount = 0 @@ -36,14 +38,15 @@ def initialize end config_param :run_interval, :time, :default => 60 - config_param :tag, :string, :default => Constants::INSIGHTSMETRICS_FLUENT_TAG + config_param :tag, :string, :default => "oneagent.containerInsights.INSIGHTS_METRICS_BLOB" def configure(conf) super end - def start + def start if @run_interval + super if !ENV["DEPLOYMENTS_CHUNK_SIZE"].nil? && !ENV["DEPLOYMENTS_CHUNK_SIZE"].empty? && ENV["DEPLOYMENTS_CHUNK_SIZE"].to_i > 0 @DEPLOYMENTS_CHUNK_SIZE = ENV["DEPLOYMENTS_CHUNK_SIZE"].to_i else @@ -52,11 +55,11 @@ def start @DEPLOYMENTS_CHUNK_SIZE = 500 end $log.info("in_kubestate_deployments::start : DEPLOYMENTS_CHUNK_SIZE @ #{@DEPLOYMENTS_CHUNK_SIZE}") - + @finished = false @condition = ConditionVariable.new @mutex = Mutex.new - @thread = Thread.new(&method(:run_periodic)) + @thread = Thread.new(&method(:run_periodic)) end end @@ -67,6 +70,7 @@ def shutdown @condition.signal } @thread.join + super # This super must be at the end of shutdown method end end @@ -77,8 +81,8 @@ def enumerate batchTime = currentTime.utc.iso8601 #set the running total for this batch to 0 - @deploymentsRunningTotal = 0 - + @deploymentsRunningTotal = 0 + # Initializing continuation token to nil continuationToken = nil $log.info("in_kubestate_deployments::enumerate : Getting deployments from Kube API @ #{Time.now.utc.iso8601}") @@ -126,7 +130,7 @@ def enumerate def parse_and_emit_records(deployments, batchTime = Time.utc.iso8601) metricItems = [] - insightsMetricsEventStream = MultiEventStream.new + insightsMetricsEventStream = Fluent::MultiEventStream.new begin metricInfo = deployments metricInfo["items"].each do |deployment| @@ -181,17 +185,12 @@ def parse_and_emit_records(deployments, batchTime = Time.utc.iso8601) metricItems.push(metricItem) end - time = Time.now.to_f - metricItems.each do |insightsMetricsRecord| - wrapper = { - "DataType" => "INSIGHTS_METRICS_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [insightsMetricsRecord.each { |k, v| insightsMetricsRecord[k] = v }], - } - insightsMetricsEventStream.add(time, wrapper) if wrapper + time = Fluent::Engine.now + metricItems.each do |insightsMetricsRecord| + insightsMetricsEventStream.add(time, insightsMetricsRecord) if insightsMetricsRecord end - router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream + router.emit_stream(@tag, insightsMetricsEventStream) if insightsMetricsEventStream $log.info("successfully emitted #{metricItems.length()} kube_state_deployment metrics") @deploymentsRunningTotal = @deploymentsRunningTotal + metricItems.length() @@ 
-234,6 +233,6 @@ def run_periodic @mutex.lock end @mutex.unlock - end + end end end diff --git a/source/plugins/ruby/in_kubestate_hpa.rb b/source/plugins/ruby/in_kubestate_hpa.rb index afecf8e3b..8f60bfb72 100644 --- a/source/plugins/ruby/in_kubestate_hpa.rb +++ b/source/plugins/ruby/in_kubestate_hpa.rb @@ -1,9 +1,11 @@ #!/usr/local/bin/ruby # frozen_string_literal: true -module Fluent +require 'fluent/plugin/input' + +module Fluent::Plugin class Kube_Kubestate_HPA_Input < Input - Plugin.register_input("kubestatehpa", self) + Fluent::Plugin.register_input("kubestate_hpa", self) @@istestvar = ENV["ISTEST"] def initialize @@ -16,7 +18,7 @@ def initialize require_relative "oms_common" require_relative "omslog" require_relative "ApplicationInsightsUtility" - require_relative "constants" + require_relative "constants" # refer tomlparser-agent-config for defaults # this configurable via configmap @@ -33,14 +35,15 @@ def initialize end config_param :run_interval, :time, :default => 60 - config_param :tag, :string, :default => Constants::INSIGHTSMETRICS_FLUENT_TAG + config_param :tag, :string, :default => "oneagent.containerInsights.INSIGHTS_METRICS_BLOB" def configure(conf) super end - def start + def start if @run_interval + super if !ENV["HPA_CHUNK_SIZE"].nil? && !ENV["HPA_CHUNK_SIZE"].empty? && ENV["HPA_CHUNK_SIZE"].to_i > 0 @HPA_CHUNK_SIZE = ENV["HPA_CHUNK_SIZE"].to_i else @@ -64,6 +67,7 @@ def shutdown @condition.signal } @thread.join + super end end @@ -74,7 +78,7 @@ def enumerate batchTime = currentTime.utc.iso8601 @hpaCount = 0 - + # Initializing continuation token to nil continuationToken = nil $log.info("in_kubestate_hpa::enumerate : Getting HPAs from Kube API @ #{Time.now.utc.iso8601}") @@ -113,7 +117,7 @@ def enumerate def parse_and_emit_records(hpas, batchTime = Time.utc.iso8601) metricItems = [] - insightsMetricsEventStream = MultiEventStream.new + insightsMetricsEventStream = Fluent::MultiEventStream.new begin metricInfo = hpas metricInfo["items"].each do |hpa| @@ -181,17 +185,12 @@ def parse_and_emit_records(hpas, batchTime = Time.utc.iso8601) metricItems.push(metricItem) end - time = Time.now.to_f - metricItems.each do |insightsMetricsRecord| - wrapper = { - "DataType" => "INSIGHTS_METRICS_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [insightsMetricsRecord.each { |k, v| insightsMetricsRecord[k] = v }], - } - insightsMetricsEventStream.add(time, wrapper) if wrapper + time = Fluent::Engine.now + metricItems.each do |insightsMetricsRecord| + insightsMetricsEventStream.add(time, insightsMetricsRecord) if insightsMetricsRecord end - router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream + router.emit_stream(@tag, insightsMetricsEventStream) if insightsMetricsEventStream $log.info("successfully emitted #{metricItems.length()} kube_state_hpa metrics") if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0) $log.info("kubestatehpaInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") @@ -232,6 +231,6 @@ def run_periodic @mutex.lock end @mutex.unlock - end + end end end diff --git a/source/plugins/ruby/in_win_cadvisor_perf.rb b/source/plugins/ruby/in_win_cadvisor_perf.rb index 9c267cf4f..61e823ea6 100644 --- a/source/plugins/ruby/in_win_cadvisor_perf.rb +++ b/source/plugins/ruby/in_win_cadvisor_perf.rb @@ -1,9 +1,11 @@ #!/usr/local/bin/ruby # frozen_string_literal: true -module Fluent +require 'fluent/plugin/input' + +module Fluent::Plugin class Win_CAdvisor_Perf_Input < Input - Plugin.register_input("wincadvisorperf", self) + Fluent::Plugin.register_input("win_cadvisor_perf", self) @@winNodes = [] @@ -18,10 +20,11 @@ def initialize require_relative "oms_common" require_relative "omslog" require_relative "constants" + @insightsMetricsTag = "oneagent.containerInsights.INSIGHTS_METRICS_BLOB" end config_param :run_interval, :time, :default => 60 - config_param :tag, :string, :default => "oms.api.wincadvisorperf" + config_param :tag, :string, :default => "oneagent.containerInsights.LINUX_PERF_BLOB" config_param :mdmtag, :string, :default => "mdm.cadvisorperf" def configure(conf) @@ -50,11 +53,11 @@ def shutdown end def enumerate() - time = Time.now.to_f + time = Fluent::Engine.now begin timeDifference = (DateTime.now.to_time.to_i - @@winNodeQueryTimeTracker).abs timeDifferenceInMinutes = timeDifference / 60 - @@istestvar = ENV["ISTEST"] + @@istestvar = ENV["ISTEST"] #Resetting this cache so that it is populated with the current set of containers with every call CAdvisorMetricsAPIClient.resetWinContainerIdCache() @@ -68,12 +71,10 @@ def enumerate() @@winNodeQueryTimeTracker = DateTime.now.to_time.to_i end @@winNodes.each do |winNode| - eventStream = MultiEventStream.new + eventStream = Fluent::MultiEventStream.new metricData = CAdvisorMetricsAPIClient.getMetrics(winNode: winNode, metricTime: Time.now.utc.iso8601) metricData.each do |record| if !record.empty? - record["DataType"] = "LINUX_PERF_BLOB" - record["IPName"] = "LogManagement" eventStream.add(time, record) if record end end @@ -88,18 +89,13 @@ def enumerate() begin containerGPUusageInsightsMetricsDataItems = [] containerGPUusageInsightsMetricsDataItems.concat(CAdvisorMetricsAPIClient.getInsightsMetrics(winNode: winNode, metricTime: Time.now.utc.iso8601)) - insightsMetricsEventStream = MultiEventStream.new + insightsMetricsEventStream = Fluent::MultiEventStream.new containerGPUusageInsightsMetricsDataItems.each do |insightsMetricsRecord| - wrapper = { - "DataType" => "INSIGHTS_METRICS_BLOB", - "IPName" => "ContainerInsights", - "DataItems" => [insightsMetricsRecord.each { |k, v| insightsMetricsRecord[k] = v }], - } - insightsMetricsEventStream.add(time, wrapper) if wrapper + insightsMetricsEventStream.add(time, insightsMetricsRecord) if insightsMetricsRecord end - router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream + router.emit_stream(@insightsMetricsTag, insightsMetricsEventStream) if insightsMetricsEventStream router.emit_stream(@mdmtag, insightsMetricsEventStream) if insightsMetricsEventStream if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0) $log.info("winCAdvisorInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") diff --git a/source/plugins/ruby/out_health_forward.rb b/source/plugins/ruby/out_health_forward.rb index 6fcfe368b..59eed97da 100644 --- a/source/plugins/ruby/out_health_forward.rb +++ b/source/plugins/ruby/out_health_forward.rb @@ -15,469 +15,593 @@ # limitations under the License. # -require 'base64' -require 'socket' -require 'fileutils' - -require 'cool.io' - require 'fluent/output' require 'fluent/config/error' +require 'fluent/clock' +require 'fluent/tls' +require 'base64' +require 'forwardable' -module Fluent - class ForwardOutputError < StandardError - end - - class ForwardOutputResponseError < ForwardOutputError - end +require 'fluent/compat/socket_util' +require 'fluent/plugin/out_forward/handshake_protocol' +require 'fluent/plugin/out_forward/load_balancer' +require 'fluent/plugin/out_forward/socket_cache' +require 'fluent/plugin/out_forward/failure_detector' +require 'fluent/plugin/out_forward/error' +require 'fluent/plugin/out_forward/connection_manager' +require 'fluent/plugin/out_forward/ack_handler' - class ForwardOutputConnectionClosedError < ForwardOutputError - end +module Fluent::Plugin + class HealthForwardOutput < Output + Fluent::Plugin.register_output('health_forward', self) - class ForwardOutputACKTimeoutError < ForwardOutputResponseError - end + helpers :socket, :server, :timer, :thread, :compat_parameters, :service_discovery - class HealthForwardOutput < ObjectBufferedOutput - Plugin.register_output('health_forward', self) + LISTEN_PORT = 25227 - def initialize - super - require 'fluent/plugin/socket_util' - @nodes = [] #=> [Node] - end + desc 'The transport protocol.' + config_param :transport, :enum, list: [:tcp, :tls], default: :tcp + # TODO: TLS session cache/tickets desc 'The timeout time when sending event logs.' config_param :send_timeout, :time, default: 60 - desc 'The transport protocol to use for heartbeats.(udp,tcp,none)' - config_param :heartbeat_type, default: :udp do |val| - case val.downcase - when 'tcp' - :tcp - when 'udp' - :udp - when 'none' - :none - else - raise ConfigError, "forward output heartbeat type should be 'tcp', 'udp', or 'none'" - end - end + desc 'The timeout time for socket connect' + config_param :connect_timeout, :time, default: nil + # TODO: add linger_timeout, recv_timeout + + desc 'The protocol to use for heartbeats (default is the same with "transport").' + config_param :heartbeat_type, :enum, list: [:transport, :tcp, :udp, :none], default: :transport desc 'The interval of the heartbeat packer.' config_param :heartbeat_interval, :time, default: 1 desc 'The wait time before accepting a server fault recovery.' config_param :recover_wait, :time, default: 10 desc 'The hard timeout used to detect server failure.' config_param :hard_timeout, :time, default: 60 - desc 'Set TTL to expire DNS cache in seconds.' - config_param :expire_dns_cache, :time, default: nil # 0 means disable cache desc 'The threshold parameter used to detect server faults.' config_param :phi_threshold, :integer, default: 16 desc 'Use the "Phi accrual failure detector" to detect server failure.' config_param :phi_failure_detector, :bool, default: true - # if any options added that requires extended forward api, fix @extend_internal_protocol - desc 'Change the protocol to at-least-once.' 
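
The phi_threshold and phi_failure_detector parameters above keep fluentd's "phi accrual" failure detection: every heartbeat gap feeds a weighted moving average, and a node is detached once the time since its last heartbeat, scored against that average, crosses the threshold (16 by default). A condensed sketch of the arithmetic, working in seconds instead of the microsecond window the plugin keeps internally:

    PHI_FACTOR = 1.0 / Math.log(10.0)

    # gaps: recent heartbeat intervals (s); elapsed: seconds since the last beat
    def phi(gaps, elapsed, heartbeat_interval = 1.0)
      return 0.0 if gaps.empty?
      # weight newer samples more heavily (weights 1..n), as the plugin does
      weighted_mean = gaps.each_with_index.sum { |g, i| g * (i + 1) } / (1..gaps.size).sum.to_f
      mean = weighted_mean - heartbeat_interval + 1 # normalize intervals toward 1s
      t = elapsed - heartbeat_interval + 1
      PHI_FACTOR * t / mean
    end

    phi([1.0, 1.1, 0.9], 5.0)  # => ~2.2, suspicious but under the default threshold of 16
    phi([1.0, 1.1, 0.9], 40.0) # => ~17.7, past the threshold, so the node is detached
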
config_param :require_ack_response, :bool, default: false # require in_forward to respond with ack - desc 'This option is used when require_ack_response is true.' - config_param :ack_response_timeout, :time, default: 190 # 0 means do not wait for ack responses + + ## The reason of default value of :ack_response_timeout: # Linux default tcp_syn_retries is 5 (in many environment) # 3 + 6 + 12 + 24 + 48 + 96 -> 189 (sec) + desc 'This option is used when require_ack_response is true.' + config_param :ack_response_timeout, :time, default: 190 + + desc 'The interval while reading data from server' + config_param :read_interval_msec, :integer, default: 50 # 50ms + desc 'Reading data size from server' + config_param :read_length, :size, default: 512 # 512bytes + + desc 'Set TTL to expire DNS cache in seconds.' + config_param :expire_dns_cache, :time, default: nil # 0 means disable cache desc 'Enable client-side DNS round robin.' config_param :dns_round_robin, :bool, default: false # heartbeat_type 'udp' is not available for this + desc 'Ignore DNS resolution and errors at startup time.' + config_param :ignore_network_errors_at_startup, :bool, default: false + + desc 'Verify that a connection can be made with one of out_forward nodes at the time of startup.' + config_param :verify_connection_at_startup, :bool, default: false + + desc 'Compress buffered data.' + config_param :compress, :enum, list: [:text, :gzip], default: :text + + desc 'The default version of TLS transport.' + config_param :tls_version, :enum, list: Fluent::TLS::SUPPORTED_VERSIONS, default: Fluent::TLS::DEFAULT_VERSION + desc 'The cipher configuration of TLS transport.' + config_param :tls_ciphers, :string, default: Fluent::TLS::CIPHERS_DEFAULT + desc 'Skip all verification of certificates or not.' + config_param :tls_insecure_mode, :bool, default: false + desc 'Allow self signed certificates or not.' + config_param :tls_allow_self_signed_cert, :bool, default: false + desc 'Verify hostname of servers and certificates or not in TLS transport.' + config_param :tls_verify_hostname, :bool, default: true + desc 'The additional CA certificate path for TLS.' + config_param :tls_ca_cert_path, :array, value_type: :string, default: nil + desc 'The additional certificate path for TLS.' + config_param :tls_cert_path, :array, value_type: :string, default: nil + desc 'The client certificate path for TLS.' + config_param :tls_client_cert_path, :string, default: nil + desc 'The client private key path for TLS.' + config_param :tls_client_private_key_path, :string, default: nil + desc 'The client private key passphrase for TLS.' + config_param :tls_client_private_key_passphrase, :string, default: nil, secret: true + desc 'The certificate thumbprint for searching from Windows system certstore.' + config_param :tls_cert_thumbprint, :string, default: nil, secret: true + desc 'The certificate logical store name on Windows system certstore.' + config_param :tls_cert_logical_store_name, :string, default: nil + desc 'Enable to use certificate enterprise store on Windows system certstore.' + config_param :tls_cert_use_enterprise_store, :bool, default: true + desc "Enable keepalive connection." + config_param :keepalive, :bool, default: false + desc "Expired time of keepalive. 
Default value is nil, which means to keep connection as long as possible"
+ config_param :keepalive_timeout, :time, default: nil
+
+ config_section :security, required: false, multi: false do
+ desc 'The hostname'
+ config_param :self_hostname, :string
+ desc 'Shared key for authentication'
+ config_param :shared_key, :string, secret: true
+ end
+
+ config_section :server, param_name: :servers do
+ desc "The IP address or host name of the server."
+ config_param :host, :string
+ desc "The name of the server. Used for logging and certificate verification in TLS transport (when host is address)."
+ config_param :name, :string, default: nil
+ desc "The port number of the host."
+ config_param :port, :integer, default: LISTEN_PORT
+ desc "The shared key per server."
+ config_param :shared_key, :string, default: nil, secret: true
+ desc "The username for authentication."
+ config_param :username, :string, default: ''
+ desc "The password for authentication."
+ config_param :password, :string, default: '', secret: true
+ desc "Marks a node as the standby node for an Active-Standby model between Fluentd nodes."
+ config_param :standby, :bool, default: false
+ desc "The load balancing weight."
+ config_param :weight, :integer, default: 60
+ end
+
attr_reader :nodes
- config_param :port, :integer, default: DEFAULT_LISTEN_PORT, deprecated: "User host xxx instead."
- config_param :host, :string, default: nil, deprecated: "Use port xxx instead."
- desc 'Skip network related error, e.g. DNS error, during plugin setup'
- config_param :skip_network_error_at_init, :bool, :default => false
+ config_param :port, :integer, default: LISTEN_PORT, obsoleted: "Use <server> section instead."
+ config_param :host, :string, default: nil, obsoleted: "Use <server> section instead."
+ config_section :buffer do
+ config_set_default :chunk_keys, ["tag"]
+ end
- attr_accessor :extend_internal_protocol
+ attr_reader :read_interval, :recover_sample_size
- def configure(conf)
+ def initialize
super
- # backward compatibility
- if host = conf['host']
- port = conf['port']
- port = port ? port.to_i : DEFAULT_LISTEN_PORT
- e = conf.add_element('server')
- e['host'] = host
- e['port'] = port.to_s
- end
+ @nodes = [] #=> [Node]
+ @loop = nil
+ @thread = nil
- recover_sample_size = @recover_wait / @heartbeat_interval
+ @usock = nil
+ @keep_alive_watcher_interval = 5 # TODO
+ @suspend_flush = false
+ end
- # add options here if any options addes which uses extended protocol
- @extend_internal_protocol = if @require_ack_response
- true
- else
- false
- end
+ def configure(conf)
+ compat_parameters_convert(conf, :buffer, default_chunk_key: 'tag')
- if @dns_round_robin
- if @heartbeat_type == :udp
- raise ConfigError, "forward output heartbeat type must be 'tcp' or 'none' to use dns_round_robin option"
- end
- end
+ super
- conf.elements.each {|e|
- next if e.name != "server"
+ unless @chunk_key_tag
+ raise Fluent::ConfigError, "buffer chunk key must include 'tag' for forward output"
+ end
- host = e['host']
- port = e['port']
- port = port ? port.to_i : DEFAULT_LISTEN_PORT
+ @read_interval = @read_interval_msec / 1000.0
+ @recover_sample_size = @recover_wait / @heartbeat_interval
- weight = e['weight']
- weight = weight ? weight.to_i : 60
+ if @heartbeat_type == :tcp
+ log.warn "'heartbeat_type tcp' is deprecated. use 'transport' instead."
+ @heartbeat_type = :transport + end - standby = !!e['standby'] + if @dns_round_robin && @heartbeat_type == :udp + raise Fluent::ConfigError, "forward output heartbeat type must be 'transport' or 'none' to use dns_round_robin option" + end - name = e['name'] - unless name - name = "#{host}:#{port}" + if @transport == :tls + # socket helper adds CA cert or signed certificate to same cert store internally so unify it in this place. + if @tls_cert_path && !@tls_cert_path.empty? + @tls_ca_cert_path = @tls_cert_path + end + if @tls_ca_cert_path && !@tls_ca_cert_path.empty? + @tls_ca_cert_path.each do |path| + raise Fluent::ConfigError, "specified cert path does not exist:#{path}" unless File.exist?(path) + raise Fluent::ConfigError, "specified cert path is not readable:#{path}" unless File.readable?(path) + end end - failure = FailureDetector.new(@heartbeat_interval, @hard_timeout, Time.now.to_i.to_f) - - node_conf = NodeConfig2.new(name, host, port, weight, standby, failure, - @phi_threshold, recover_sample_size, @expire_dns_cache, @phi_failure_detector, @dns_round_robin, @skip_network_error_at_init) + if @tls_insecure_mode + log.warn "TLS transport is configured in insecure way" + @tls_verify_hostname = false + @tls_allow_self_signed_cert = true + end - if @heartbeat_type == :none - @nodes << NoneHeartbeatNode.new(log, node_conf) + if Fluent.windows? + if (@tls_cert_path || @tls_ca_cert_path) && @tls_cert_logical_store_name + raise Fluent::ConfigError, "specified both cert path and tls_cert_logical_store_name is not permitted" + end else - @nodes << Node.new(log, node_conf) + raise Fluent::ConfigError, "This parameter is for only Windows" if @tls_cert_logical_store_name + raise Fluent::ConfigError, "This parameter is for only Windows" if @tls_cert_thumbprint end - log.info "adding forwarding server '#{name}'", host: host, port: port, weight: weight, plugin_id: plugin_id - } + end + + @ack_handler = @require_ack_response ? AckHandler.new(timeout: @ack_response_timeout, log: @log, read_length: @read_length) : nil + socket_cache = @keepalive ? SocketCache.new(@keepalive_timeout, @log) : nil + @connection_manager = Fluent::Plugin::ForwardOutput::ConnectionManager.new( + log: @log, + secure: !!@security, + connection_factory: method(:create_transfer_socket), + socket_cache: socket_cache, + ) - if @nodes.empty? - raise ConfigError, "forward output plugin requires at least one is required" + configs = [] + + # rewrite for using server as sd_static + conf.elements(name: 'server').each do |s| + s.name = 'service' end - end - def start - super + unless conf.elements(name: 'service').empty? 
+ # To copy `services` element only
+ new_elem = Fluent::Config::Element.new('static_service_discovery', {}, {}, conf.elements(name: 'service'))
+ configs << { type: :static, conf: new_elem }
+ end
- @rand_seed = Random.new.seed
- rebuild_weight_array
- @rr = 0
+ conf.elements(name: 'service_discovery').each_with_index do |c, i|
+ configs << { type: @service_discovery[i][:@type], conf: c }
+ end
- unless @heartbeat_type == :none
- @loop = Coolio::Loop.new
+ service_discovery_create_manager(
+ :out_forward_service_discovery_watcher,
+ configurations: configs,
+ load_balancer: Fluent::Plugin::ForwardOutput::LoadBalancer.new(log),
+ custom_build_method: method(:build_node),
+ )
- if @heartbeat_type == :udp
- # assuming all hosts use udp
- @usock = SocketUtil.create_udp_socket(@nodes.first.host)
- @usock.fcntl(Fcntl::F_SETFL, Fcntl::O_NONBLOCK)
- @hb = HeartbeatHandler.new(@usock, method(:on_heartbeat))
- @loop.attach(@hb)
+ discovery_manager.services.each do |server|
+ # it's only for test
+ @nodes << server
+ unless @heartbeat_type == :none
+ begin
+ server.validate_host_resolution!
+ rescue => e
+ raise unless @ignore_network_errors_at_startup
+ log.warn "failed to resolve node name when configured", server: (server.name || server.host), error: e
+ server.disable!
+ end
end
+ end
- @timer = HeartbeatRequestTimer.new(@heartbeat_interval, method(:on_timer))
- @loop.attach(@timer)
+ unless @as_secondary
+ if @compress == :gzip && @buffer.compress == :text
+ @buffer.compress = :gzip
+ elsif @compress == :text && @buffer.compress == :gzip
+ log.info "buffer is compressed. If you also want to save the bandwidth of a network, add `compress` configuration in <match>"
+ end
+ end
- @thread = Thread.new(&method(:run))
+ if discovery_manager.services.empty?
+ raise Fluent::ConfigError, "forward output plugin requires at least one node. Add <server> or <service_discovery>"
end
- end
- def shutdown
- @finished = true
- if @loop
- @loop.watchers.each {|w| w.detach }
- @loop.stop
+ if !@keepalive && @keepalive_timeout
+ log.warn('The value of keepalive_timeout is ignored. If you want to use keepalive, please add `keepalive true` to your conf.')
end
- @thread.join if @thread
- @usock.close if @usock
+
+ raise Fluent::ConfigError, "ack_response_timeout must be a positive integer" if @ack_response_timeout < 1
end
- def run
- @loop.run if @loop
- rescue
- log.error "unexpected error", error: $!.to_s
- log.error_backtrace
+ def multi_workers_ready?
+ true
end
- def write_objects(tag, chunk)
- return if chunk.empty?
+ def prefer_delayed_commit
+ @require_ack_response
+ end
- error = nil
+ def overwrite_delayed_commit_timeout
+ # Output#start sets @delayed_commit_timeout by @buffer_config.delayed_commit_timeout
+ # But it should be overwritten by ack_response_timeout to rollback chunks after timeout
+ if @delayed_commit_timeout != @ack_response_timeout
+ log.info "delayed_commit_timeout is overwritten by ack_response_timeout"
+ @delayed_commit_timeout = @ack_response_timeout + 2 # minimum ack_reader IO.select interval is 1s
+ end
+ end
- wlen = @weight_array.length
- wlen.times do
- @rr = (@rr + 1) % wlen
- node = @weight_array[@rr]
+ def start
+ super
- if node.available?
+ unless @heartbeat_type == :none + if @heartbeat_type == :udp + @usock = socket_create_udp(discovery_manager.services.first.host, discovery_manager.services.first.port, nonblock: true) + server_create_udp(:out_forward_heartbeat_receiver, 0, socket: @usock, max_bytes: @read_length, &method(:on_udp_heatbeat_response_recv)) + end + timer_execute(:out_forward_heartbeat_request, @heartbeat_interval, &method(:on_heartbeat_timer)) + end + + if @require_ack_response + overwrite_delayed_commit_timeout + thread_create(:out_forward_receiving_ack, &method(:ack_reader)) + end + + if @verify_connection_at_startup + discovery_manager.services.each do |node| begin - send_data(node, tag, chunk) - return - rescue - # for load balancing during detecting crashed servers - error = $! # use the latest error + node.verify_connection + rescue StandardError => e + log.fatal "forward's connection setting error: #{e.message}" + raise Fluent::UnrecoverableError, e.message end end end - if error - raise error - else - raise "no nodes are available" # TODO message + if @keepalive + timer_execute(:out_forward_keep_alived_socket_watcher, @keep_alive_watcher_interval, &method(:on_purge_obsolete_socks)) end end - private + def close + if @usock + # close socket and ignore errors: this socket will not be used anyway. + @usock.close rescue nil + end - def rebuild_weight_array - standby_nodes, regular_nodes = @nodes.partition {|n| - n.standby? - } + super + end - lost_weight = 0 - regular_nodes.each {|n| - unless n.available? - lost_weight += n.weight - end - } - log.debug "rebuilding weight array", lost_weight: lost_weight - - if lost_weight > 0 - standby_nodes.each {|n| - if n.available? - regular_nodes << n - log.warn "using standby node #{n.host}:#{n.port}", weight: n.weight - lost_weight -= n.weight - break if lost_weight <= 0 - end - } + def stop + super + + if @keepalive + @connection_manager.stop end + end + + def before_shutdown + super + @suspend_flush = true + end + + def after_shutdown + last_ack if @require_ack_response + super + end - weight_array = [] - gcd = regular_nodes.map {|n| n.weight }.inject(0) {|r,w| r.gcd(w) } - regular_nodes.each {|n| - (n.weight / gcd).times { - weight_array << n - } - } + def try_flush + return if @require_ack_response && @suspend_flush + super + end - # for load balancing during detecting crashed servers - coe = (regular_nodes.size * 6) / weight_array.size - weight_array *= coe if coe > 1 + def last_ack + overwrite_delayed_commit_timeout + ack_check(ack_select_interval) + end - r = Random.new(@rand_seed) - weight_array.sort_by! { r.rand } + def write(chunk) + return if chunk.empty? + tag = chunk.metadata.tag - @weight_array = weight_array + discovery_manager.select_service { |node| node.send_data(tag, chunk) } end - # MessagePack FixArray length = 3 (if @extend_internal_protocol) - # = 2 (else) - FORWARD_HEADER = [0x92].pack('C').freeze - FORWARD_HEADER_EXT = [0x93].pack('C').freeze - def forward_header - if @extend_internal_protocol - FORWARD_HEADER_EXT - else - FORWARD_HEADER + def try_write(chunk) + log.trace "writing a chunk to destination", chunk_id: dump_unique_id_hex(chunk.unique_id) + if chunk.empty? 
+ commit_write(chunk.unique_id) + return end + tag = chunk.metadata.tag + discovery_manager.select_service { |node| node.send_data(tag, chunk) } + last_ack if @require_ack_response && @suspend_flush end - #FORWARD_TCP_HEARTBEAT_DATA = FORWARD_HEADER + ''.to_msgpack + [].to_msgpack - def send_heartbeat_tcp(node) - sock = connect(node) - begin - opt = [1, @send_timeout.to_i].pack('I!I!') # { int l_onoff; int l_linger; } - sock.setsockopt(Socket::SOL_SOCKET, Socket::SO_LINGER, opt) - opt = [@send_timeout.to_i, 0].pack('L!L!') # struct timeval - # don't send any data to not cause a compatibility problem - #sock.setsockopt(Socket::SOL_SOCKET, Socket::SO_SNDTIMEO, opt) - #sock.write FORWARD_TCP_HEARTBEAT_DATA - node.heartbeat(true) - ensure - sock.close + def create_transfer_socket(host, port, hostname, &block) + case @transport + when :tls + socket_create_tls( + host, port, + version: @tls_version, + ciphers: @tls_ciphers, + insecure: @tls_insecure_mode, + verify_fqdn: @tls_verify_hostname, + fqdn: hostname, + allow_self_signed_cert: @tls_allow_self_signed_cert, + cert_paths: @tls_ca_cert_path, + cert_path: @tls_client_cert_path, + private_key_path: @tls_client_private_key_path, + private_key_passphrase: @tls_client_private_key_passphrase, + cert_thumbprint: @tls_cert_thumbprint, + cert_logical_store_name: @tls_cert_logical_store_name, + cert_use_enterprise_store: @tls_cert_use_enterprise_store, + + # Enabling SO_LINGER causes tcp port exhaustion on Windows. + # This is because dynamic ports are only 16384 (from 49152 to 65535) and + # expiring SO_LINGER enabled ports should wait 4 minutes + # where set by TcpTimeDelay. Its default value is 4 minutes. + # So, we should disable SO_LINGER on Windows to prevent flood of waiting ports. + linger_timeout: Fluent.windows? ? nil : @send_timeout, + send_timeout: @send_timeout, + recv_timeout: @ack_response_timeout, + connect_timeout: @connect_timeout, + &block + ) + when :tcp + socket_create_tcp( + host, port, + linger_timeout: @send_timeout, + send_timeout: @send_timeout, + recv_timeout: @ack_response_timeout, + connect_timeout: @connect_timeout, + &block + ) + else + raise "BUG: unknown transport protocol #{@transport}" end end - def send_data(node, tag, chunk) - sock = connect(node) - begin - opt = [1, @send_timeout.to_i].pack('I!I!') # { int l_onoff; int l_linger; } - sock.setsockopt(Socket::SOL_SOCKET, Socket::SO_LINGER, opt) - - opt = [@send_timeout.to_i, 0].pack('L!L!') # struct timeval - sock.setsockopt(Socket::SOL_SOCKET, Socket::SO_SNDTIMEO, opt) - - # beginArray(2) - sock.write forward_header - - # writeRaw(tag) - sock.write tag.to_msgpack # tag - - # beginRaw(size) - sz = chunk.size - #if sz < 32 - # # FixRaw - # sock.write [0xa0 | sz].pack('C') - #elsif sz < 65536 - # # raw 16 - # sock.write [0xda, sz].pack('Cn') - #else - # raw 32 - sock.write [0xdb, sz].pack('CN') - #end - - # writeRawBody(packed_es) - chunk.write_to(sock) - - if @extend_internal_protocol - option = {} - option['chunk'] = Base64.encode64(chunk.unique_id) if @require_ack_response - sock.write option.to_msgpack - - if @require_ack_response && @ack_response_timeout > 0 - # Waiting for a response here results in a decrease of throughput because a chunk queue is locked. - # To avoid a decrease of troughput, it is necessary to prepare a list of chunks that wait for responses - # and process them asynchronously. 
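
Both the inline protocol code being deleted here and the new send_data_actual further down speak the same forward wire format: a MessagePack array of three elements holding the tag, the packed entries as a raw str32, and an option map whose chunk field carries a base64 chunk id when an ack is requested. A rough sketch of that framing, assuming the msgpack gem; frame_packed_forward and its arguments are illustrative names:

    require "msgpack"
    require "base64"

    # Frame one buffer chunk in PackedForward mode: [tag, entries(str32), option].
    # packed_entries is the payload, already msgpack-packed event by event.
    def frame_packed_forward(tag, packed_entries, num_records: 1, chunk_id: nil)
      option = { "size" => num_records, "compressed" => "text" }
      option["chunk"] = Base64.encode64(chunk_id) if chunk_id # lets the peer ack the chunk

      frame = +""
      frame << [0x93].pack("C")                           # fixarray with 3 elements
      frame << tag.to_msgpack                             # 1. tag (str)
      frame << [0xdb, packed_entries.bytesize].pack("CN") # 2. str32 header...
      frame << packed_entries                             # ...followed by the raw entries
      frame << option.to_msgpack                          # 3. option (map)
      frame
    end
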
- if IO.select([sock], nil, nil, @ack_response_timeout) - raw_data = sock.recv(1024) - - # When connection is closed by remote host, socket is ready to read and #recv returns an empty string that means EOF. - # If this happens we assume the data wasn't delivered and retry it. - if raw_data.empty? - @log.warn "node #{node.host}:#{node.port} closed the connection. regard it as unavailable." - node.disable! - raise ForwardOutputConnectionClosedError, "node #{node.host}:#{node.port} closed connection" - else - # Serialization type of the response is same as sent data. - res = MessagePack.unpack(raw_data) - - if res['ack'] != option['chunk'] - # Some errors may have occured when ack and chunk id is different, so send the chunk again. - raise ForwardOutputResponseError, "ack in response and chunk id in sent data are different" - end - end - - else - # IO.select returns nil on timeout. - # There are 2 types of cases when no response has been received: - # (1) the node does not support sending responses - # (2) the node does support sending response but responses have not arrived for some reasons. - @log.warn "no response from #{node.host}:#{node.port}. regard it as unavailable." - node.disable! - raise ForwardOutputACKTimeoutError, "node #{node.host}:#{node.port} does not return ACK" - end - end + def statistics + stats = super + services = discovery_manager.services + healthy_nodes_count = 0 + registed_nodes_count = services.size + services.each do |s| + if s.available? + healthy_nodes_count += 1 end - - node.heartbeat(false) - return res # for test - ensure - sock.close end + + stats.merge( + 'healthy_nodes_count' => healthy_nodes_count, + 'registered_nodes_count' => registed_nodes_count, + ) end - def connect(node) - # TODO unix socket? - TCPSocket.new(node.resolved_host, node.port) + # MessagePack FixArray length is 3 + FORWARD_HEADER = [0x93].pack('C').freeze + def forward_header + FORWARD_HEADER end - class HeartbeatRequestTimer < Coolio::TimerWatcher - def initialize(interval, callback) - super(interval, true) - @callback = callback - end + private - def on_timer - @callback.call - rescue - # TODO log? 
+ def build_node(server) + name = server.name || "#{server.host}:#{server.port}" + log.info "adding forwarding server '#{name}'", host: server.host, port: server.port, weight: server.weight, plugin_id: plugin_id + + failure = Fluent::Plugin::ForwardOutput::FailureDetector.new(@heartbeat_interval, @hard_timeout, Time.now.to_i.to_f) + if @heartbeat_type == :none + NoneHeartbeatNode.new(self, server, failure: failure, connection_manager: @connection_manager, ack_handler: @ack_handler) + else + Node.new(self, server, failure: failure, connection_manager: @connection_manager, ack_handler: @ack_handler) end end - def on_timer - return if @finished - @nodes.each {|n| - if n.tick - rebuild_weight_array - end + def on_heartbeat_timer + need_rebuild = false + discovery_manager.services.each do |n| begin - #log.trace "sending heartbeat #{n.host}:#{n.port} on #{@heartbeat_type}" - if @heartbeat_type == :tcp - send_heartbeat_tcp(n) - else - @usock.send "\0", 0, Socket.pack_sockaddr_in(n.port, n.resolved_host) - end - rescue Errno::EAGAIN, Errno::EWOULDBLOCK, Errno::EINTR, Errno::ECONNREFUSED - # TODO log - log.debug "failed to send heartbeat packet to #{n.host}:#{n.port}", error: $!.to_s + log.trace "sending heartbeat", host: n.host, port: n.port, heartbeat_type: @heartbeat_type + n.usock = @usock if @usock + need_rebuild = n.send_heartbeat || need_rebuild + rescue Errno::EAGAIN, Errno::EWOULDBLOCK, Errno::EINTR, Errno::ECONNREFUSED, Errno::ETIMEDOUT => e + log.debug "failed to send heartbeat packet", host: n.host, port: n.port, heartbeat_type: @heartbeat_type, error: e + rescue => e + log.debug "unexpected error happen during heartbeat", host: n.host, port: n.port, heartbeat_type: @heartbeat_type, error: e end - } - end - class HeartbeatHandler < Coolio::IO - def initialize(io, callback) - super(io) - @io = io - @callback = callback + need_rebuild = n.tick || need_rebuild end - def on_readable - begin - msg, addr = @io.recvfrom(1024) - rescue Errno::EAGAIN, Errno::EWOULDBLOCK, Errno::EINTR - return - end - host = addr[3] - port = addr[1] - sockaddr = Socket.pack_sockaddr_in(port, host) - @callback.call(sockaddr, msg) - rescue - # TODO log? + if need_rebuild + discovery_manager.rebalance end end - def on_heartbeat(sockaddr, msg) - port, host = Socket.unpack_sockaddr_in(sockaddr) - if node = @nodes.find {|n| n.sockaddr == sockaddr } - #log.trace "heartbeat from '#{node.name}'", :host=>node.host, :port=>node.port + def on_udp_heatbeat_response_recv(data, sock) + sockaddr = Socket.pack_sockaddr_in(sock.remote_port, sock.remote_host) + if node = discovery_manager.services.find { |n| n.sockaddr == sockaddr } + # log.trace "heartbeat arrived", name: node.name, host: node.host, port: node.port if node.heartbeat - rebuild_weight_array + discovery_manager.rebalance end + else + log.warn("Unknown heartbeat response received from #{sock.remote_host}:#{sock.remote_port}. It may service out") end end - NodeConfig2 = Struct.new("NodeConfig2", :name, :host, :port, :weight, :standby, :failure, - :phi_threshold, :recover_sample_size, :expire_dns_cache, :phi_failure_detector, :dns_round_robin, :skip_network_error) + def on_purge_obsolete_socks + @connection_manager.purge_obsolete_socks + end + + def ack_select_interval + if @delayed_commit_timeout > 3 + 1 + else + @delayed_commit_timeout / 3.0 + end + end + + def ack_reader + select_interval = ack_select_interval + + while thread_current_running? 
+ ack_check(select_interval) + end + end + + def ack_check(select_interval) + @ack_handler.collect_response(select_interval) do |chunk_id, node, sock, result| + @connection_manager.close(sock) + + case result + when AckHandler::Result::SUCCESS + commit_write(chunk_id) + when AckHandler::Result::FAILED + node.disable! + rollback_write(chunk_id, update_retry: false) + when AckHandler::Result::CHUNKID_UNMATCHED + rollback_write(chunk_id, update_retry: false) + else + log.warn("BUG: invalid status #{result} #{chunk_id}") + + if chunk_id + rollback_write(chunk_id, update_retry: false) + end + end + end + end class Node - def initialize(log, conf) - @log = log - @conf = conf - @name = @conf.name - @host = @conf.host - @port = @conf.port - @weight = @conf.weight - @failure = @conf.failure + extend Forwardable + def_delegators :@server, :discovery_id, :host, :port, :name, :weight, :standby + + # @param connection_manager [Fluent::Plugin::ForwardOutput::ConnectionManager] + # @param ack_handler [Fluent::Plugin::ForwardOutput::AckHandler] + def initialize(sender, server, failure:, connection_manager:, ack_handler:) + @sender = sender + @log = sender.log + @compress = sender.compress + @server = server + + @name = server.name + @host = server.host + @port = server.port + @weight = server.weight + @standby = server.standby + @failure = failure @available = true + # @hostname is used for certificate verification & TLS SNI + host_is_hostname = !(IPAddr.new(@host) rescue false) + @hostname = case + when host_is_hostname then @host + when @name then @name + else nil + end + + @usock = nil + + @handshake = Fluent::Plugin::ForwardOutput::HandshakeProtocol.new( + log: @log, + hostname: sender.security && sender.security.self_hostname, + shared_key: server.shared_key || (sender.security && sender.security.shared_key) || '', + password: server.password || '', + username: server.username || '', + ) + + @unpacker = Fluent::MessagePackFactory.msgpack_unpacker + @resolved_host = nil @resolved_time = 0 - begin - resolved_host # check dns - rescue => e - if @conf.skip_network_error - log.warn "#{@name} got network error during setup. Resolve host later", :error => e, :error_class => e.class - else - raise - end - end - end + @resolved_once = false + + @connection_manager = connection_manager + @ack_handler = ack_handler + end + + attr_accessor :usock - attr_reader :conf - attr_reader :name, :host, :port, :weight - attr_reader :sockaddr # used by on_heartbeat - attr_reader :failure, :available # for test + attr_reader :state + attr_reader :sockaddr # used by on_udp_heatbeat_response_recv + attr_reader :failure # for test + + def validate_host_resolution! + resolved_host + end def available? @available @@ -488,41 +612,158 @@ def disable! end def standby? - @conf.standby + @standby + end + + def verify_connection + connect do |sock, ri| + ensure_established_connection(sock, ri) + end + end + + def establish_connection(sock, ri) + while ri.state != :established + begin + # TODO: On Ruby 2.2 or earlier, read_nonblock doesn't work expectedly. + # We need rewrite around here using new socket/server plugin helper. + buf = sock.read_nonblock(@sender.read_length) + if buf.empty? + sleep @sender.read_interval + next + end + @unpacker.feed_each(buf) do |data| + if @handshake.invoke(sock, ri, data) == :established + @log.debug "connection established", host: @host, port: @port + end + end + rescue IO::WaitReadable + # If the exception is Errno::EWOULDBLOCK or Errno::EAGAIN, it is extended by IO::WaitReadable. 
+ # So IO::WaitReadable can be used to rescue the exceptions for retrying read_nonblock.
+ # https://docs.ruby-lang.org/en/2.3.0/IO.html#method-i-read_nonblock
+ sleep @sender.read_interval unless ri.state == :established
+ rescue SystemCallError => e
+ @log.warn "disconnected by error", host: @host, port: @port, error: e
+ disable!
+ break
+ rescue EOFError
+ @log.warn "disconnected", host: @host, port: @port
+ disable!
+ break
+ rescue HeloError => e
+ @log.warn "received invalid helo message from #{@name}"
+ disable!
+ break
+ rescue PingpongError => e
+ @log.warn "connection refused to #{@name || @host}: #{e.message}"
+ disable!
+ break
+ end
+ end
+ end
+
+ def send_data_actual(sock, tag, chunk)
+ option = { 'size' => chunk.size, 'compressed' => @compress }
+ option['chunk'] = Base64.encode64(chunk.unique_id) if @ack_handler
+
+ # https://github.com/fluent/fluentd/wiki/Forward-Protocol-Specification-v1#packedforward-mode
+ # out_forward always uses str32 type for entries.
+ # str16 can store only 64kbytes, and it should be much smaller than buffer chunk size.
+
+ tag = tag.dup.force_encoding(Encoding::UTF_8)
+
+ sock.write @sender.forward_header # array, size=3
+ sock.write tag.to_msgpack # 1. tag: String (str)
+ chunk.open(compressed: @compress) do |chunk_io|
+ entries = [0xdb, chunk_io.size].pack('CN')
+ sock.write entries.force_encoding(Encoding::UTF_8) # 2. entries: String (str32)
+ IO.copy_stream(chunk_io, sock) # writeRawBody(packed_es)
+ end
+ sock.write option.to_msgpack # 3. option: Hash(map)
+
+ # TODO: use bin32 for non-utf8 content(entries) when old msgpack-ruby (0.5.x or earlier) not supported
+ end
+
+ def send_data(tag, chunk)
+ ack = @ack_handler && @ack_handler.create_ack(chunk.unique_id, self)
+ connect(nil, ack: ack) do |sock, ri|
+ ensure_established_connection(sock, ri)
+ send_data_actual(sock, tag, chunk)
+ end
+
+ heartbeat(false)
+ nil
+ end
+
+ # FORWARD_TCP_HEARTBEAT_DATA = FORWARD_HEADER + ''.to_msgpack + [].to_msgpack
+ #
+ # @return [Boolean] return true if it needs to rebuild nodes
+ def send_heartbeat
+ begin
+ dest_addr = resolved_host
+ @resolved_once = true
+ rescue ::SocketError => e
+ if !@resolved_once && @sender.ignore_network_errors_at_startup
+ @log.warn "failed to resolve node name in heartbeating", server: @name || @host, error: e
+ return false
+ end
+ raise
+ end
+
+ case @sender.heartbeat_type
+ when :transport
+ connect(dest_addr) do |sock, ri|
+ ensure_established_connection(sock, ri)
+
+ ## don't send any data to not cause a compatibility problem
+ # sock.write FORWARD_TCP_HEARTBEAT_DATA
+
+ # successful tcp connection establishment is considered as valid heartbeat.
+ # When heartbeat is succeeded after detached, return true. It rebuilds weight array.
+ heartbeat(true)
+ end
+ when :udp
+ @usock.send "\0", 0, Socket.pack_sockaddr_in(@port, dest_addr)
+ # the response will be received at on_udp_heatbeat_response_recv
+ false
+ when :none # :none doesn't use this class
+ raise "BUG: heartbeat_type none must not use Node"
+ else
+ raise "BUG: unknown heartbeat_type '#{@sender.heartbeat_type}'"
+ end
+ end
+
def resolved_host
- case @conf.expire_dns_cache
+ case @sender.expire_dns_cache
when 0 # cache is disabled
- return resolve_dns!
+ resolve_dns!
when nil # persistent cache
- return @resolved_host ||= resolve_dns!
+ @resolved_host ||= resolve_dns!
else - now = Engine.now + now = Fluent::EventTime.now rh = @resolved_host - if !rh || now - @resolved_time >= @conf.expire_dns_cache + if !rh || now - @resolved_time >= @sender.expire_dns_cache rh = @resolved_host = resolve_dns! @resolved_time = now end - return rh + rh end end def resolve_dns! addrinfo_list = Socket.getaddrinfo(@host, @port, nil, Socket::SOCK_STREAM) - addrinfo = @conf.dns_round_robin ? addrinfo_list.sample : addrinfo_list.first - @sockaddr = Socket.pack_sockaddr_in(addrinfo[1], addrinfo[3]) # used by on_heartbeat + addrinfo = @sender.dns_round_robin ? addrinfo_list.sample : addrinfo_list.first + @sockaddr = Socket.pack_sockaddr_in(addrinfo[1], addrinfo[3]) # used by on_udp_heatbeat_response_recv addrinfo[3] end private :resolve_dns! def tick now = Time.now.to_f - if !@available + unless available? if @failure.hard_timeout?(now) @failure.clear end @@ -531,41 +772,51 @@ def tick if @failure.hard_timeout?(now) @log.warn "detached forwarding server '#{@name}'", host: @host, port: @port, hard_timeout: true - @available = false + disable! @resolved_host = nil # expire cached host @failure.clear return true end - if @conf.phi_failure_detector + if @sender.phi_failure_detector phi = @failure.phi(now) - #$log.trace "phi '#{@name}'", :host=>@host, :port=>@port, :phi=>phi - if phi > @conf.phi_threshold - @log.warn "detached forwarding server '#{@name}'", host: @host, port: @port, phi: phi - @available = false + if phi > @sender.phi_threshold + @log.warn "detached forwarding server '#{@name}'", host: @host, port: @port, phi: phi, phi_threshold: @sender.phi_threshold + disable! @resolved_host = nil # expire cached host @failure.clear return true end end - return false + false end def heartbeat(detect=true) now = Time.now.to_f @failure.add(now) - #@log.trace "heartbeat from '#{@name}'", :host=>@host, :port=>@port, :available=>@available, :sample_size=>@failure.sample_size - if detect && !@available && @failure.sample_size > @conf.recover_sample_size + if detect && !available? && @failure.sample_size > @sender.recover_sample_size @available = true @log.warn "recovered forwarding server '#{@name}'", host: @host, port: @port - return true + true else - return nil + nil end end - def to_msgpack(out = '') - [@host, @port, @weight, @available].to_msgpack(out) + private + + def ensure_established_connection(sock, request_info) + if request_info.state != :established + establish_connection(sock, request_info) + + if request_info.state != :established + raise ConnectionClosedError, "failed to establish connection with node #{@name}" + end + end + end + + def connect(host = nil, ack: false, &block) + @connection_manager.connect(host: host || resolved_host, port: port, hostname: @hostname, ack: ack, &block) end end @@ -583,96 +834,5 @@ def heartbeat(detect=true) true end end - - class FailureDetector - PHI_FACTOR = 1.0 / Math.log(10.0) - SAMPLE_SIZE = 1000 - - def initialize(heartbeat_interval, hard_timeout, init_last) - @heartbeat_interval = heartbeat_interval - @last = init_last - @hard_timeout = hard_timeout - - # microsec - @init_gap = (heartbeat_interval * 1e6).to_i - @window = [@init_gap] - end - - def hard_timeout?(now) - now - @last > @hard_timeout - end - - def add(now) - if @window.empty? 
- @window << @init_gap - @last = now - else - gap = now - @last - @window << (gap * 1e6).to_i - @window.shift if @window.length > SAMPLE_SIZE - @last = now - end - end - - def phi(now) - size = @window.size - return 0.0 if size == 0 - - # Calculate weighted moving average - mean_usec = 0 - fact = 0 - @window.each_with_index {|gap,i| - mean_usec += gap * (1+i) - fact += (1+i) - } - mean_usec = mean_usec / fact - - # Normalize arrive intervals into 1sec - mean = (mean_usec.to_f / 1e6) - @heartbeat_interval + 1 - - # Calculate phi of the phi accrual failure detector - t = now - @last - @heartbeat_interval + 1 - phi = PHI_FACTOR * t / mean - - return phi - end - - def sample_size - @window.size - end - - def clear - @window.clear - @last = 0 - end - end - - ## TODO - #class RPC - # def initialize(this) - # @this = this - # end - # - # def list_nodes - # @this.nodes - # end - # - # def list_fault_nodes - # list_nodes.select {|n| !n.available? } - # end - # - # def list_available_nodes - # list_nodes.select {|n| n.available? } - # end - # - # def add_node(name, host, port, weight) - # end - # - # def recover_node(host, port) - # end - # - # def remove_node(host, port) - # end - #end end end diff --git a/source/plugins/ruby/out_mdm.rb b/source/plugins/ruby/out_mdm.rb index 6238eb51a..8e80fb753 100644 --- a/source/plugins/ruby/out_mdm.rb +++ b/source/plugins/ruby/out_mdm.rb @@ -1,11 +1,12 @@ #!/usr/local/bin/ruby # frozen_string_literal: true -module Fluent - class OutputMDM < BufferedOutput - config_param :retry_mdm_post_wait_minutes, :integer +require 'fluent/plugin/output' - Plugin.register_output("out_mdm", self) +module Fluent::Plugin + class OutputMDM < Output + config_param :retry_mdm_post_wait_minutes, :integer + Fluent::Plugin.register_output("mdm", self) def initialize super @@ -57,8 +58,6 @@ def initialize end def configure(conf) - s = conf.add_element("secondary") - s["type"] = ChunkErrorHandler::SecondaryName super end @@ -204,7 +203,7 @@ def get_access_token end def write_status_file(success, message) - fn = "/var/opt/microsoft/omsagent/log/MDMIngestion.status" + fn = "/var/opt/microsoft/docker-cimprov/log/MDMIngestion.status" status = '{ "operation": "MDMIngestion", "success": "%s", "message": "%s" }' % [success, message] begin File.open(fn, "w") { |file| file.write(status) } @@ -270,6 +269,7 @@ def write(chunk) flush_mdm_exception_telemetry if (!@first_post_attempt_made || (Time.now > @last_post_attempt_time + retry_mdm_post_wait_minutes * 60)) && @can_send_data_to_mdm post_body = [] + chunk.extend Fluent::ChunkMessagePackEventStreamer chunk.msgpack_each { |(tag, record)| post_body.push(record.to_json) } @@ -320,7 +320,7 @@ def send_to_mdm(post_body) ApplicationInsightsUtility.sendCustomEvent("AKSCustomMetricsMDMSendSuccessful", {}) @last_telemetry_sent_time = Time.now end - rescue Net::HTTPServerException => e + rescue Net::HTTPClientException => e # see https://docs.ruby-lang.org/en/2.6.0/NEWS.html about deprecating HTTPServerException and adding HTTPClientException if !response.nil? && !response.body.nil? #body will have actual error @log.info "Failed to Post Metrics to MDM : #{e} Response.body: #{response.body}" else @@ -334,7 +334,7 @@ def send_to_mdm(post_body) # Not raising exception, as that will cause retries to happen elsif !response.code.empty? 
&& response.code.start_with?("4") # Log 400 errors and continue - @log.info "Non-retryable HTTPServerException when POSTing Metrics to MDM #{e} Response: #{response}" + @log.info "Non-retryable HTTPClientException when POSTing Metrics to MDM #{e} Response: #{response}" else # raise if the response code is non-400 @log.info "HTTPServerException when POSTing Metrics to MDM #{e} Response: #{response}" @@ -352,72 +352,5 @@ def send_to_mdm(post_body) raise e end end - - private - - class ChunkErrorHandler - include Configurable - include PluginId - include PluginLoggerMixin - - SecondaryName = "__ChunkErrorHandler__" - - Plugin.register_output(SecondaryName, self) - - def initialize - @router = nil - end - - def secondary_init(primary) - @error_handlers = create_error_handlers @router - end - - def start - # NOP - end - - def shutdown - # NOP - end - - def router=(r) - @router = r - end - - def write(chunk) - chunk.msgpack_each { |(tag, record)| - @error_handlers[tag].emit(record) - } - end - - private - - def create_error_handlers(router) - nop_handler = NopErrorHandler.new - Hash.new() { |hash, tag| - etag = OMS::Common.create_error_tag tag - hash[tag] = router.match?(etag) ? - ErrorHandler.new(router, etag) : - nop_handler - } - end - - class ErrorHandler - def initialize(router, etag) - @router = router - @etag = etag - end - - def emit(record) - @router.emit(@etag, Fluent::Engine.now, record) - end - end - - class NopErrorHandler - def emit(record) - # NOP - end - end - end end # class OutputMDM end # module Fluent diff --git a/source/plugins/ruby/podinventory_to_mdm.rb b/source/plugins/ruby/podinventory_to_mdm.rb index d9cb71bd4..c24a91a87 100644 --- a/source/plugins/ruby/podinventory_to_mdm.rb +++ b/source/plugins/ruby/podinventory_to_mdm.rb @@ -279,16 +279,16 @@ def process_pod_inventory_record(record) begin records = [] - podUid = record["DataItems"][0]["PodUid"] + podUid = record["PodUid"] if @pod_uids.key?(podUid) return end @pod_uids[podUid] = true - podPhaseDimValue = record["DataItems"][0]["PodStatus"] - podNamespaceDimValue = record["DataItems"][0]["Namespace"] - podControllerNameDimValue = record["DataItems"][0]["ControllerName"] - podNodeDimValue = record["DataItems"][0]["Computer"] + podPhaseDimValue = record["PodStatus"] + podNamespaceDimValue = record["Namespace"] + podControllerNameDimValue = record["ControllerName"] + podNodeDimValue = record["Computer"] if podControllerNameDimValue.nil? || podControllerNameDimValue.empty? podControllerNameDimValue = "No Controller" diff --git a/source/plugins/utils/oms_common.rb b/source/plugins/utils/oms_common.rb new file mode 100644 index 000000000..c10cb8638 --- /dev/null +++ b/source/plugins/utils/oms_common.rb @@ -0,0 +1,143 @@ +module OMS + + MSDockerCImprovHostnameFilePath = '/var/opt/microsoft/docker-cimprov/state/containerhostname' + IPV6_REGEX = '\h{4}:\h{4}:\h{4}:\h{4}:\h{4}:\h{4}:\h{4}:\h{4}' + IPV4_Approximate_REGEX = '\d+\.\d+\.\d+\.\d+' + + class RetryRequestException < Exception + # Throw this exception to tell the fluentd engine to retry and + # inform the output plugin that it is indeed retryable + end + + class Common + require 'socket' + require_relative 'omslog' + + @@Hostname = nil + @@HostnameFilePath = MSDockerCImprovHostnameFilePath + + + class << self + + # Internal methods + # (left public for easy testing, though protected may be better later) + + def clean_hostname_string(hnBuffer) + return "" if hnBuffer.nil? # So give the rest of the program a string to deal with. 
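The podinventory_to_mdm.rb hunk above tracks the change in event shape that comes with the plugin migration: records now arrive flat instead of wrapped in a DataItems array. A small sketch of the before/after accessors (sample values are illustrative):

    # Old stream shape: fields nested under DataItems.
    old_record = { 'DataItems' => [{ 'PodUid' => 'abc-123', 'PodStatus' => 'Running' }] }
    # New stream shape: the same fields at the top level.
    new_record = { 'PodUid' => 'abc-123', 'PodStatus' => 'Running' }

    puts old_record['DataItems'][0]['PodUid'] # old accessor, as removed above
    puts new_record['PodUid']                 # new accessor, as added above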
+ hostname_buffer = hnBuffer.strip + return hostname_buffer + end + + def has_designated_hostnamefile? + return false if @@HostnameFilePath.nil? + return false unless @@HostnameFilePath =~ /\w/ + return false unless File.exist?(@@HostnameFilePath) + return true + end + + def is_dot_separated_string?(hnBuffer) + return true if /[^.]+\.[^.]+/ =~ hnBuffer + return false + end + + def is_hostname_compliant?(hnBuffer) + # RFC 2181: + # Size limit is 1 to 63 octets, so probably bytesize is appropriate method. + return false if hnBuffer.nil? + return false if /\./ =~ hnBuffer # Hostname by definition may not contain a dot. + return false if /:/ =~ hnBuffer # Hostname by definition may not contain a colon. + return false unless 1 <= hnBuffer.bytesize && hnBuffer.bytesize <= 63 + return true + end + + def is_like_ipv4_string?(hnBuffer) + return false unless /\A#{IPV4_Approximate_REGEX}\z/ =~ hnBuffer + qwa = hnBuffer.split('.') + return false unless qwa.length == 4 + return false if qwa[0].to_i == 0 + qwa.each do |quadwordstring| + bi = quadwordstring.to_i + # This may need more detail if 255 octets are sometimes allowed, but I don't think so. + return false unless 0 <= bi and bi < 255 + end + return true + end + + def is_like_ipv6_string?(hnBuffer) + return true if /\A#{IPV6_REGEX}\z/ =~ hnBuffer + return false + end + + def look_for_socket_class_host_address + hostname_buffer = nil + + begin + hostname_buffer = Socket.gethostname + rescue => error + OMS::Log.error_once("Unable to get the Host Name using socket facility: #{error}") + return + end + @@Hostname = clean_hostname_string(hostname_buffer) + + return # Thwart accidental return to force correct use. + end + + def look_in_designated_hostnamefile + # Issue: + # When omsagent runs inside a container, gethostname returns the hostname of the container (random name) + # not the actual machine hostname. + # One way to solve this problem is to set the container hostname same as machine name, but this is not + # possible when host-machine is a private VM inside a cluster. + # Solution: + # Share/mount ‘/etc/hostname’ as '/var/opt/microsoft/omsagent/state/containername' with container and + # omsagent will read hostname from shared file. + hostname_buffer = nil + + unless File.readable?(@@HostnameFilePath) + OMS::Log.warn_once("File '#{@@HostnameFilePath}' exists but is not readable.") + return + end + + begin + hostname_buffer = File.read(@@HostnameFilePath) + rescue => error + OMS::Log.warn_once("Unable to read the hostname from #{@@HostnameFilePath}: #{error}") + end + @@Hostname = clean_hostname_string(hostname_buffer) + return # Thwart accidental return to force correct use. + end + + def validate_hostname_equivalent(hnBuffer) + # RFC 1123 and 2181 + # Note that for now we are limiting the earlier maximum of 63 for fqdn labels and thus + # hostnames UNTIL we are assured azure will allow 255, as specified in RFC 1123, or + # we are otherwise instructed. + rfcl = "RFCs 1123, 2181 with hostname range of {1,63} octets for non-root item." + return if is_hostname_compliant?(hnBuffer) + return if is_like_ipv4_string?(hnBuffer) + return if is_like_ipv6_string?(hnBuffer) + msg = "Hostname '#{hnBuffer}' not compliant (#{rfcl}). Not IP Address Either." + OMS::Log.warn_once(msg) + raise NameError, msg + end + + # End of Internal methods + + def get_hostname(ignoreOldValue = false) + if not is_hostname_compliant?(@@Hostname) or ignoreOldValue then + + look_in_designated_hostnamefile if has_designated_hostnamefile? 
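The compliance gates above reduce to: accept a value as a hostname if it is an RFC-1123-style single label (1 to 63 bytes, no dot, no colon), otherwise fall back to recognizing it as an IPv4 or IPv6 literal. A standalone sketch of the same test, with illustrative inputs:

    def hostname_compliant?(candidate)
      return false if candidate.nil?
      return false if /[.:]/ =~ candidate  # no dots (FQDN) or colons (IPv6) allowed in a label
      (1..63).cover?(candidate.bytesize)   # RFC 2181 size limit, measured in bytes
    end

    ['aks-nodepool1-0', 'node1.contoso.local', '10.240.0.4', ''].each do |c|
      puts "#{c.inspect} => #{hostname_compliant?(c)}"
    end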
+ + look_for_socket_class_host_address unless is_hostname_compliant?(@@Hostname) + end + + begin + validate_hostname_equivalent(@@Hostname) + rescue => error + OMS::Log.warn_once("Hostname '#{@@Hostname}' found, but did NOT validate as compliant. #{error}. Using anyway.") + end + return @@Hostname + end + end # Class methods + end # class Common +end # module OMS diff --git a/source/plugins/utils/omslog.rb b/source/plugins/utils/omslog.rb new file mode 100644 index 000000000..b65bf947c --- /dev/null +++ b/source/plugins/utils/omslog.rb @@ -0,0 +1,50 @@ +module OMS + class Log + require 'set' + require 'digest' + + @@error_proc = Proc.new {|message| $log.error message } + @@warn_proc = Proc.new {|message| $log.warn message } + @@info_proc = Proc.new {|message| $log.info message } + @@debug_proc = Proc.new {|message| $log.debug message } + + @@logged_hashes = Set.new + + class << self + def error_once(message, tag=nil) + log_once(@@error_proc, @@debug_proc, message, tag) + end + + def warn_once(message, tag=nil) + log_once(@@warn_proc, @@debug_proc, message, tag) + end + + def info_once(message, tag=nil) + log_once(@@info_proc, @@debug_proc, message, tag) + end + + def log_once(first_loglevel_proc, next_loglevel_proc, message, tag=nil) + # Will log a message once with the first procedure and subsequently with the second + # This allows repeated messages to be ignored by having the second logging function at a lower log level + # An optional tag can be used as the message key + + if tag == nil + tag = message + end + + md5_digest = Digest::MD5.new + tag_hash = md5_digest.update(tag).base64digest + res = @@logged_hashes.add?(tag_hash) + + if res == nil + # The hash was already in the set + next_loglevel_proc.call(message) + else + # First time we see this hash + first_loglevel_proc.call(message) + end + end + end # Class methods + + end # Class Log +end # Module OMS From 959b455d5ab873b6fa5ed7445bd61dc847ec2c08 Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Wed, 26 May 2021 17:41:00 -0700 Subject: [PATCH 106/194] Send perf metrics to MDM from windows daemonset (#568) --- .../scripts/tomlparser-mdm-metrics-config.rb | 59 ++++++++++---- .../installer/datafiles/base_container.data | 2 +- build/windows/Makefile.ps1 | 10 +++ build/windows/installer/conf/fluent.conf | 32 ++++++++ kubernetes/omsagent.yaml | 9 +++ kubernetes/windows/Dockerfile | 6 +- kubernetes/windows/main.ps1 | 66 +++++++++++++++- .../ruby/ApplicationInsightsUtility.rb | 36 ++++++--- .../plugins/ruby/CAdvisorMetricsAPIClient.rb | 77 ++++++++++++------- source/plugins/ruby/KubernetesApiClient.rb | 7 +- source/plugins/ruby/MdmMetricsGenerator.rb | 7 +- .../plugins/ruby/arc_k8s_cluster_identity.rb | 29 ++++--- source/plugins/ruby/filter_cadvisor2mdm.rb | 29 ++++--- source/plugins/ruby/in_cadvisor_perf.rb | 29 ++++--- source/plugins/ruby/in_win_cadvisor_perf.rb | 11 ++- source/plugins/ruby/kubelet_utils.rb | 7 +- 16 files changed, 311 insertions(+), 105 deletions(-) rename build/{linux => common}/installer/scripts/tomlparser-mdm-metrics-config.rb (75%) diff --git a/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb b/build/common/installer/scripts/tomlparser-mdm-metrics-config.rb similarity index 75% rename from build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb rename to build/common/installer/scripts/tomlparser-mdm-metrics-config.rb index dcf179bf2..b6a4419cf 100644 --- a/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb +++ 
b/build/common/installer/scripts/tomlparser-mdm-metrics-config.rb @@ -1,9 +1,16 @@ #!/usr/local/bin/ruby # frozen_string_literal: true -require_relative "tomlrb" -require_relative "ConfigParseErrorLogger" +#this should be require relative in Linux and require in windows, since it is a gem install on windows +@os_type = ENV["OS_TYPE"] +if !@os_type.nil? && !@os_type.empty? && @os_type.strip.casecmp("windows") == 0 + require "tomlrb" +else + require_relative "tomlrb" +end + require_relative "/etc/fluent/plugin/constants" +require_relative "ConfigParseErrorLogger" @configMapMountPath = "/etc/config/settings/alertable-metrics-configuration-settings" @configVersion = "" @@ -124,6 +131,10 @@ def populateSettingValuesFromConfigMap(parsedConfig) end end +def get_command_windows(env_variable_name, env_variable_value) + return "[System.Environment]::SetEnvironmentVariable(\"#{env_variable_name}\", \"#{env_variable_value}\", \"Process\")" + "\n" + "[System.Environment]::SetEnvironmentVariable(\"#{env_variable_name}\", \"#{env_variable_value}\", \"Machine\")" + "\n" +end + @configSchemaVersion = ENV["AZMON_AGENT_CFG_SCHEMA_VERSION"] puts "****************Start MDM Metrics Config Processing********************" if !@configSchemaVersion.nil? && !@configSchemaVersion.empty? && @configSchemaVersion.strip.casecmp("v1") == 0 #note v1 is the only supported schema version, so hardcoding it @@ -137,19 +148,37 @@ def populateSettingValuesFromConfigMap(parsedConfig) end end -# Write the settings to file, so that they can be set as environment variables -file = File.open("config_mdm_metrics_env_var", "w") +if !@os_type.nil? && !@os_type.empty? && @os_type.strip.casecmp("windows") == 0 + # Write the settings to file, so that they can be set as environment variables in windows container + file = File.open("setmdmenv.ps1", "w") -if !file.nil? - file.write("export AZMON_ALERT_CONTAINER_CPU_THRESHOLD=#{@percentageCpuUsageThreshold}\n") - file.write("export AZMON_ALERT_CONTAINER_MEMORY_RSS_THRESHOLD=#{@percentageMemoryRssThreshold}\n") - file.write("export AZMON_ALERT_CONTAINER_MEMORY_WORKING_SET_THRESHOLD=\"#{@percentageMemoryWorkingSetThreshold}\"\n") - file.write("export AZMON_ALERT_PV_USAGE_THRESHOLD=#{@percentagePVUsageThreshold}\n") - file.write("export AZMON_ALERT_JOB_COMPLETION_TIME_THRESHOLD=#{@jobCompletionThresholdMinutes}\n") - # Close file after writing all MDM setting environment variables - file.close - puts "****************End MDM Metrics Config Processing********************" + if !file.nil? + commands = get_command_windows("AZMON_ALERT_CONTAINER_CPU_THRESHOLD", @percentageCpuUsageThreshold) + file.write(commands) + commands = get_command_windows("AZMON_ALERT_CONTAINER_MEMORY_WORKING_SET_THRESHOLD", @percentageMemoryWorkingSetThreshold) + file.write(commands) + # Close file after writing all environment variables + file.close + puts "****************End MDM Metrics Config Processing********************" + else + puts "Exception while opening file for writing MDM metric config environment variables" + puts "****************End MDM Metrics Config Processing********************" + end else - puts "Exception while opening file for writing MDM metric config environment variables" - puts "****************End MDM Metrics Config Processing********************" + # Write the settings to file, so that they can be set as environment variables in linux container + file = File.open("config_mdm_metrics_env_var", "w") + + if !file.nil? 
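The Linux branch of this parser writes export lines for the shell to source, but a Windows container cannot be configured that way, so get_command_windows above emits a setmdmenv.ps1 that sets each value at both Process and Machine scope. A sketch of the generated content (the threshold value here is illustrative):

    def ps_set_env(name, value)
      "[System.Environment]::SetEnvironmentVariable(\"#{name}\", \"#{value}\", \"Process\")\n" \
        "[System.Environment]::SetEnvironmentVariable(\"#{name}\", \"#{value}\", \"Machine\")\n"
    end

    puts ps_set_env('AZMON_ALERT_CONTAINER_CPU_THRESHOLD', 95.0)
    puts ps_set_env('AZMON_ALERT_CONTAINER_MEMORY_WORKING_SET_THRESHOLD', 95.0)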
+ file.write("export AZMON_ALERT_CONTAINER_CPU_THRESHOLD=#{@percentageCpuUsageThreshold}\n") + file.write("export AZMON_ALERT_CONTAINER_MEMORY_RSS_THRESHOLD=#{@percentageMemoryRssThreshold}\n") + file.write("export AZMON_ALERT_CONTAINER_MEMORY_WORKING_SET_THRESHOLD=\"#{@percentageMemoryWorkingSetThreshold}\"\n") + file.write("export AZMON_ALERT_PV_USAGE_THRESHOLD=#{@percentagePVUsageThreshold}\n") + file.write("export AZMON_ALERT_JOB_COMPLETION_TIME_THRESHOLD=#{@jobCompletionThresholdMinutes}\n") + # Close file after writing all MDM setting environment variables + file.close + puts "****************End MDM Metrics Config Processing********************" + else + puts "Exception while opening file for writing MDM metric config environment variables" + puts "****************End MDM Metrics Config Processing********************" + end end diff --git a/build/linux/installer/datafiles/base_container.data b/build/linux/installer/datafiles/base_container.data index b9f889dba..de8ccbba0 100644 --- a/build/linux/installer/datafiles/base_container.data +++ b/build/linux/installer/datafiles/base_container.data @@ -42,7 +42,7 @@ MAINTAINER: 'Microsoft Corporation' /opt/microsoft/docker-cimprov/bin/TelegrafTCPErrorTelemetry.sh; build/linux/installer/scripts/TelegrafTCPErrorTelemetry.sh; 755; root; root /opt/livenessprobe.sh; build/linux/installer/scripts/livenessprobe.sh; 755; root; root /opt/tomlparser-prom-customconfig.rb; build/common/installer/scripts/tomlparser-prom-customconfig.rb; 755; root; root -/opt/tomlparser-mdm-metrics-config.rb; build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb; 755; root; root +/opt/tomlparser-mdm-metrics-config.rb; build/common/installer/scripts/tomlparser-mdm-metrics-config.rb; 755; root; root /opt/tomlparser-metric-collection-config.rb; build/linux/installer/scripts/tomlparser-metric-collection-config.rb; 755; root; root diff --git a/build/windows/Makefile.ps1 b/build/windows/Makefile.ps1 index 2d49330ea..737abc92a 100644 --- a/build/windows/Makefile.ps1 +++ b/build/windows/Makefile.ps1 @@ -180,4 +180,14 @@ $exclude = @('*.cs','*.csproj') Copy-Item -Path $installerdir -Destination $publishdir -Recurse -Force -Exclude $exclude Write-Host("successfully copied installer files conf and scripts from :" + $installerdir + " to :" + $publishdir + " ") -ForegroundColor Green +$rubyplugindir = Join-Path -Path $rootdir -ChildPath "source\plugins\ruby" +Write-Host("copying ruby source files from :" + $rubyplugindir + " to :" + $publishdir + " ...") +Copy-Item -Path $rubyplugindir -Destination $publishdir -Recurse -Force +Write-Host("successfully copied ruby source files from :" + $rubyplugindir + " to :" + $publishdir + " ") -ForegroundColor Green + +$utilsplugindir = Join-Path -Path $rootdir -ChildPath "source\plugins\utils" +Write-Host("copying ruby util files from :" + $utilsplugindir + " to :" + $publishdir + " ...") +Copy-Item -Path $utilsplugindir -Destination $publishdir -Recurse -Force +Write-Host("successfully copied ruby util files from :" + $utilsplugindir + " to :" + $publishdir + " ") -ForegroundColor Green + Set-Location $currentdir \ No newline at end of file diff --git a/build/windows/installer/conf/fluent.conf b/build/windows/installer/conf/fluent.conf index d5eb475ca..741e5ce19 100644 --- a/build/windows/installer/conf/fluent.conf +++ b/build/windows/installer/conf/fluent.conf @@ -4,6 +4,13 @@ @log_level info + + @type cadvisor_perf + tag oms.api.cadvisorperf + run_interval 60 + @log_level debug + + @type tail path "#{ENV['AZMON_LOG_TAIL_PATH']}" @@ 
-29,6 +36,14 @@ @include fluent-docker-parser.conf +#custom_metrics_mdm filter plugin + + @type cadvisor2mdm + metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes + log_path /etc/omsagentwindows/filter_cadvisor2mdm.log + @log_level info + + @type grep @@ -46,6 +61,23 @@ + + @type mdm + @log_level debug + + @type file + path /etc/omsagentwindows/out_mdm_cdvisorperf.buffer + overflow_action drop_oldest_chunk + chunk_limit_size 4m + flush_interval 20s + retry_max_times 10 + retry_wait 5s + retry_max_interval 5m + flush_thread_count 5 + + retry_mdm_post_wait_minutes 30 + + @type forward send_timeout 60s diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index ab6bbea9c..4290e1d59 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -790,6 +790,9 @@ spec: fieldPath: status.hostIP - name: SIDECAR_SCRAPING_ENABLED value: "true" + # Update this with the user assigned msi client id for omsagent + - name: USER_ASSIGNED_IDENTITY_CLIENT_ID + value: "" # Add this only for clouds that require cert bootstrapping - name: REQUIRES_CERT_BOOTSTRAP value: "true" @@ -812,6 +815,9 @@ spec: # - mountPath: C:\ca # name: ca-certs # readOnly: true + - mountPath: C:\etc\kubernetes\host + name: azure-json-path + readOnly: true livenessProbe: exec: command: @@ -843,6 +849,9 @@ spec: - name: docker-windows-kuberenetes-container-logs hostPath: path: C:\var + - name: azure-json-path + hostPath: + path: C:\k # Need to mount this only for airgapped clouds - Commenting this since it wont exist in non airgapped clouds #- name: ca-certs # hostPath: diff --git a/kubernetes/windows/Dockerfile b/kubernetes/windows/Dockerfile index fefd089a8..5a5298d0b 100644 --- a/kubernetes/windows/Dockerfile +++ b/kubernetes/windows/Dockerfile @@ -20,7 +20,7 @@ RUN refreshenv \ && gem install cool.io -v 1.5.4 --platform ruby \ && gem install oj -v 3.3.10 \ && gem install json -v 2.2.0 \ -&& gem install fluentd -v 1.10.2 \ +&& gem install fluentd -v 1.12.2 \ && gem install win32-service -v 1.0.1 \ && gem install win32-ipc -v 0.7.0 \ && gem install win32-event -v 0.6.3 \ @@ -69,6 +69,10 @@ COPY ./omsagentwindows/installer/conf/telegraf.conf /etc/telegraf/ # copy keepcert alive ruby scripts COPY ./omsagentwindows/installer/scripts/rubyKeepCertificateAlive/*.rb /etc/fluent/plugin/ +#Copy fluentd ruby plugins +COPY ./omsagentwindows/ruby/ /etc/fluent/plugin/ +COPY ./omsagentwindows/utils/*.rb /etc/fluent/plugin/ + ENV AGENT_VERSION ${IMAGE_TAG} ENV OS_TYPE "windows" ENV APPLICATIONINSIGHTS_AUTH "NzAwZGM5OGYtYTdhZC00NThkLWI5NWMtMjA3ZjM3NmM3YmRi" diff --git a/kubernetes/windows/main.ps1 b/kubernetes/windows/main.ps1 index baf95fca4..bc053b0d6 100644 --- a/kubernetes/windows/main.ps1 +++ b/kubernetes/windows/main.ps1 @@ -134,9 +134,6 @@ function Set-EnvironmentVariables { [System.Environment]::SetEnvironmentVariable("APPLICATIONINSIGHTS_ENDPOINT", $appInsightsEndpoint, "machine") Write-Host "Successfully set environment variable APPLICATIONINSIGHTS_ENDPOINT - $($appInsightsEndpoint) for target 'machine'..." 
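The buffer block added above bounds how long a failed MDM chunk is retried before it is discarded, while overflow_action drop_oldest_chunk sheds the oldest data if the file buffer itself fills. Assuming fluentd's default exponential backoff (doubling each attempt, which is the documented default behavior), the schedule implied by retry_wait 5s, retry_max_interval 5m, and retry_max_times 10 works out as in this sketch:

    retry_wait, retry_max_interval, retry_max_times = 5, 300, 10  # seconds, from fluent.conf above

    waits = (0...retry_max_times).map { |n| [retry_wait * 2**n, retry_max_interval].min }
    puts "retry waits (s): #{waits.join(', ')}"
    puts "worst-case retry span: ~#{waits.sum / 60} minutes"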
} - else { - Write-Host "Failed to set environment variable APPLICATIONINSIGHTS_ENDPOINT for target 'machine' since it is either null or empty" - } # Check if the instrumentation key needs to be fetched from a storage account (as in airgapped clouds) $aiKeyURl = [System.Environment]::GetEnvironmentVariable('APPLICATIONINSIGHTS_AUTH_URL') @@ -180,14 +177,71 @@ function Set-EnvironmentVariables { [System.Environment]::SetEnvironmentVariable("TELEMETRY_APPLICATIONINSIGHTS_KEY", $aiKeyDecoded, "Process") [System.Environment]::SetEnvironmentVariable("TELEMETRY_APPLICATIONINSIGHTS_KEY", $aiKeyDecoded, "Machine") + # Setting environment variables required by the fluentd plugins + $aksResourceId = [System.Environment]::GetEnvironmentVariable("AKS_RESOURCE_ID", "process") + if (![string]::IsNullOrEmpty($aksResourceId)) { + [System.Environment]::SetEnvironmentVariable("AKS_RESOURCE_ID", $aksResourceId, "machine") + Write-Host "Successfully set environment variable AKS_RESOURCE_ID - $($aksResourceId) for target 'machine'..." + } + else { + Write-Host "Failed to set environment variable AKS_RESOURCE_ID for target 'machine' since it is either null or empty" + } + + $aksRegion = [System.Environment]::GetEnvironmentVariable("AKS_REGION", "process") + if (![string]::IsNullOrEmpty($aksRegion)) { + [System.Environment]::SetEnvironmentVariable("AKS_REGION", $aksRegion, "machine") + Write-Host "Successfully set environment variable AKS_REGION - $($aksRegion) for target 'machine'..." + } + else { + Write-Host "Failed to set environment variable AKS_REGION for target 'machine' since it is either null or empty" + } + + $controllerType = [System.Environment]::GetEnvironmentVariable("CONTROLLER_TYPE", "process") + if (![string]::IsNullOrEmpty($controllerType)) { + [System.Environment]::SetEnvironmentVariable("CONTROLLER_TYPE", $controllerType, "machine") + Write-Host "Successfully set environment variable CONTROLLER_TYPE - $($controllerType) for target 'machine'..." + } + else { + Write-Host "Failed to set environment variable CONTROLLER_TYPE for target 'machine' since it is either null or empty" + } + + $osType = [System.Environment]::GetEnvironmentVariable("OS_TYPE", "process") + if (![string]::IsNullOrEmpty($osType)) { + [System.Environment]::SetEnvironmentVariable("OS_TYPE", $osType, "machine") + Write-Host "Successfully set environment variable OS_TYPE - $($osType) for target 'machine'..." + } + else { + Write-Host "Failed to set environment variable OS_TYPE for target 'machine' since it is either null or empty" + } + + $userMsi = [System.Environment]::GetEnvironmentVariable("USER_ASSIGNED_IDENTITY_CLIENT_ID", "process") + if (![string]::IsNullOrEmpty($userMsi)) { + [System.Environment]::SetEnvironmentVariable("USER_ASSIGNED_IDENTITY_CLIENT_ID", $userMsi, "machine") + Write-Host "Successfully set environment variable USER_ASSIGNED_IDENTITY_CLIENT_ID - $($userMsi) for target 'machine'..." + } + + $hostName = [System.Environment]::GetEnvironmentVariable("HOSTNAME", "process") + if (![string]::IsNullOrEmpty($hostName)) { + [System.Environment]::SetEnvironmentVariable("HOSTNAME", $hostName, "machine") + Write-Host "Successfully set environment variable HOSTNAME - $($hostName) for target 'machine'..." 
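Each setting above is written at both Process and Machine scope so that fluentd, started later as a separate process, sees the same values; on the Ruby side the plugins read them back with plain ENV lookups, as this sketch shows:

    %w[AKS_RESOURCE_ID AKS_REGION CONTROLLER_TYPE OS_TYPE HOSTNAME].each do |name|
      value = ENV[name]
      puts "#{name}=#{value.nil? || value.empty? ? '<unset>' : value}"
    end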
+ } + else { + Write-Host "Failed to set environment variable HOSTNAME for target 'machine' since it is either null or empty" + } + # run config parser ruby /opt/omsagentwindows/scripts/ruby/tomlparser.rb .\setenv.ps1 + + # run mdm config parser + ruby /opt/omsagentwindows/scripts/ruby/tomlparser-mdm-metrics-config.rb + .\setmdmenv.ps1 } function Get-ContainerRuntime { # default container runtime and make default as containerd when containerd becomes default in AKS $containerRuntime = "docker" + $cAdvisorIsSecure = "false" $response = "" $NODE_IP = "" try { @@ -227,6 +281,7 @@ function Get-ContainerRuntime { if (![string]::IsNullOrEmpty($response) -and $response.StatusCode -eq 200) { Write-Host "API call to https://$($NODE_IP):10250/pods succeeded" $isPodsAPISuccess = $true + $cAdvisorIsSecure = "true" } } catch { @@ -234,6 +289,11 @@ function Get-ContainerRuntime { } } + # set IS_SECURE_CADVISOR_PORT env for debug and telemetry purpose + Write-Host "Setting IS_SECURE_CADVISOR_PORT environment variable as $($cAdvisorIsSecure)" + [System.Environment]::SetEnvironmentVariable("IS_SECURE_CADVISOR_PORT", $cAdvisorIsSecure, "Process") + [System.Environment]::SetEnvironmentVariable("IS_SECURE_CADVISOR_PORT", $cAdvisorIsSecure, "Machine") + if ($isPodsAPISuccess) { if (![string]::IsNullOrEmpty($response.Content)) { $podList = $response.Content | ConvertFrom-Json diff --git a/source/plugins/ruby/ApplicationInsightsUtility.rb b/source/plugins/ruby/ApplicationInsightsUtility.rb index 6ae567337..74d08c1e6 100644 --- a/source/plugins/ruby/ApplicationInsightsUtility.rb +++ b/source/plugins/ruby/ApplicationInsightsUtility.rb @@ -21,10 +21,15 @@ class ApplicationInsightsUtility @@EnvApplicationInsightsEndpoint = "APPLICATIONINSIGHTS_ENDPOINT" @@EnvControllerType = "CONTROLLER_TYPE" @@EnvContainerRuntime = "CONTAINER_RUNTIME" - + @@isWindows = false + @@hostName = (OMS::Common.get_hostname) + @@os_type = ENV["OS_TYPE"] + if !@@os_type.nil? && !@@os_type.empty? && @@os_type.strip.casecmp("windows") == 0 + @@isWindows = true + @@hostName = ENV["HOSTNAME"] + end @@CustomProperties = {} @@Tc = nil - @@hostName = (OMS::Common.get_hostname) @@proxy = (ProxyUtils.getProxyConfiguration) def initialize @@ -133,16 +138,23 @@ def initializeUtility() end def getContainerRuntimeInfo() - containerRuntime = ENV[@@EnvContainerRuntime] - if !containerRuntime.nil? && !containerRuntime.empty? - # DockerVersion field holds either containerRuntime for non-docker or Dockerversion if its docker - @@CustomProperties["DockerVersion"] = containerRuntime - if containerRuntime.casecmp("docker") == 0 - dockerInfo = DockerApiClient.dockerInfo - if (!dockerInfo.nil? && !dockerInfo.empty?) - @@CustomProperties["DockerVersion"] = dockerInfo["Version"] + begin + # Not doing this for windows since docker is being deprecated soon and we dont want to bring in the socket dependency. + if !@@isWindows.nil? && @@isWindows == false + containerRuntime = ENV[@@EnvContainerRuntime] + if !containerRuntime.nil? && !containerRuntime.empty? + # DockerVersion field holds either containerRuntime for non-docker or Dockerversion if its docker + @@CustomProperties["DockerVersion"] = containerRuntime + if containerRuntime.casecmp("docker") == 0 + dockerInfo = DockerApiClient.dockerInfo + if (!dockerInfo.nil? && !dockerInfo.empty?) 
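IS_SECURE_CADVISOR_PORT above records whether an authenticated call to the kubelet's secure port succeeded, which lets the agent pick the right cAdvisor endpoint later. A rough Ruby equivalent of that probe, offered only as a sketch (node IP and token are illustrative; the kubelet serves a self-signed certificate, hence the relaxed verification):

    require 'net/http'
    require 'openssl'
    require 'uri'

    def secure_cadvisor?(node_ip, token)
      uri = URI("https://#{node_ip}:10250/pods")
      http = Net::HTTP.new(uri.host, uri.port)
      http.use_ssl = true
      http.verify_mode = OpenSSL::SSL::VERIFY_NONE # probe only, mirroring the check above
      request = Net::HTTP::Get.new(uri)
      request['Authorization'] = "Bearer #{token}"
      http.request(request).code == '200'
    rescue StandardError
      false
    end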
+ @@CustomProperties["DockerVersion"] = dockerInfo["Version"] + end + end end end + rescue => errorStr + $log.warn("Exception in AppInsightsUtility: getContainerRuntimeInfo - error: #{errorStr}") end end @@ -262,7 +274,7 @@ def sendMetricTelemetry(metricName, metricValue, properties) end def getWorkspaceId() - begin + begin workspaceId = ENV["WSID"] if workspaceId.nil? || workspaceId.empty? $log.warn("Exception in AppInsightsUtility: getWorkspaceId - WorkspaceID either nil or empty") @@ -274,7 +286,7 @@ def getWorkspaceId() end def getWorkspaceCloud() - begin + begin workspaceDomain = ENV["DOMAIN"] workspaceCloud = "AzureCloud" if workspaceDomain.casecmp("opinsights.azure.com") == 0 diff --git a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb index f02459aef..10720752d 100644 --- a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb +++ b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb @@ -38,7 +38,12 @@ class CAdvisorMetricsAPIClient @npmIntegrationBasic = ENV["TELEMETRY_NPM_INTEGRATION_METRICS_BASIC"] @npmIntegrationAdvanced = ENV["TELEMETRY_NPM_INTEGRATION_METRICS_ADVANCED"] - @LogPath = "/var/opt/microsoft/docker-cimprov/log/kubernetes_perf_log.txt" + @os_type = ENV["OS_TYPE"] + if !@os_type.nil? && !@os_type.empty? && @os_type.strip.casecmp("windows") == 0 + @LogPath = "/etc/omsagentwindows/kubernetes_perf_log.txt" + else + @LogPath = "/var/opt/microsoft/docker-cimprov/log/kubernetes_perf_log.txt" + end @Log = Logger.new(@LogPath, 2, 10 * 1048576) #keep last 2 files, max log file size = 10M # @@rxBytesLast = nil # @@rxBytesTimeLast = nil @@ -142,39 +147,54 @@ def getMetrics(winNode: nil, metricTime: Time.now.utc.iso8601) operatingSystem = "Linux" end if !metricInfo.nil? - metricDataItems.concat(getContainerMemoryMetricItems(metricInfo, hostName, "workingSetBytes", Constants::MEMORY_WORKING_SET_BYTES, metricTime, operatingSystem)) - metricDataItems.concat(getContainerStartTimeMetricItems(metricInfo, hostName, "restartTimeEpoch", metricTime)) - - if operatingSystem == "Linux" - metricDataItems.concat(getContainerCpuMetricItems(metricInfo, hostName, "usageNanoCores", Constants::CPU_USAGE_NANO_CORES, metricTime)) - metricDataItems.concat(getContainerMemoryMetricItems(metricInfo, hostName, "rssBytes", Constants::MEMORY_RSS_BYTES, metricTime, operatingSystem)) - metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "memory", "rssBytes", Constants::MEMORY_RSS_BYTES, metricTime)) - elsif operatingSystem == "Windows" + # Checking if we are in windows daemonset and sending only few metrics that are needed for MDM + if !@os_type.nil? && !@os_type.empty? && @os_type.strip.casecmp("windows") == 0 + # Container metrics + metricDataItems.concat(getContainerMemoryMetricItems(metricInfo, hostName, "workingSetBytes", Constants::MEMORY_WORKING_SET_BYTES, metricTime, operatingSystem)) containerCpuUsageNanoSecondsRate = getContainerCpuMetricItemRate(metricInfo, hostName, "usageCoreNanoSeconds", Constants::CPU_USAGE_NANO_CORES, metricTime) if containerCpuUsageNanoSecondsRate && !containerCpuUsageNanoSecondsRate.empty? && !containerCpuUsageNanoSecondsRate.nil? metricDataItems.concat(containerCpuUsageNanoSecondsRate) end - end + # Node metrics + cpuUsageNanoSecondsRate = getNodeMetricItemRate(metricInfo, hostName, "cpu", "usageCoreNanoSeconds", Constants::CPU_USAGE_NANO_CORES, operatingSystem, metricTime) + if cpuUsageNanoSecondsRate && !cpuUsageNanoSecondsRate.empty? && !cpuUsageNanoSecondsRate.nil? 
+ metricDataItems.push(cpuUsageNanoSecondsRate) + end + metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "memory", "workingSetBytes", Constants::MEMORY_WORKING_SET_BYTES, metricTime)) + else + metricDataItems.concat(getContainerMemoryMetricItems(metricInfo, hostName, "workingSetBytes", Constants::MEMORY_WORKING_SET_BYTES, metricTime, operatingSystem)) + metricDataItems.concat(getContainerStartTimeMetricItems(metricInfo, hostName, "restartTimeEpoch", metricTime)) + + if operatingSystem == "Linux" + metricDataItems.concat(getContainerCpuMetricItems(metricInfo, hostName, "usageNanoCores", Constants::CPU_USAGE_NANO_CORES, metricTime)) + metricDataItems.concat(getContainerMemoryMetricItems(metricInfo, hostName, "rssBytes", Constants::MEMORY_RSS_BYTES, metricTime, operatingSystem)) + metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "memory", "rssBytes", Constants::MEMORY_RSS_BYTES, metricTime)) + elsif operatingSystem == "Windows" + containerCpuUsageNanoSecondsRate = getContainerCpuMetricItemRate(metricInfo, hostName, "usageCoreNanoSeconds", Constants::CPU_USAGE_NANO_CORES, metricTime) + if containerCpuUsageNanoSecondsRate && !containerCpuUsageNanoSecondsRate.empty? && !containerCpuUsageNanoSecondsRate.nil? + metricDataItems.concat(containerCpuUsageNanoSecondsRate) + end + end - cpuUsageNanoSecondsRate = getNodeMetricItemRate(metricInfo, hostName, "cpu", "usageCoreNanoSeconds", Constants::CPU_USAGE_NANO_CORES, operatingSystem, metricTime) - if cpuUsageNanoSecondsRate && !cpuUsageNanoSecondsRate.empty? && !cpuUsageNanoSecondsRate.nil? - metricDataItems.push(cpuUsageNanoSecondsRate) + cpuUsageNanoSecondsRate = getNodeMetricItemRate(metricInfo, hostName, "cpu", "usageCoreNanoSeconds", Constants::CPU_USAGE_NANO_CORES, operatingSystem, metricTime) + if cpuUsageNanoSecondsRate && !cpuUsageNanoSecondsRate.empty? && !cpuUsageNanoSecondsRate.nil? + metricDataItems.push(cpuUsageNanoSecondsRate) + end + metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "memory", "workingSetBytes", Constants::MEMORY_WORKING_SET_BYTES, metricTime)) + + metricDataItems.push(getNodeLastRebootTimeMetric(metricInfo, hostName, "restartTimeEpoch", metricTime)) + # Disabling networkRxRate and networkTxRate since we dont use it as of now. + #metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "network", "rxBytes", "networkRxBytes")) + #metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "network", "txBytes", "networkTxBytes")) + # networkRxRate = getNodeMetricItemRate(metricInfo, hostName, "network", "rxBytes", "networkRxBytesPerSec") + # if networkRxRate && !networkRxRate.empty? && !networkRxRate.nil? + # metricDataItems.push(networkRxRate) + # end + # networkTxRate = getNodeMetricItemRate(metricInfo, hostName, "network", "txBytes", "networkTxBytesPerSec") + # if networkTxRate && !networkTxRate.empty? && !networkTxRate.nil? + # metricDataItems.push(networkTxRate) + # end end - metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "memory", "workingSetBytes", Constants::MEMORY_WORKING_SET_BYTES, metricTime)) - - metricDataItems.push(getNodeLastRebootTimeMetric(metricInfo, hostName, "restartTimeEpoch", metricTime)) - - # Disabling networkRxRate and networkTxRate since we dont use it as of now. 
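The Windows daemonset branch above sends only the series MDM needs, and because these stats expose usageCoreNanoSeconds as a cumulative counter, getContainerCpuMetricItemRate derives cpuUsageNanoCores from consecutive samples: the counter delta divided by the elapsed wall-clock seconds. A sketch of that conversion with illustrative numbers:

    prev_value, prev_time = 4_000_000_000,  Time.now - 60  # counter one minute ago
    curr_value, curr_time = 10_000_000_000, Time.now       # counter now

    usage_nano_cores = (curr_value - prev_value) / (curr_time - prev_time)
    puts "cpuUsageNanoCores ~= #{usage_nano_cores.round}"  # ~0.1 core in this example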
- #metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "network", "rxBytes", "networkRxBytes")) - #metricDataItems.push(getNodeMetricItem(metricInfo, hostName, "network", "txBytes", "networkTxBytes")) - # networkRxRate = getNodeMetricItemRate(metricInfo, hostName, "network", "rxBytes", "networkRxBytesPerSec") - # if networkRxRate && !networkRxRate.empty? && !networkRxRate.nil? - # metricDataItems.push(networkRxRate) - # end - # networkTxRate = getNodeMetricItemRate(metricInfo, hostName, "network", "txBytes", "networkTxBytesPerSec") - # if networkTxRate && !networkTxRate.empty? && !networkTxRate.nil? - # metricDataItems.push(networkTxRate) - # end else @Log.warn("Couldn't get metric information for host: #{hostName}") end @@ -203,7 +223,6 @@ def getContainerCpuMetricItems(metricJSON, hostName, cpuMetricNameToCollect, met containerName = container["name"] metricValue = container["cpu"][cpuMetricNameToCollect] metricTime = metricPollTime #container["cpu"]["time"] - metricItem = {} metricItem["Timestamp"] = metricTime diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index 3720bf6dc..4b50e20d8 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -25,7 +25,12 @@ class KubernetesApiClient #@@IsValidRunningNode = nil #@@IsLinuxCluster = nil @@KubeSystemNamespace = "kube-system" - @LogPath = "/var/opt/microsoft/docker-cimprov/log/kubernetes_client_log.txt" + @os_type = ENV["OS_TYPE"] + if !@os_type.nil? && !@os_type.empty? && @os_type.strip.casecmp("windows") == 0 + @LogPath = "/etc/omsagentwindows/kubernetes_client_log.txt" + else + @LogPath = "/var/opt/microsoft/docker-cimprov/log/kubernetes_client_log.txt" + end @Log = Logger.new(@LogPath, 2, 10 * 1048576) #keep last 2 files, max log file size = 10M @@TokenFileName = "/var/run/secrets/kubernetes.io/serviceaccount/token" @@TokenStr = nil diff --git a/source/plugins/ruby/MdmMetricsGenerator.rb b/source/plugins/ruby/MdmMetricsGenerator.rb index a809087dc..73cf19fac 100644 --- a/source/plugins/ruby/MdmMetricsGenerator.rb +++ b/source/plugins/ruby/MdmMetricsGenerator.rb @@ -10,7 +10,12 @@ class MdmMetricsGenerator require_relative "constants" require_relative "oms_common" - @log_path = "/var/opt/microsoft/docker-cimprov/log/mdm_metrics_generator.log" + @os_type = ENV["OS_TYPE"] + if !@os_type.nil? && !@os_type.empty? 
&& @os_type.strip.casecmp("windows") == 0 + @log_path = "/etc/omsagentwindows/mdm_metrics_generator.log" + else + @log_path = "/var/opt/microsoft/docker-cimprov/log/mdm_metrics_generator.log" + end @log = Logger.new(@log_path, 1, 5000000) @@hostName = (OMS::Common.get_hostname) diff --git a/source/plugins/ruby/arc_k8s_cluster_identity.rb b/source/plugins/ruby/arc_k8s_cluster_identity.rb index 552dafb1f..39b8c1c96 100644 --- a/source/plugins/ruby/arc_k8s_cluster_identity.rb +++ b/source/plugins/ruby/arc_k8s_cluster_identity.rb @@ -18,15 +18,20 @@ class ArcK8sClusterIdentity @@crd_resource_uri_template = "%{kube_api_server_url}/apis/%{cluster_config_crd_api_version}/namespaces/%{cluster_identity_resource_namespace}/azureclusteridentityrequests/%{cluster_identity_resource_name}" @@secret_resource_uri_template = "%{kube_api_server_url}/api/v1/namespaces/%{cluster_identity_token_secret_namespace}/secrets/%{token_secret_name}" @@azure_monitor_custom_metrics_audience = "https://monitoring.azure.com/" - @@cluster_identity_request_kind = "AzureClusterIdentityRequest" + @@cluster_identity_request_kind = "AzureClusterIdentityRequest" def initialize - @LogPath = "/var/opt/microsoft/docker-cimprov/log/arc_k8s_cluster_identity.log" + @os_type = ENV["OS_TYPE"] + if !@os_type.nil? && !@os_type.empty? && @os_type.strip.casecmp("windows") == 0 + @LogPath = "/etc/omsagentwindows/arc_k8s_cluster_identity.log" + else + @LogPath = "/var/opt/microsoft/docker-cimprov/log/arc_k8s_cluster_identity.log" + end @log = Logger.new(@LogPath, 1, 5000000) @log.info "initialize start @ #{Time.now.utc.iso8601}" @token_expiry_time = Time.now @cached_access_token = String.new - @isLastTokenRenewalUpdatePending = false + @isLastTokenRenewalUpdatePending = false @token_file_path = "/var/run/secrets/kubernetes.io/serviceaccount/token" @cert_file_path = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" @kube_api_server_url = KubernetesApiClient.getKubeAPIServerUrl @@ -34,8 +39,8 @@ def initialize @log.warn "got api server url nil from KubernetesApiClient.getKubeAPIServerUrl @ #{Time.now.utc.iso8601}" end @http_client = get_http_client - @service_account_token = get_service_account_token - @extensionName = ENV["ARC_K8S_EXTENSION_NAME"] + @service_account_token = get_service_account_token + @extensionName = ENV["ARC_K8S_EXTENSION_NAME"] @log.info "extension name:#{@extensionName} @ #{Time.now.utc.iso8601}" @log.info "initialize complete @ #{Time.now.utc.iso8601}" end @@ -55,7 +60,7 @@ def get_cluster_identity_token() @isLastTokenRenewalUpdatePending = true else @log.warn "last token renewal update still pending @ #{Time.now.utc.iso8601}" - end + end end @log.info "get token reference from crd @ #{Time.now.utc.iso8601}" tokenReference = get_token_reference_from_crd @@ -68,7 +73,7 @@ def get_cluster_identity_token() token = get_token_from_secret(token_secret_name, token_secret_data_name) if !token.nil? 
@cached_access_token = token - @isLastTokenRenewalUpdatePending = false + @isLastTokenRenewalUpdatePending = false else @log.warn "got token nil from secret: #{@token_secret_name}" end @@ -141,7 +146,7 @@ def get_token_reference_from_crd() create_request.body = crd_request_body_json create_response = @http_client.request(create_request) @log.info "Got response of #{create_response.code} for POST #{crd_request_uri} @ #{Time.now.utc.iso8601}" - end + end rescue => err @log.warn "get_token_reference_from_crd call failed: #{err}" ApplicationInsightsUtility.sendExceptionTelemetry(err, { "FeatureArea" => "MDM" }) @@ -159,7 +164,7 @@ def renew_near_expiry_token() cluster_identity_resource_namespace: @@cluster_identity_resource_namespace, cluster_identity_resource_name: @@cluster_identity_resource_name, } - update_crd_request_body = { 'status': {'expirationTime': ''} } + update_crd_request_body = { 'status': { 'expirationTime': "" } } update_crd_request_body_json = update_crd_request_body.to_json update_crd_request_uri = crd_request_uri + "/status" update_request = Net::HTTP::Patch.new(update_crd_request_uri) @@ -234,9 +239,9 @@ def get_crd_request_body body["metadata"]["namespace"] = @@cluster_identity_resource_namespace body["spec"] = {} body["spec"]["audience"] = @@azure_monitor_custom_metrics_audience - if !@extensionName.nil? && !@extensionName.empty? - body["spec"]["resourceId"] = @extensionName - end + if !@extensionName.nil? && !@extensionName.empty? + body["spec"]["resourceId"] = @extensionName + end return body end end diff --git a/source/plugins/ruby/filter_cadvisor2mdm.rb b/source/plugins/ruby/filter_cadvisor2mdm.rb index 62dcf31dc..9c6b661b0 100644 --- a/source/plugins/ruby/filter_cadvisor2mdm.rb +++ b/source/plugins/ruby/filter_cadvisor2mdm.rb @@ -2,7 +2,7 @@ # frozen_string_literal: true -require 'fluent/plugin/filter' +require "fluent/plugin/filter" module Fluent::Plugin require "logger" @@ -28,6 +28,12 @@ class CAdvisor2MdmFilter < Filter @@metric_threshold_hash = {} @@controller_type = "" + @@isWindows = false + @@os_type = ENV["OS_TYPE"] + if !@@os_type.nil? && !@@os_type.empty? && @@os_type.strip.casecmp("windows") == 0 + @@isWindows = true + end + def initialize super end @@ -130,15 +136,17 @@ def flushMetricTelemetry # Also send for PV usage metrics begin - pvTimeDifference = (DateTime.now.to_time.to_i - @@pvUsageTelemetryTimeTracker).abs - pvTimeDifferenceInMinutes = pvTimeDifference / 60 - if (pvTimeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) - pvProperties = {} - pvProperties["PVUsageThresholdPercentage"] = @@metric_threshold_hash[Constants::PV_USED_BYTES] - pvProperties["PVUsageThresholdExceededInLastFlushInterval"] = @pvExceededUsageThreshold - ApplicationInsightsUtility.sendCustomEvent(Constants::PV_USAGE_HEART_BEAT_EVENT, pvProperties) - @pvExceededUsageThreshold = false - @@pvUsageTelemetryTimeTracker = DateTime.now.to_time.to_i + if !@@isWindows.nil? 
&& @@isWindows == false + pvTimeDifference = (DateTime.now.to_time.to_i - @@pvUsageTelemetryTimeTracker).abs + pvTimeDifferenceInMinutes = pvTimeDifference / 60 + if (pvTimeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) + pvProperties = {} + pvProperties["PVUsageThresholdPercentage"] = @@metric_threshold_hash[Constants::PV_USED_BYTES] + pvProperties["PVUsageThresholdExceededInLastFlushInterval"] = @pvExceededUsageThreshold + ApplicationInsightsUtility.sendCustomEvent(Constants::PV_USAGE_HEART_BEAT_EVENT, pvProperties) + @pvExceededUsageThreshold = false + @@pvUsageTelemetryTimeTracker = DateTime.now.to_time.to_i + end end rescue => errorStr @log.info "Error in flushMetricTelemetry: #{errorStr} for PV usage telemetry" @@ -346,7 +354,6 @@ def ensure_cpu_memory_capacity_set # cpu_capacity and memory_capacity keep initialized value of 0.0 @log.error "Error getting capacity_from_kubelet: cpu_capacity and memory_capacity" end - end end diff --git a/source/plugins/ruby/in_cadvisor_perf.rb b/source/plugins/ruby/in_cadvisor_perf.rb index 781042cea..b3f9bd08b 100644 --- a/source/plugins/ruby/in_cadvisor_perf.rb +++ b/source/plugins/ruby/in_cadvisor_perf.rb @@ -1,16 +1,20 @@ #!/usr/local/bin/ruby # frozen_string_literal: true -require 'fluent/plugin/input' +require "fluent/plugin/input" module Fluent::Plugin - class CAdvisor_Perf_Input < Input Fluent::Plugin.register_input("cadvisor_perf", self) + @@isWindows = false + @@os_type = ENV["OS_TYPE"] + if !@@os_type.nil? && !@@os_type.empty? && @@os_type.strip.casecmp("windows") == 0 + @@isWindows = true + end def initialize super require "yaml" - require 'yajl/json_gem' + require "yajl/json_gem" require "time" require_relative "CAdvisorMetricsAPIClient" @@ -69,31 +73,32 @@ def enumerate() router.emit_stream(@containerhealthtag, eventStream) if eventStream router.emit_stream(@nodehealthtag, eventStream) if eventStream - if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && eventStream.count > 0) $log.info("cAdvisorPerfEmitStreamSuccess @ #{Time.now.utc.iso8601}") end #start GPU InsightsMetrics items begin - containerGPUusageInsightsMetricsDataItems = [] - containerGPUusageInsightsMetricsDataItems.concat(CAdvisorMetricsAPIClient.getInsightsMetrics(winNode: nil, metricTime: batchTime)) + if !@@isWindows.nil? && @@isWindows == false + containerGPUusageInsightsMetricsDataItems = [] + containerGPUusageInsightsMetricsDataItems.concat(CAdvisorMetricsAPIClient.getInsightsMetrics(winNode: nil, metricTime: batchTime)) containerGPUusageInsightsMetricsDataItems.each do |insightsMetricsRecord| insightsMetricsEventStream.add(time, insightsMetricsRecord) if insightsMetricsRecord end - router.emit_stream(@insightsmetricstag, insightsMetricsEventStream) if insightsMetricsEventStream - router.emit_stream(@mdmtag, insightsMetricsEventStream) if insightsMetricsEventStream - - if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0) - $log.info("cAdvisorInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") + router.emit_stream(@insightsmetricstag, insightsMetricsEventStream) if insightsMetricsEventStream + router.emit_stream(@mdmtag, insightsMetricsEventStream) if insightsMetricsEventStream + + if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0) + $log.info("cAdvisorInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") + end end rescue => errorStr $log.warn "Failed when processing GPU Usage metrics in_cadvisor_perf : #{errorStr}" $log.debug_backtrace(errorStr.backtrace) ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) - end + end #end GPU InsightsMetrics items rescue => errorStr diff --git a/source/plugins/ruby/in_win_cadvisor_perf.rb b/source/plugins/ruby/in_win_cadvisor_perf.rb index 61e823ea6..9ab2474b1 100644 --- a/source/plugins/ruby/in_win_cadvisor_perf.rb +++ b/source/plugins/ruby/in_win_cadvisor_perf.rb @@ -1,7 +1,7 @@ #!/usr/local/bin/ruby # frozen_string_literal: true -require 'fluent/plugin/input' +require "fluent/plugin/input" module Fluent::Plugin class Win_CAdvisor_Perf_Input < Input @@ -20,7 +20,7 @@ def initialize require_relative "oms_common" require_relative "omslog" require_relative "constants" - @insightsMetricsTag = "oneagent.containerInsights.INSIGHTS_METRICS_BLOB" + @insightsMetricsTag = "oneagent.containerInsights.INSIGHTS_METRICS_BLOB" end config_param :run_interval, :time, :default => 60 @@ -57,7 +57,7 @@ def enumerate() begin timeDifference = (DateTime.now.to_time.to_i - @@winNodeQueryTimeTracker).abs timeDifferenceInMinutes = timeDifference / 60 - @@istestvar = ENV["ISTEST"] + @@istestvar = ENV["ISTEST"] #Resetting this cache so that it is populated with the current set of containers with every call CAdvisorMetricsAPIClient.resetWinContainerIdCache() @@ -79,7 +79,6 @@ def enumerate() end end router.emit_stream(@tag, eventStream) if eventStream - router.emit_stream(@mdmtag, eventStream) if eventStream if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && eventStream.count > 0) $log.info("winCAdvisorPerfEmitStreamSuccess @ #{Time.now.utc.iso8601}") @@ -89,10 +88,10 @@ def enumerate() begin containerGPUusageInsightsMetricsDataItems = [] containerGPUusageInsightsMetricsDataItems.concat(CAdvisorMetricsAPIClient.getInsightsMetrics(winNode: winNode, metricTime: Time.now.utc.iso8601)) - insightsMetricsEventStream = Fluent::MultiEventStream.new + insightsMetricsEventStream = Fluent::MultiEventStream.new containerGPUusageInsightsMetricsDataItems.each do |insightsMetricsRecord| - insightsMetricsEventStream.add(time, insightsMetricsRecord) if insightsMetricsRecord + insightsMetricsEventStream.add(time, insightsMetricsRecord) if insightsMetricsRecord end router.emit_stream(@insightsMetricsTag, insightsMetricsEventStream) if insightsMetricsEventStream diff --git a/source/plugins/ruby/kubelet_utils.rb b/source/plugins/ruby/kubelet_utils.rb index e2c731b79..22bc87c0e 100644 --- a/source/plugins/ruby/kubelet_utils.rb +++ b/source/plugins/ruby/kubelet_utils.rb @@ -9,7 +9,12 @@ require "bigdecimal" class KubeletUtils - @log_path = "/var/opt/microsoft/docker-cimprov/log/filter_cadvisor2mdm.log" + @os_type = ENV["OS_TYPE"] + if !@os_type.nil? && !@os_type.empty? 
&& @os_type.strip.casecmp("windows") == 0
+ @log_path = "/etc/omsagentwindows/filter_cadvisor2mdm.log"
+ else
+ @log_path = "/var/opt/microsoft/docker-cimprov/log/filter_cadvisor2mdm.log"
+ end
@log = Logger.new(@log_path, 1, 5000000)

class << self

From e4da5193c13162d1556999198b7572ce687a0c78 Mon Sep 17 00:00:00 2001
From: David Michelman
Date: Thu, 27 May 2021 20:23:25 -0700
Subject: [PATCH 107/194] updating json gem to address CVE-2020-10663 (#567)

* updating json gem to address CVE-2020-10663

* updating json gem to address CVE-2020-10663
---
 kubernetes/linux/setup.sh | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/kubernetes/linux/setup.sh b/kubernetes/linux/setup.sh
index 3d00e4c57..b8829e13b 100644
--- a/kubernetes/linux/setup.sh
+++ b/kubernetes/linux/setup.sh
@@ -44,11 +44,13 @@ sudo echo "deb https://packages.fluentbit.io/ubuntu/xenial xenial main" >> /etc/
 sudo apt-get update
 sudo apt-get install td-agent-bit=1.6.8 -y

-# install ruby2.6
+# install ruby2.6
 sudo apt-get install software-properties-common -y
 sudo apt-add-repository ppa:brightbox/ruby-ng -y
 sudo apt-get update
 sudo apt-get install ruby2.6 ruby2.6-dev gcc make -y
+# to fix CVE-2020-10663
+gem update json -v 2.5.1
 # fluentd v1 gem
 gem install fluentd -v "1.12.2" --no-document
 fluentd --setup ./fluent

From 49486a8df6bc06336fc11ab51b0d19ac36497006 Mon Sep 17 00:00:00 2001
From: David Michelman
Date: Thu, 27 May 2021 21:37:34 -0700
Subject: [PATCH 108/194] update recommended alerts readme (#570)

@dcbrown16 pointed out that this page links to the wrong document in [this issue](https://github.com/microsoft/Docker-Provider/issues/475). The content in the currently linked page is identical to the page which should be linked, so it's a simple fix.
---
 alerts/recommended_alerts_ARM/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/alerts/recommended_alerts_ARM/README.md b/alerts/recommended_alerts_ARM/README.md
index 81dec8929..53e5a9ad7 100644
--- a/alerts/recommended_alerts_ARM/README.md
+++ b/alerts/recommended_alerts_ARM/README.md
@@ -24,7 +24,7 @@ Completed job count|Calculates number of jobs completed more than six hours ago.
 ### How to enable with a Resource Manager template
 1. Download one or all of the available templates that describe how to create the alert.
-2. Create and use a [parameters file](https://review.docs.microsoft.com/azure/azure-resource-manager/templates/parameter-files) as a JSON to set the values required to create the alert rule.
+2. Create and use a [parameters file](https://docs.microsoft.com/en-us/azure/azure-resource-manager/templates/parameter-files) as a JSON to set the values required to create the alert rule.
 3. Deploy the template from the Azure portal, PowerShell, or Azure CLI.
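CVE-2020-10663, addressed by the setup.sh hunk above, is the "unsafe object creation" flaw: json gem versions through 2.2.0 could be coerced into instantiating arbitrary objects while parsing attacker-supplied documents. A sketch of the behavior the patched gem guarantees (the smuggled class name is illustrative):

    require 'json'

    doc = '{"json_class":"Foo","data":[]}'          # a document that tries to smuggle a type
    obj = JSON.parse(doc, create_additions: false)  # plain parse, no object revival
    puts obj.class                                  # => Hash, never Foo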
For step by step procedures on how to enable alerts via Resource manager, please go [here.](https://aka.ms/ci_alerts_arm) From ef23fc684f7dfb62e61e6ae0634c7ba02a39ca20 Mon Sep 17 00:00:00 2001 From: David Michelman Date: Fri, 28 May 2021 15:10:53 -0700 Subject: [PATCH 109/194] trying again to fix the json gem (#571) * trying again to fix the json gem * removing installation of newer json gem --- kubernetes/linux/setup.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/kubernetes/linux/setup.sh b/kubernetes/linux/setup.sh index b8829e13b..17cfb3f77 100644 --- a/kubernetes/linux/setup.sh +++ b/kubernetes/linux/setup.sh @@ -49,8 +49,6 @@ sudo apt-get install software-properties-common -y sudo apt-add-repository ppa:brightbox/ruby-ng -y sudo apt-get update sudo apt-get install ruby2.6 ruby2.6-dev gcc make -y -# to fix CVE-2020-10663 -gem update json -v 2.5.1 # fluentd v1 gem gem install fluentd -v "1.12.2" --no-document fluentd --setup ./fluent From cfa804a1adeb3eb4e82d78f14569e3238e2f6dbd Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Tue, 1 Jun 2021 15:37:14 -0700 Subject: [PATCH 110/194] Addressing PR comments for - https://github.com/microsoft/Docker-Provider/pull/568 (#569) --- source/plugins/ruby/ApplicationInsightsUtility.rb | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/source/plugins/ruby/ApplicationInsightsUtility.rb b/source/plugins/ruby/ApplicationInsightsUtility.rb index 74d08c1e6..31f9503cd 100644 --- a/source/plugins/ruby/ApplicationInsightsUtility.rb +++ b/source/plugins/ruby/ApplicationInsightsUtility.rb @@ -139,12 +139,12 @@ def initializeUtility() def getContainerRuntimeInfo() begin - # Not doing this for windows since docker is being deprecated soon and we dont want to bring in the socket dependency. - if !@@isWindows.nil? && @@isWindows == false - containerRuntime = ENV[@@EnvContainerRuntime] - if !containerRuntime.nil? && !containerRuntime.empty? - # DockerVersion field holds either containerRuntime for non-docker or Dockerversion if its docker - @@CustomProperties["DockerVersion"] = containerRuntime + containerRuntime = ENV[@@EnvContainerRuntime] + if !containerRuntime.nil? && !containerRuntime.empty? + # DockerVersion field holds either containerRuntime for non-docker or Dockerversion if its docker + @@CustomProperties["DockerVersion"] = containerRuntime + # Not doing this for windows since docker is being deprecated soon and we dont want to bring in the socket dependency. + if !@@isWindows.nil? && @@isWindows == false if containerRuntime.casecmp("docker") == 0 dockerInfo = DockerApiClient.dockerInfo if (!dockerInfo.nil? && !dockerInfo.empty?) 
From 0d3e4a13ef07e44ea834ced3c317eee98b694c16 Mon Sep 17 00:00:00 2001 From: Tsubasa Nomura Date: Fri, 11 Jun 2021 07:35:44 +0900 Subject: [PATCH 111/194] Mem_Buf_limit is configurable via ConfigMap (#574) --- .../installer/scripts/td-agent-bit-conf-customizer.rb | 6 ++++++ build/linux/installer/conf/td-agent-bit.conf | 2 +- .../linux/installer/scripts/tomlparser-agent-config.rb | 10 ++++++++++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/build/common/installer/scripts/td-agent-bit-conf-customizer.rb b/build/common/installer/scripts/td-agent-bit-conf-customizer.rb index ea1536866..82c6c1d17 100644 --- a/build/common/installer/scripts/td-agent-bit-conf-customizer.rb +++ b/build/common/installer/scripts/td-agent-bit-conf-customizer.rb @@ -6,6 +6,7 @@ @default_service_interval = "1" @default_buffer_chunk_size = "1" @default_buffer_max_size = "1" +@default_mem_buf_limit = "10" def is_number?(value) true if Integer(value) rescue false @@ -19,6 +20,7 @@ def substituteFluentBitPlaceHolders interval = ENV["FBIT_SERVICE_FLUSH_INTERVAL"] bufferChunkSize = ENV["FBIT_TAIL_BUFFER_CHUNK_SIZE"] bufferMaxSize = ENV["FBIT_TAIL_BUFFER_MAX_SIZE"] + memBufLimit = ENV["FBIT_TAIL_MEM_BUF_LIMIT"] serviceInterval = (!interval.nil? && is_number?(interval) && interval.to_i > 0 ) ? interval : @default_service_interval serviceIntervalSetting = "Flush " + serviceInterval @@ -32,8 +34,12 @@ def substituteFluentBitPlaceHolders tailBufferMaxSize = tailBufferChunkSize end + tailMemBufLimit = (!memBufLimit.nil? && is_number?(memBufLimit) && memBufLimit.to_i > 10) ? memBufLimit : @default_mem_buf_limit + tailMemBufLimitSetting = "Mem_Buf_Limit " + tailMemBufLimit + "m" + text = File.read(@td_agent_bit_conf_path) new_contents = text.gsub("${SERVICE_FLUSH_INTERVAL}", serviceIntervalSetting) + new_contents = new_contents.gsub("${TAIL_MEM_BUF_LIMIT}", tailMemBufLimitSetting) if !tailBufferChunkSize.nil? new_contents = new_contents.gsub("${TAIL_BUFFER_CHUNK_SIZE}", "Buffer_Chunk_Size " + tailBufferChunkSize + "m") else diff --git a/build/linux/installer/conf/td-agent-bit.conf b/build/linux/installer/conf/td-agent-bit.conf index 045aefcaf..beba6a3ca 100644 --- a/build/linux/installer/conf/td-agent-bit.conf +++ b/build/linux/installer/conf/td-agent-bit.conf @@ -19,7 +19,7 @@ DB /var/log/omsagent-fblogs.db DB.Sync Off Parser docker - Mem_Buf_Limit 10m + ${TAIL_MEM_BUF_LIMIT} ${TAIL_BUFFER_CHUNK_SIZE} ${TAIL_BUFFER_MAX_SIZE} Rotate_Wait 20 diff --git a/build/linux/installer/scripts/tomlparser-agent-config.rb b/build/linux/installer/scripts/tomlparser-agent-config.rb index e587909e5..4daaf6a0c 100644 --- a/build/linux/installer/scripts/tomlparser-agent-config.rb +++ b/build/linux/installer/scripts/tomlparser-agent-config.rb @@ -59,6 +59,7 @@ @fbitFlushIntervalSecs = 0 @fbitTailBufferChunkSizeMBs = 0 @fbitTailBufferMaxSizeMBs = 0 +@fbitTailMemBufLimitMBs = 0 def is_number?(value) @@ -168,6 +169,12 @@ def populateSettingValuesFromConfigMap(parsedConfig) @fbitTailBufferMaxSizeMBs = @fbitTailBufferChunkSizeMBs puts "config::warn: since tail_buf_maxsize_megabytes not provided hence using tail_buf_maxsize_megabytes=#{@fbitTailBufferMaxSizeMBs} which is same as the value of tail_buf_chunksize_megabytes" end + + fbitTailMemBufLimitMBs = fbit_config[:tail_mem_buf_limit_megabytes] + if !fbitTailMemBufLimitMBs.nil? 
&& is_number?(fbitTailMemBufLimitMBs) && fbitTailMemBufLimitMBs.to_i > 0 + @fbitTailMemBufLimitMBs = fbitTailMemBufLimitMBs.to_i + puts "Using config map value: tail_mem_buf_limit_megabytes = #{@fbitTailMemBufLimitMBs}" + end end end rescue => errorStr @@ -212,6 +219,9 @@ def populateSettingValuesFromConfigMap(parsedConfig) if @fbitTailBufferMaxSizeMBs > 0 file.write("export FBIT_TAIL_BUFFER_MAX_SIZE=#{@fbitTailBufferMaxSizeMBs}\n") end + if @fbitTailMemBufLimitMBs > 0 + file.write("export FBIT_TAIL_MEM_BUF_LIMIT=#{@fbitTailMemBufLimitMBs}\n") + end # Close file after writing all environment variables file.close else From 50b99fff5c97780601f438610e3126c7a5df7401 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Fri, 11 Jun 2021 10:59:03 -0700 Subject: [PATCH 112/194] add log rotation settings for fluentd logs (#577) --- kubernetes/linux/main.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index b21ed6b96..b9e338fa9 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -478,10 +478,10 @@ fi if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then if [ ! -e "/etc/config/kube.conf" ]; then echo "*** starting fluentd v1 in daemonset" - fluentd -c /etc/fluent/container.conf -o /var/opt/microsoft/docker-cimprov/log/fluentd.log & + fluentd -c /etc/fluent/container.conf -o /var/opt/microsoft/docker-cimprov/log/fluentd.log --log-rotate-age 5 --log-rotate-size 20971520 & else echo "*** starting fluentd v1 in replicaset" - fluentd -c /etc/fluent/kube.conf -o /var/opt/microsoft/docker-cimprov/log/fluentd.log & + fluentd -c /etc/fluent/kube.conf -o /var/opt/microsoft/docker-cimprov/log/fluentd.log --log-rotate-age 5 --log-rotate-size 20971520 & fi fi From 4cebe73a6a000b91183b4bdf45b5cd2f2d069d3c Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Fri, 11 Jun 2021 12:28:21 -0700 Subject: [PATCH 113/194] Gangams/release 06112021 (#578) * updates related to ciprod06112021 release * minor update --- ReleaseNotes.md | 17 +++++++++++++++++ build/version | 6 +++--- kubernetes/linux/Dockerfile | 2 +- kubernetes/omsagent.yaml | 14 +++++++------- kubernetes/windows/Dockerfile | 2 +- 5 files changed, 29 insertions(+), 12 deletions(-) diff --git a/ReleaseNotes.md b/ReleaseNotes.md index d7d6de6af..394caba09 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -11,6 +11,23 @@ additional questions or comments. Note : The agent version(s) below has dates (ciprod), which indicate the agent build dates (not release dates) +### 06/11/2021 - +##### Version microsoft/oms:ciprod06112021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod06112021 (linux) +##### Version microsoft/oms:win-ciprod06112021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod06112021 (windows) + - Linux Agent + - Removal of base omsagent dependency + - Using MDSD version 1.10.1 as base agent for all the supported LA data types + - Ruby version upgrade to 2.6 i.e. 
same version as windows agent + - Upgrade FluentD gem version to 1.12.2 + - All the Ruby Fluentd Plugins upgraded to v1 as per Fluentd guidance + - Windows Agent + - CA cert changes for airgapped clouds + - Send perf metrics to MDM from windows daemonset + - FluentD gem version upgrade from 1.10.2 to 1.12.2 to make same version as Linux Agent + - Doc updates + - README updates related to OSM preview release for Arc K8s + - README updates related to recommended alerts + ### 05/20/2021 - ##### Version microsoft/oms:ciprod05202021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod05202021 (linux) ##### No Windows changes with this release, win-ciprod04222021 still current. diff --git a/build/version b/build/version index d70d1f9bc..95d20e931 100644 --- a/build/version +++ b/build/version @@ -2,11 +2,11 @@ # Build Version Information -CONTAINER_BUILDVERSION_MAJOR=15 -CONTAINER_BUILDVERSION_MINOR=2 +CONTAINER_BUILDVERSION_MAJOR=16 +CONTAINER_BUILDVERSION_MINOR=0 CONTAINER_BUILDVERSION_PATCH=0 CONTAINER_BUILDVERSION_BUILDNR=0 -CONTAINER_BUILDVERSION_DATE=20210512 +CONTAINER_BUILDVERSION_DATE=20210611 CONTAINER_BUILDVERSION_STATUS=Developer_Build #-------------------------------- End of File ----------------------------------- diff --git a/kubernetes/linux/Dockerfile b/kubernetes/linux/Dockerfile index 3ad3cd315..1ae7bef61 100644 --- a/kubernetes/linux/Dockerfile +++ b/kubernetes/linux/Dockerfile @@ -2,7 +2,7 @@ FROM ubuntu:18.04 MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="Azure Monitor for containers" -ARG IMAGE_TAG=ciprod05202021 +ARG IMAGE_TAG=ciprod06112021 ENV AGENT_VERSION ${IMAGE_TAG} ENV tmpdir /opt ENV APPLICATIONINSIGHTS_AUTH NzAwZGM5OGYtYTdhZC00NThkLWI5NWMtMjA3ZjM3NmM3YmRi diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 4290e1d59..617c81f38 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -358,7 +358,7 @@ spec: tier: node annotations: agentVersion: "1.10.0.1" - dockerProviderVersion: "15.2.0-0" + dockerProviderVersion: "16.0.0-0" schema-versions: "v1" spec: serviceAccountName: omsagent @@ -368,7 +368,7 @@ spec: value: "3" containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod05202021" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod06112021" imagePullPolicy: IfNotPresent resources: limits: @@ -446,7 +446,7 @@ spec: timeoutSeconds: 15 #Only in sidecar scraping mode - name: omsagent-prometheus - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod05202021" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod06112021" imagePullPolicy: IfNotPresent resources: limits: @@ -583,13 +583,13 @@ spec: rsName: "omsagent-rs" annotations: agentVersion: "1.10.0.1" - dockerProviderVersion: "15.2.0-0" + dockerProviderVersion: "16.0.0-0" schema-versions: "v1" spec: serviceAccountName: omsagent containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod05202021" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod06112021" imagePullPolicy: IfNotPresent resources: limits: @@ -750,7 +750,7 @@ spec: tier: node-win annotations: agentVersion: "1.10.0.1" - dockerProviderVersion: "15.0.0-0" + dockerProviderVersion: "16.0.0-0" schema-versions: "v1" spec: serviceAccountName: omsagent @@ -760,7 +760,7 @@ spec: value: "3" containers: - name: omsagent-win - image: 
"mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod04222021" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod06112021" imagePullPolicy: IfNotPresent resources: limits: diff --git a/kubernetes/windows/Dockerfile b/kubernetes/windows/Dockerfile index 5a5298d0b..997b2f310 100644 --- a/kubernetes/windows/Dockerfile +++ b/kubernetes/windows/Dockerfile @@ -3,7 +3,7 @@ MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="Azure Monitor for containers" -ARG IMAGE_TAG=win-ciprod04222021 +ARG IMAGE_TAG=win-ciprod06112021 # Do not split this into multiple RUN! # Docker creates a layer for every RUN-Statement From adabaf93ac2e3a334e3e921548b80ac3b0d32487 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Fri, 11 Jun 2021 12:56:56 -0700 Subject: [PATCH 114/194] release note update (#579) --- ReleaseNotes.md | 1 + 1 file changed, 1 insertion(+) diff --git a/ReleaseNotes.md b/ReleaseNotes.md index 394caba09..266dadf1c 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -20,6 +20,7 @@ Note : The agent version(s) below has dates (ciprod), which indicate t - Ruby version upgrade to 2.6 i.e. same version as windows agent - Upgrade FluentD gem version to 1.12.2 - All the Ruby Fluentd Plugins upgraded to v1 as per Fluentd guidance + - Fluent-bit tail plugin Mem_Buf_limit is configurable via ConfigMap - Windows Agent - CA cert changes for airgapped clouds - Send perf metrics to MDM from windows daemonset From 0c701207300557191cc7adec9967215c4f8daa8c Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Mon, 14 Jun 2021 14:39:50 -0700 Subject: [PATCH 115/194] Make sidecar fluentbit chunk size configurable (#573) --- Documentation/AgentSettings/ReadMe.md | 26 +++++ .../scripts/tomlparser-prom-agent-config.rb | 102 ++++++++++++++++++ .../conf/td-agent-bit-prom-side-car.conf | 6 +- .../installer/datafiles/base_container.data | 3 +- kubernetes/container-azm-ms-agentconfig.yaml | 10 ++ .../linux/defaultpromenvvariables-sidecar | 3 + kubernetes/linux/main.sh | 15 +++ source/plugins/go/src/telemetry.go | 4 + 8 files changed, 165 insertions(+), 4 deletions(-) create mode 100644 Documentation/AgentSettings/ReadMe.md create mode 100644 build/common/installer/scripts/tomlparser-prom-agent-config.rb diff --git a/Documentation/AgentSettings/ReadMe.md b/Documentation/AgentSettings/ReadMe.md new file mode 100644 index 000000000..3e55d7d44 --- /dev/null +++ b/Documentation/AgentSettings/ReadMe.md @@ -0,0 +1,26 @@ +## Configurable agent settings for high scale prometheus metric scraping using pod annotations with prometheus sidecar. + +Container Insights agent runs native prometheus telegraf plugin to scrape prometheus metrics using pod annotations. +The metrics scraped from the telegraf plugin are sent to the fluent bit tcp listener. +In order to support higher volumes of prometheus metrics scraping some of the tcp listener settings can be tuned. +[Fluent Bit TCP listener](https://docs.fluentbit.io/manual/pipeline/inputs/tcp) + +* Chunk Size - This can be increased to process bigger chunks of data. + +* Buffer Size - This should be greater than or equal to the chunk size. + +* Mem Buf Limit - This can be increased to increase the buffer size. But the memory limit on the sidecar also needs to be increased accordingly. +Note that this can only be achieved using helm chart today. + + +** Note - The LA ingestion team also states that higher chunk sizes might not necessarily mean higher throughput since there are pipeline limitations. 
+ +``` + agent-settings: |- + # prometheus scrape fluent bit settings for high scale + # buffer size should be greater than or equal to chunk size else we set it to chunk size. + [agent_settings.prometheus_fbit_settings] + tcp_listener_chunk_size = 10 + tcp_listener_buffer_size = 10 + tcp_listener_mem_buf_limit = 200 +``` diff --git a/build/common/installer/scripts/tomlparser-prom-agent-config.rb b/build/common/installer/scripts/tomlparser-prom-agent-config.rb new file mode 100644 index 000000000..be9d08e59 --- /dev/null +++ b/build/common/installer/scripts/tomlparser-prom-agent-config.rb @@ -0,0 +1,102 @@ +#!/usr/local/bin/ruby + +#this should be require relative in Linux and require in windows, since it is a gem install on windows +@os_type = ENV["OS_TYPE"] +if !@os_type.nil? && !@os_type.empty? && @os_type.strip.casecmp("windows") == 0 + require "tomlrb" +else + require_relative "tomlrb" +end + +require_relative "ConfigParseErrorLogger" + +@configMapMountPath = "/etc/config/settings/agent-settings" +@configSchemaVersion = "" + +@promFbitChunkSize = 10 +@promFbitBufferSize = 10 +@promFbitMemBufLimit = 200 + +def is_number?(value) + true if Integer(value) rescue false +end + +# Use parser to parse the configmap toml file to a ruby structure +def parseConfigMap + begin + # Check to see if config map is created + if (File.file?(@configMapMountPath)) + puts "config::configmap container-azm-ms-agentconfig for sidecar agent settings mounted, parsing values" + parsedConfig = Tomlrb.load_file(@configMapMountPath, symbolize_keys: true) + puts "config::Successfully parsed mounted config map" + return parsedConfig + else + puts "config::configmap container-azm-ms-agentconfig for sidecar agent settings not mounted, using defaults" + return nil + end + rescue => errorStr + ConfigParseErrorLogger.logError("Exception while parsing config map for sidecar agent settings : #{errorStr}, using defaults, please check config map for errors") + return nil + end +end + +# Use the ruby structure created after config parsing to set the right values to be used as environment variables +def populateSettingValuesFromConfigMap(parsedConfig) + begin + if !parsedConfig.nil? && !parsedConfig[:agent_settings].nil? + # fbit config settings + prom_fbit_config = parsedConfig[:agent_settings][:prometheus_fbit_settings] + if !prom_fbit_config.nil? + chunk_size = prom_fbit_config[:tcp_listener_chunk_size] + if !chunk_size.nil? && is_number?(chunk_size) && chunk_size.to_i > 0 + @promFbitChunkSize = chunk_size.to_i + puts "Using config map value: AZMON_SIDECAR_FBIT_CHUNK_SIZE = #{@promFbitChunkSize.to_s + "m"}" + end + buffer_size = prom_fbit_config[:tcp_listener_buffer_size] + if !buffer_size.nil? && is_number?(buffer_size) && buffer_size.to_i > 0 + @promFbitBufferSize = buffer_size.to_i + puts "Using config map value: AZMON_SIDECAR_FBIT_BUFFER_SIZE = #{@promFbitBufferSize.to_s + "m"}" + if @promFbitBufferSize < @promFbitChunkSize + @promFbitBufferSize = @promFbitChunkSize + puts "Setting Fbit buffer size equal to chunk size since it is set to less than chunk size - AZMON_SIDECAR_FBIT_BUFFER_SIZE = #{@promFbitBufferSize.to_s + "m"}" + end + end + mem_buf_limit = prom_fbit_config[:tcp_listener_mem_buf_limit] + if !mem_buf_limit.nil? 
&& is_number?(mem_buf_limit) && mem_buf_limit.to_i > 0 + @promFbitMemBufLimit = mem_buf_limit.to_i + puts "Using config map value: AZMON_SIDECAR_FBIT_MEM_BUF_LIMIT = #{@promFbitMemBufLimit.to_s + "m"}" + end + end + end + rescue => errorStr + puts "config::error:Exception while reading config settings for sidecar agent configuration setting - #{errorStr}, using defaults" + end +end + +@configSchemaVersion = ENV["AZMON_AGENT_CFG_SCHEMA_VERSION"] +puts "****************Start Sidecar Agent Config Processing********************" +if !@configSchemaVersion.nil? && !@configSchemaVersion.empty? && @configSchemaVersion.strip.casecmp("v1") == 0 #note v1 is the only supported schema version , so hardcoding it + configMapSettings = parseConfigMap + if !configMapSettings.nil? + populateSettingValuesFromConfigMap(configMapSettings) + end +else + if (File.file?(@configMapMountPath)) + ConfigParseErrorLogger.logError("config::unsupported/missing config schema version - '#{@configSchemaVersion}' , using defaults, please use supported schema version") + end + @enable_health_model = false +end + +# Write the settings to file, so that they can be set as environment variables +file = File.open("side_car_fbit_config_env_var", "w") + +if !file.nil? + file.write("export AZMON_SIDECAR_FBIT_CHUNK_SIZE=#{@promFbitChunkSize.to_s + "m"}\n") + file.write("export AZMON_SIDECAR_FBIT_BUFFER_SIZE=#{@promFbitBufferSize.to_s + "m"}\n") + file.write("export AZMON_SIDECAR_FBIT_MEM_BUF_LIMIT=#{@promFbitMemBufLimit.to_s + "m"}\n") + # Close file after writing all environment variables + file.close +else + puts "Exception while opening file for writing config environment variables" + puts "****************End Sidecar Agent Config Processing********************" +end diff --git a/build/linux/installer/conf/td-agent-bit-prom-side-car.conf b/build/linux/installer/conf/td-agent-bit-prom-side-car.conf index 8a69f7995..2c85a4200 100644 --- a/build/linux/installer/conf/td-agent-bit-prom-side-car.conf +++ b/build/linux/installer/conf/td-agent-bit-prom-side-car.conf @@ -29,9 +29,9 @@ Tag oms.container.perf.telegraf.* Listen 0.0.0.0 Port 25229 - Chunk_Size 10m - Buffer_Size 10m - Mem_Buf_Limit 200m + Chunk_Size ${AZMON_SIDECAR_FBIT_CHUNK_SIZE} + Buffer_Size ${AZMON_SIDECAR_FBIT_BUFFER_SIZE} + Mem_Buf_Limit ${AZMON_SIDECAR_FBIT_MEM_BUF_LIMIT} [OUTPUT] Name oms diff --git a/build/linux/installer/datafiles/base_container.data b/build/linux/installer/datafiles/base_container.data index de8ccbba0..88c790be3 100644 --- a/build/linux/installer/datafiles/base_container.data +++ b/build/linux/installer/datafiles/base_container.data @@ -42,7 +42,8 @@ MAINTAINER: 'Microsoft Corporation' /opt/microsoft/docker-cimprov/bin/TelegrafTCPErrorTelemetry.sh; build/linux/installer/scripts/TelegrafTCPErrorTelemetry.sh; 755; root; root /opt/livenessprobe.sh; build/linux/installer/scripts/livenessprobe.sh; 755; root; root /opt/tomlparser-prom-customconfig.rb; build/common/installer/scripts/tomlparser-prom-customconfig.rb; 755; root; root -/opt/tomlparser-mdm-metrics-config.rb; build/common/installer/scripts/tomlparser-mdm-metrics-config.rb; 755; root; root +/opt/tomlparser-prom-agent-config.rb; build/common/installer/scripts/tomlparser-prom-agent-config.rb; 755; root; root +/opt/tomlparser-mdm-metrics-config.rb; build/common/installer/scripts/tomlparser-mdm-metrics-config.rb; 755; root; root /opt/tomlparser-metric-collection-config.rb; build/linux/installer/scripts/tomlparser-metric-collection-config.rb; 755; root; root diff --git 
a/kubernetes/container-azm-ms-agentconfig.yaml b/kubernetes/container-azm-ms-agentconfig.yaml index 543f270c1..21b31f76f 100644 --- a/kubernetes/container-azm-ms-agentconfig.yaml +++ b/kubernetes/container-azm-ms-agentconfig.yaml @@ -135,6 +135,16 @@ data: [integrations.azure_network_policy_manager] collect_basic_metrics = false collect_advanced_metrics = false + +# Doc - https://github.com/microsoft/Docker-Provider/blob/ci_prod/Documentation/AgentSettings/ReadMe.md + agent-settings: |- + # prometheus scrape fluent bit settings for high scale + # buffer size should be greater than or equal to chunk size else we set it to chunk size. + [agent_settings.prometheus_fbit_settings] + tcp_listener_chunk_size = 10 + tcp_listener_buffer_size = 10 + tcp_listener_mem_buf_limit = 200 + metadata: name: container-azm-ms-agentconfig namespace: kube-system diff --git a/kubernetes/linux/defaultpromenvvariables-sidecar b/kubernetes/linux/defaultpromenvvariables-sidecar index 3301488d8..68388f88e 100644 --- a/kubernetes/linux/defaultpromenvvariables-sidecar +++ b/kubernetes/linux/defaultpromenvvariables-sidecar @@ -7,3 +7,6 @@ export AZMON_TELEGRAF_CUSTOM_PROM_PLUGINS_WITH_NAMESPACE_FILTER="" export AZMON_TELEGRAF_OSM_PROM_PLUGINS="" export AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_LABEL_SELECTOR="kubernetes_label_selector = ''" export AZMON_TELEGRAF_CUSTOM_PROM_KUBERNETES_FIELD_SELECTOR="kubernetes_field_selector = ''" +export AZMON_SIDECAR_FBIT_CHUNK_SIZE="10m" +export AZMON_SIDECAR_FBIT_BUFFER_SIZE="10m" +export AZMON_SIDECAR_FBIT_MEM_BUF_LIMIT="200m" diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index b9e338fa9..ec348bba3 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -306,6 +306,21 @@ if [ -e "telemetry_prom_config_env_var" ]; then source telemetry_prom_config_env_var fi +#Parse sidecar agent settings for custom configuration +if [ ! -e "/etc/config/kube.conf" ]; then + if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then + #Parse the agent configmap to create a file with new custom settings. + /usr/bin/ruby2.6 tomlparser-prom-agent-config.rb + #Sourcing config environment variable file if it exists + if [ -e "side_car_fbit_config_env_var" ]; then + cat side_car_fbit_config_env_var | while read line; do + echo $line >> ~/.bashrc + done + source side_car_fbit_config_env_var + fi + fi +fi + #Parse the configmap to set the right environment variables for MDM metrics configuration for Alerting. 
if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then diff --git a/source/plugins/go/src/telemetry.go b/source/plugins/go/src/telemetry.go index 4750b4624..debe003e4 100644 --- a/source/plugins/go/src/telemetry.go +++ b/source/plugins/go/src/telemetry.go @@ -161,6 +161,10 @@ func SendContainerLogPluginMetrics(telemetryPushIntervalProperty string) { telemetryDimensions["OsmNamespaceCount"] = strconv.Itoa(osmNamespaceCount) } + telemetryDimensions["PromFbitChunkSize"] = os.Getenv("AZMON_SIDECAR_FBIT_CHUNK_SIZE") + telemetryDimensions["PromFbitBufferSize"] = os.Getenv("AZMON_SIDECAR_FBIT_BUFFER_SIZE") + telemetryDimensions["PromFbitMemBufLimit"] = os.Getenv("AZMON_SIDECAR_FBIT_MEM_BUF_LIMIT") + SendEvent(eventNameCustomPrometheusSidecarHeartbeat, telemetryDimensions) } else { From a7a2d739f1ef22a507637253d7d5d307e7afeec4 Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Tue, 15 Jun 2021 09:36:55 -0700 Subject: [PATCH 116/194] Fix vulnerabilities (#583) * test * test1 * test-2 * test-3 * 3 * 4 * test * 2 * 3 * 4 * 5 * 6 * rename gem for windows * fix * fix --- .github/workflows/pr-checker.yml | 99 ++++++++++++++++++++++++++++++++ kubernetes/linux/setup.sh | 9 ++- kubernetes/windows/setup.ps1 | 12 +++- source/plugins/go/src/go.mod | 1 + source/plugins/go/src/go.sum | 7 +++ 5 files changed, 126 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/pr-checker.yml diff --git a/.github/workflows/pr-checker.yml b/.github/workflows/pr-checker.yml new file mode 100644 index 000000000..c75e6dc24 --- /dev/null +++ b/.github/workflows/pr-checker.yml @@ -0,0 +1,99 @@ +name: pullrequest-build-and-scan +on: + pull_request: + types: [opened, synchronize, reopened] + branches: + - ci_dev + - ci_prod + paths-ignore: + - '**.md' +jobs: + LINUX-build-and-scan: + runs-on: ubuntu-latest + steps: + - name: Set-workflow-initiator + run: echo "Initiated by - ${GITHUB_ACTOR}" + - name: Set-branch-name-for-pr + if: ${{ github.event_name == 'pull_request' }} + run: echo "BRANCH_NAME=$(echo ${GITHUB_HEAD_REF} | tr / _)" >> $GITHUB_ENV + - name: Set-Env + run: echo "ENV=dev" >> $GITHUB_ENV + - name: Set-ACR-Registry + run: echo "ACR_REGISTRY=containerinsightsprod.azurecr.io" >> $GITHUB_ENV + - name: Set-ACR-Repository + run: echo "ACR_REPOSITORY=/public/azuremonitor/containerinsights/cidev" >> $GITHUB_ENV + - name: Set-image-tag-name + run: echo "IMAGE_TAG_NAME=cidev" >> $GITHUB_ENV + - name: Set-image-tag-suffix + run: echo "IMAGE_TAG_DATE=$(date +%m-%d-%Y)" >> $GITHUB_ENV + - name: Set-commit-sha + run: echo "COMMIT_SHA=${GITHUB_SHA::8}" >> $GITHUB_ENV + - name: Set-image-tag + run: echo "IMAGETAG=${ACR_REGISTRY}${ACR_REPOSITORY}:${IMAGE_TAG_NAME}-${BRANCH_NAME}-${IMAGE_TAG_DATE}-${COMMIT_SHA}" >> $GITHUB_ENV + - name: Set-image-telemetry-tag + run: echo "IMAGETAG_TELEMETRY=${IMAGE_TAG_NAME}-${BRANCH_NAME}-${IMAGE_TAG_DATE}-${COMMIT_SHA}" >> $GITHUB_ENV + - name: Set-Helm-OCI-Experimental-feature + run: echo "HELM_EXPERIMENTAL_OCI=1" >> $GITHUB_ENV + - name: Set-Helm-chart-version + run: echo "HELM_CHART_VERSION=0.0.1" >> $GITHUB_ENV + - name: Set-Helm-tag + run: echo "HELMTAG=${ACR_REGISTRY}${ACR_REPOSITORY}:${IMAGE_TAG_NAME}-chart-${BRANCH_NAME}-${HELM_CHART_VERSION}-${IMAGE_TAG_DATE}-${COMMIT_SHA}" >> $GITHUB_ENV + - name: Checkout-code + uses: actions/checkout@v2 + - name: Show-versions-On-build-machine + run: lsb_release -a && go version && helm version && docker version + - name: Install-build-dependencies + run: sudo apt-get install build-essential -y + - name: Build-source-code + run: cd 
./build/linux/ && make + - name: Create-docker-image + run: | + cd ./kubernetes/linux/ && docker build . --file Dockerfile -t $IMAGETAG --build-arg IMAGE_TAG=$IMAGETAG_TELEMETRY + - name: List-docker-images + run: docker images --digests --all + - name: Run-trivy-scanner-on-docker-image + uses: aquasecurity/trivy-action@master + with: + image-ref: "${{ env.IMAGETAG }}" + format: 'table' + severity: 'CRITICAL,HIGH' + vuln-type: 'os,library' + skip-dirs: 'opt/telegraf' + exit-code: '1' + timeout: '5m0s' + WINDOWS-build: + runs-on: windows-latest + steps: + - name: Set-workflow-initiator + run: echo ("Initiated by -" + $env:GITHUB_ACTOR) + - name: Set-branch-name-for-pr + if: ${{ github.event_name == 'pull_request' }} + run: echo ("BRANCH_NAME=" + $env:GITHUB_HEAD_REF.replace('/','_')) >> $env:GITHUB_ENV + - name: Set-Env + run: echo ("ENV=dev") >> $env:GITHUB_ENV + - name: Set-ACR-Registry + run: echo ("ACR_REGISTRY=containerinsightsprod.azurecr.io") >> $env:GITHUB_ENV + - name: Set-ACR-Repository + run: echo ("ACR_REPOSITORY=/public/azuremonitor/containerinsights/cidev") >> $env:GITHUB_ENV + - name: Set-image-tag-name + run: echo ("IMAGE_TAG_NAME=cidev-win") >> $env:GITHUB_ENV + - name: Set-image-tag-suffix + run: echo ("IMAGE_TAG_DATE="+ (Get-Date -Format "MM-dd-yyyy")) >> $env:GITHUB_ENV + - name: Set-commit-sha + run: echo ("COMMIT_SHA=" + $env:GITHUB_SHA.SubString(0,8)) >> $env:GITHUB_ENV + - name: Set-image-tag + run: echo ("IMAGETAG=" + $env:ACR_REGISTRY + $env:ACR_REPOSITORY + ":" + $env:IMAGE_TAG_NAME + "-" + $env:BRANCH_NAME + "-" + $env:IMAGE_TAG_DATE + "-" + $env:COMMIT_SHA) >> $env:GITHUB_ENV + - name: Set-image-telemetry-tag + run: echo ("IMAGETAG_TELEMETRY=" + $env:IMAGE_TAG_NAME + "-" + $env:BRANCH_NAME + "-" + $env:IMAGE_TAG_DATE + "-" + $env:COMMIT_SHA) >> $env:GITHUB_ENV + - name: Checkout-code + uses: actions/checkout@v2 + - name: Show-versions-On-build-machine + run: systeminfo && go version && docker version + - name: Build-source-code + run: cd ./build/windows/ && & .\Makefile.ps1 + - name: Create-docker-image + run: | + cd ./kubernetes/windows/ && docker build . --file Dockerfile -t $env:IMAGETAG --build-arg IMAGE_TAG=$env:IMAGETAG_TELEMETRY + - name: List-docker-images + run: docker images --digests --all + diff --git a/kubernetes/linux/setup.sh b/kubernetes/linux/setup.sh index 17cfb3f77..ad7cc2232 100644 --- a/kubernetes/linux/setup.sh +++ b/kubernetes/linux/setup.sh @@ -27,7 +27,6 @@ sudo apt-get install jq=1.5+dfsg-2 -y #used to setcaps for ruby process to read /proc/env sudo apt-get install libcap2-bin -y -#1.18 pre-release wget https://dl.influxdata.com/telegraf/releases/telegraf-1.18.0_linux_amd64.tar.gz tar -zxvf telegraf-1.18.0_linux_amd64.tar.gz @@ -63,3 +62,11 @@ rm -f $TMPDIR/envmdsd # Remove settings for cron.daily that conflict with the node's cron.daily. Since both are trying to rotate the same files # in /var/log at the same time, the rotation doesn't happen correctly and then the *.1 file is forever logged to. 
rm /etc/logrotate.d/alternatives /etc/logrotate.d/apt /etc/logrotate.d/azure-mdsd /etc/logrotate.d/rsyslog + +#Remove gemfile.lock for http_parser gem 0.6.0 +#see - https://github.com/fluent/fluentd/issues/3374 https://github.com/tmm1/http_parser.rb/issues/70 +if [ -e "/var/lib/gems/2.6.0/gems/http_parser.rb-0.6.0/Gemfile.lock" ]; then + #rename + echo "Renaming unused gemfile.lock for http_parser 0.6.0" + mv /var/lib/gems/2.6.0/gems/http_parser.rb-0.6.0/Gemfile.lock /var/lib/gems/2.6.0/gems/http_parser.rb-0.6.0/renamed_Gemfile_lock.renamed +fi diff --git a/kubernetes/windows/setup.ps1 b/kubernetes/windows/setup.ps1 index 25aad5e16..3e47b7eb2 100644 --- a/kubernetes/windows/setup.ps1 +++ b/kubernetes/windows/setup.ps1 @@ -65,6 +65,16 @@ Write-Host ('Extracting Certificate Generator Package') Expand-Archive -Path /opt/omsagentwindows/certificategenerator.zip -Destination /opt/omsagentwindows/certgenerator/ -Force Write-Host ('Finished Extracting Certificate Generator Package') +Write-Host ("Removing Install folder") + Remove-Item /installation -Recurse -Write-Host ("Removing Install folder") \ No newline at end of file +#Remove gemfile.lock for http_parser gem 0.6.0 +#see - https://github.com/fluent/fluentd/issues/3374 https://github.com/tmm1/http_parser.rb/issues/70 + +$gemfile = "\ruby26\lib\ruby\gems\2.6.0\gems\http_parser.rb-0.6.0\Gemfile.lock" +$gemfileFullPath = $Env:SYSTEMDRIVE + "\" + $gemfile +If (Test-Path -Path $gemfile ) { + Write-Host ("Renaming unused gemfile.lock for http_parser 0.6.0") + Rename-Item -Path $gemfileFullPath -NewName "renamed_Gemfile_lock.renamed" +} \ No newline at end of file diff --git a/source/plugins/go/src/go.mod b/source/plugins/go/src/go.mod index 3fd38a9bd..5b5c735e5 100644 --- a/source/plugins/go/src/go.mod +++ b/source/plugins/go/src/go.mod @@ -31,4 +31,5 @@ require ( k8s.io/api v0.0.0-20180628040859-072894a440bd // indirect k8s.io/apimachinery v0.0.0-20180621070125-103fd098999d k8s.io/client-go v8.0.0+incompatible + golang.org/x/crypto v0.0.0-20201216223049-8b5274cf687f ) diff --git a/source/plugins/go/src/go.sum b/source/plugins/go/src/go.sum index 52bb2ab04..64745749f 100644 --- a/source/plugins/go/src/go.sum +++ b/source/plugins/go/src/go.sum @@ -108,6 +108,10 @@ golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2 h1:VklqNMn3ovrHsnt90Pveol golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191206172530-e9b2fee46413 h1:ULYEB3JvPRE/IfO+9uO7vKV/xzVTO7XPAwm8xbf4w2g= golang.org/x/crypto v0.0.0-20191206172530-e9b2fee46413/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/crypto v0.0.0-20200220183623-bac4c82f6975 h1:/Tl7pH94bvbAAHBdZJT947M/+gp0+CqQXDtMRC0fseo= +golang.org/x/crypto v0.0.0-20200220183623-bac4c82f6975/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/crypto v0.0.0-20201216223049-8b5274cf687f h1:aZp0e2vLN4MToVqnjNEYEtrEA8RH8U8FN1CU7JgqsPU= +golang.org/x/crypto v0.0.0-20201216223049-8b5274cf687f/go.mod h1:jdWPYTVW3xRLrWPugEBEK3UY2ZEsg3UU495nc5E+M+I= golang.org/x/net v0.0.0-20170809000501-1c05540f6879 h1:0rFa7EaCGdQPmZVbo9F7MNF65b8dyzS6EUnXjs9Cllk= golang.org/x/net v0.0.0-20170809000501-1c05540f6879/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= @@ -120,8 +124,11 @@ golang.org/x/sys v0.0.0-20171031081856-95c657629925/go.mod h1:STP8DvDyc/dI5b8T5h golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod 
h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd h1:xhmwyvizuTgC2qz7ZlMluP20uW+C3Rm0FD/WLDX8884= golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/term v0.0.0-20201117132131-f5c789dd3221 h1:/ZHdbVpdR/jk3g30/d4yUL0JU9kksj8+F/bnQUVLGDM= +golang.org/x/term v0.0.0-20201117132131-f5c789dd3221/go.mod h1:Nr5EML6q2oocZ2LXRh80K7BxOlk5/8JxuGnuhpl+muw= golang.org/x/text v0.0.0-20170810154203-b19bf474d317 h1:WKW+OPdYPlvOTVGHuMfjnIC6yY2SI93yFB0pZ7giBmQ= golang.org/x/text v0.0.0-20170810154203-b19bf474d317/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.0 h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg= From 154c11dd0cfe99c3b065e83967324b6e561aaa72 Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Wed, 16 Jun 2021 10:14:12 -0700 Subject: [PATCH 117/194] Windows build optimization (#582) --- README.md | 26 ++++++++ kubernetes/windows/Dockerfile-dev-base-image | 43 +++++++++++++ kubernetes/windows/Dockerfile-dev-image | 45 +++++++++++++ .../build-and-publish-dev-docker-image.ps1 | 64 +++++++++++++++++++ .../dockerbuild/build-dev-base-image.ps1 | 32 ++++++++++ 5 files changed, 210 insertions(+) create mode 100644 kubernetes/windows/Dockerfile-dev-base-image create mode 100644 kubernetes/windows/Dockerfile-dev-image create mode 100644 kubernetes/windows/dockerbuild/build-and-publish-dev-docker-image.ps1 create mode 100644 kubernetes/windows/dockerbuild/build-dev-base-image.ps1 diff --git a/README.md b/README.md index 555234c61..73bf858cd 100644 --- a/README.md +++ b/README.md @@ -210,6 +210,32 @@ powershell -ExecutionPolicy bypass # switch to powershell if you are not on pow .\build-and-publish-docker-image.ps1 -image /: # trigger build code and image and publish docker hub or acr ``` +##### Developer Build optimizations +If you do not want to build the image from scratch every time you make changes during development,you can choose to build the docker images that are separated out by +* Base image and dependencies including agent bootstrap(setup.ps1) +* Agent conf and plugin changes + +To do this, the very first time you start developing you would need to execute below instructions in elevated command prompt of powershell. +This builds the base image(omsagent-win-base) with all the package dependencies +``` +cd %userprofile%\Docker-Provider\kubernetes\windows\dockerbuild # based on your repo path +docker login # if you want to publish the image to acr then login to acr via `docker login ` +powershell -ExecutionPolicy bypass # switch to powershell if you are not on powershell already +.\build-dev-base-image.ps1 # builds base image and dependencies +``` + +And then run the script to build the image consisting of code and conf changes. 
+``` +.\build-and-publish-dev-docker-image.ps1 -image /: # trigger build code and image and publish docker hub or acr +``` + +For the subsequent builds, you can just run - + +``` +.\build-and-publish-dev-docker-image.ps1 -image /: # trigger build code and image and publish docker hub or acr +``` +###### Note - If you have changes in setup.ps1 and want to test those changes, uncomment the section consisting of setup.ps1 in the Dockerfile-dev-image file. + #### Option 2 - Using WSL2 to Build the Windows agent ##### On WSL2, Build Certificate Generator Source code and Out OMS Go plugin code diff --git a/kubernetes/windows/Dockerfile-dev-base-image b/kubernetes/windows/Dockerfile-dev-base-image new file mode 100644 index 000000000..9c6ae8db8 --- /dev/null +++ b/kubernetes/windows/Dockerfile-dev-base-image @@ -0,0 +1,43 @@ +FROM mcr.microsoft.com/windows/servercore:ltsc2019 +MAINTAINER OMSContainers@microsoft.com +LABEL vendor=Microsoft\ Corp \ + com.microsoft.product="Azure Monitor for containers" + +# Do not split this into multiple RUN! +# Docker creates a layer for every RUN-Statement +RUN powershell -Command "Set-ExecutionPolicy Bypass -Scope Process -Force; iex ((New-Object System.Net.WebClient).DownloadString('https://chocolatey.org/install.ps1'))" +# Fluentd depends on cool.io whose fat gem is only available for Ruby < 2.5, so need to specify --platform ruby when install Ruby > 2.5 and install msys2 to get dev tools +RUN choco install -y ruby --version 2.6.5.1 --params "'/InstallDir:C:\ruby26'" \ +&& choco install -y msys2 --version 20200903.0.0 --params "'/NoPath /NoUpdate /InstallDir:C:\ruby26\msys64'" \ +&& choco install -y vim + +# gangams - optional MSYS2 update via ridk failing in merged docker file so skipping that since we dont need optional update +RUN refreshenv \ +&& ridk install 3 \ +&& echo gem: --no-document >> C:\ProgramData\gemrc \ +&& gem install cool.io -v 1.5.4 --platform ruby \ +&& gem install oj -v 3.3.10 \ +&& gem install json -v 2.2.0 \ +&& gem install fluentd -v 1.12.2 \ +&& gem install win32-service -v 1.0.1 \ +&& gem install win32-ipc -v 0.7.0 \ +&& gem install win32-event -v 0.6.3 \ +&& gem install windows-pr -v 1.2.6 \ +&& gem install tomlrb -v 1.3.0 \ +&& gem install gyoku -v 1.3.1 \ +&& gem sources --clear-all + +# Remove gem cache and chocolatey +RUN powershell -Command "Remove-Item -Force C:\ruby26\lib\ruby\gems\2.6.0\cache\*.gem; Remove-Item -Recurse -Force 'C:\ProgramData\chocolatey'" + +SHELL ["powershell"] + +ENV tmpdir /opt/omsagentwindows/scripts/powershell + +WORKDIR /opt/omsagentwindows/scripts/powershell + +# copy certificate generator binaries zip +COPY ./omsagentwindows/*.zip /opt/omsagentwindows/ + +COPY setup.ps1 /opt/omsagentwindows/scripts/powershell +RUN ./setup.ps1 \ No newline at end of file diff --git a/kubernetes/windows/Dockerfile-dev-image b/kubernetes/windows/Dockerfile-dev-image new file mode 100644 index 000000000..6764ef8c4 --- /dev/null +++ b/kubernetes/windows/Dockerfile-dev-image @@ -0,0 +1,45 @@ +FROM omsagent-win-base +MAINTAINER OMSContainers@microsoft.com +LABEL vendor=Microsoft\ Corp \ + com.microsoft.product="Azure Monitor for containers" + +#Uncomment below to test setup.ps1 changes +#COPY setup.ps1 /opt/omsagentwindows/scripts/powershell +#RUN ./setup.ps1 +COPY main.ps1 /opt/omsagentwindows/scripts/powershell +COPY ./omsagentwindows/installer/scripts/filesystemwatcher.ps1 /opt/omsagentwindows/scripts/powershell +COPY ./omsagentwindows/installer/scripts/livenessprobe.cmd /opt/omsagentwindows/scripts/cmd/ +COPY 
setdefaulttelegrafenvvariables.ps1 /opt/omsagentwindows/scripts/powershell + +# copy ruby scripts to /opt folder +COPY ./omsagentwindows/installer/scripts/*.rb /opt/omsagentwindows/scripts/ruby/ + +# copy out_oms.so file +COPY ./omsagentwindows/out_oms.so /opt/omsagentwindows/out_oms.so + +# copy fluent, fluent-bit and out_oms conf files +COPY ./omsagentwindows/installer/conf/fluent.conf /etc/fluent/ +# copy fluent docker and cri parser conf files +COPY ./omsagentwindows/installer/conf/fluent-cri-parser.conf /etc/fluent/ +COPY ./omsagentwindows/installer/conf/fluent-docker-parser.conf /etc/fluent/ +COPY ./omsagentwindows/installer/conf/fluent-bit.conf /etc/fluent-bit +COPY ./omsagentwindows/installer/conf/out_oms.conf /etc/omsagentwindows + +# copy telegraf conf file +COPY ./omsagentwindows/installer/conf/telegraf.conf /etc/telegraf/ + +# copy keepcert alive ruby scripts +COPY ./omsagentwindows/installer/scripts/rubyKeepCertificateAlive/*.rb /etc/fluent/plugin/ + +#Copy fluentd ruby plugins +COPY ./omsagentwindows/ruby/ /etc/fluent/plugin/ +COPY ./omsagentwindows/utils/*.rb /etc/fluent/plugin/ + +ENV AGENT_VERSION ${IMAGE_TAG} +ENV OS_TYPE "windows" +ENV APPLICATIONINSIGHTS_AUTH "NzAwZGM5OGYtYTdhZC00NThkLWI5NWMtMjA3ZjM3NmM3YmRi" +ENV AZMON_COLLECT_ENV False +ENV CI_CERT_LOCATION "C://oms.crt" +ENV CI_KEY_LOCATION "C://oms.key" + +ENTRYPOINT ["powershell", "C:\\opt\\omsagentwindows\\scripts\\powershell\\main.ps1"] diff --git a/kubernetes/windows/dockerbuild/build-and-publish-dev-docker-image.ps1 b/kubernetes/windows/dockerbuild/build-and-publish-dev-docker-image.ps1 new file mode 100644 index 000000000..0fde7f379 --- /dev/null +++ b/kubernetes/windows/dockerbuild/build-and-publish-dev-docker-image.ps1 @@ -0,0 +1,64 @@ +<# + .DESCRIPTION + Builds the Windows Agent code and Docker Image and pushes the docker image to specified repo + + .PARAMETER image + docker image. format should be /: +#> +param( + [Parameter(mandatory = $true)] + [string]$image +) + +$currentdir = $PSScriptRoot +Write-Host("current script dir : " + $currentdir + " ") + +if ($false -eq (Test-Path -Path $currentdir)) { + Write-Host("Invalid current dir : " + $currentdir + " ") -ForegroundColor Red + exit +} + +if ([string]::IsNullOrEmpty($image)) { + Write-Host "Image parameter shouldnt be null or empty" -ForegroundColor Red + exit +} + +$imageparts = $image.split(":") +if (($imageparts.Length -ne 2)){ + Write-Host "Image not in valid format. Expected format should be /:" -ForegroundColor Red + exit +} + +$imagetag = $imageparts[1].ToLower() +$imagerepo = $imageparts[0] + +if ($imagetag.StartsWith("win-") -eq $false) +{ + Write-Host "adding win- prefix image tag since its not provided" + $imagetag = "win-$imagetag" +} + +Write-Host "image tag used is :$imagetag" + +Write-Host "start:Building the cert generator and out oms code via Makefile.ps1" +..\..\..\build\windows\Makefile.ps1 +Write-Host "end:Building the cert generator and out oms code via Makefile.ps1" + +$dockerFileDir = Split-Path -Path $currentdir +Write-Host("builddir dir : " + $dockerFileDir + " ") +if ($false -eq (Test-Path -Path $dockerFileDir)) { + Write-Host("Invalid dockerFile Dir : " + $dockerFileDir + " ") -ForegroundColor Red + exit +} + +Write-Host "changing directory to DockerFile dir: $dockerFileDir" +Set-Location -Path $dockerFileDir + +$updateImage = ${imagerepo} + ":" + ${imageTag} +Write-Host "STAT:Triggering docker image build: $image" +docker build -t $updateImage --build-arg IMAGE_TAG=$imageTag -f Dockerfile-dev-image . 
+Write-Host "END:Triggering docker image build: $updateImage" + +Write-Host "STAT:pushing docker image : $updateImage" +docker push $updateImage +Write-Host "EnD:pushing docker image : $updateImage" diff --git a/kubernetes/windows/dockerbuild/build-dev-base-image.ps1 b/kubernetes/windows/dockerbuild/build-dev-base-image.ps1 new file mode 100644 index 000000000..142e20c3f --- /dev/null +++ b/kubernetes/windows/dockerbuild/build-dev-base-image.ps1 @@ -0,0 +1,32 @@ +<# + .DESCRIPTION + Builds the Docker Image locally for the server core ltsc base and installs dependencies + +#> + +$currentdir = $PSScriptRoot +Write-Host("current script dir : " + $currentdir + " ") + +if ($false -eq (Test-Path -Path $currentdir)) { + Write-Host("Invalid current dir : " + $currentdir + " ") -ForegroundColor Red + exit +} + +Write-Host "start:Building the cert generator and out oms code via Makefile.ps1" +..\..\..\build\windows\Makefile.ps1 +Write-Host "end:Building the cert generator and out oms code via Makefile.ps1" + +$dockerFileDir = Split-Path -Path $currentdir +Write-Host("builddir dir : " + $dockerFileDir + " ") +if ($false -eq (Test-Path -Path $dockerFileDir)) { + Write-Host("Invalid dockerFile Dir : " + $dockerFileDir + " ") -ForegroundColor Red + exit +} + +Write-Host "changing directory to DockerFile dir: $dockerFileDir" +Set-Location -Path $dockerFileDir + +$updateImage = "omsagent-win-base" +Write-Host "STAT:Triggering base docker image build: $updateImage" +docker build -t $updateImage -f Dockerfile-dev-base-image . +Write-Host "END:Triggering docker image build: $updateImage" \ No newline at end of file From 68e90b63e1efd7f572b586da63f67096bfb07648 Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Tue, 22 Jun 2021 22:06:35 -0700 Subject: [PATCH 118/194] fix windows build failure due to msys2 version --- kubernetes/windows/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kubernetes/windows/Dockerfile b/kubernetes/windows/Dockerfile index 997b2f310..94be59644 100644 --- a/kubernetes/windows/Dockerfile +++ b/kubernetes/windows/Dockerfile @@ -10,7 +10,7 @@ ARG IMAGE_TAG=win-ciprod06112021 RUN powershell -Command "Set-ExecutionPolicy Bypass -Scope Process -Force; iex ((New-Object System.Net.WebClient).DownloadString('https://chocolatey.org/install.ps1'))" # Fluentd depends on cool.io whose fat gem is only available for Ruby < 2.5, so need to specify --platform ruby when install Ruby > 2.5 and install msys2 to get dev tools RUN choco install -y ruby --version 2.6.5.1 --params "'/InstallDir:C:\ruby26'" \ -&& choco install -y msys2 --version 20200903.0.0 --params "'/NoPath /NoUpdate /InstallDir:C:\ruby26\msys64'" \ +&& choco install -y msys2 --version 20210604.0.0 --params "'/NoPath /NoUpdate /InstallDir:C:\ruby26\msys64'" \ && choco install -y vim # gangams - optional MSYS2 update via ridk failing in merged docker file so skipping that since we dont need optional update From cf68a4fde5c7471bfa6679703d1d77d0f98745ea Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Wed, 23 Jun 2021 10:28:49 -0700 Subject: [PATCH 119/194] Fix telegraf startup issue when endpoint is unreachable (#587) --- build/linux/installer/conf/telegraf-prom-side-car.conf | 8 +++++++- build/linux/installer/conf/telegraf-rs.conf | 8 +++++++- build/linux/installer/conf/telegraf.conf | 8 ++++++-- build/linux/installer/conf/test.json | 1 + build/linux/installer/datafiles/base_container.data | 3 +++ kubernetes/linux/main.sh | 9 ++++++--- 6 files changed, 30 insertions(+), 7 deletions(-) create mode 100644 
build/linux/installer/conf/test.json diff --git a/build/linux/installer/conf/telegraf-prom-side-car.conf b/build/linux/installer/conf/telegraf-prom-side-car.conf index b3b4ba1d3..1b6bab9f9 100644 --- a/build/linux/installer/conf/telegraf-prom-side-car.conf +++ b/build/linux/installer/conf/telegraf-prom-side-car.conf @@ -109,7 +109,7 @@ ## more about them here: ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md data_format = "json" - namedrop = ["agent_telemetry"] + namedrop = ["agent_telemetry", "file"] ############################################################################### # PROCESSOR PLUGINS # @@ -119,6 +119,12 @@ [processors.converter.fields] float = ["*"] +# Dummy plugin to test out toml parsing happens properly +[[inputs.file]] + interval = "24h" + files = ["test.json"] + data_format = "json" + #Prometheus Custom Metrics [[inputs.prometheus]] interval = "$AZMON_TELEGRAF_CUSTOM_PROM_INTERVAL" diff --git a/build/linux/installer/conf/telegraf-rs.conf b/build/linux/installer/conf/telegraf-rs.conf index ee1cf8819..0ca07f7e5 100644 --- a/build/linux/installer/conf/telegraf-rs.conf +++ b/build/linux/installer/conf/telegraf-rs.conf @@ -121,7 +121,7 @@ ## more about them here: ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md data_format = "json" - namedrop = ["agent_telemetry"] + namedrop = ["agent_telemetry", "file"] #tagdrop = ["AgentVersion","AKS_RESOURCE_ID", "ACS_RESOURCE_NAME", "Region","ClusterName","ClusterType", "Computer", "ControllerType"] [[outputs.application_insights]] @@ -538,6 +538,12 @@ #tagexclude = ["AgentVersion","AKS_RESOURCE_ID","ACS_RESOURCE_NAME", "Region", "ClusterName", "ClusterType", "Computer", "ControllerType"] # [inputs.prometheus.tagpass] +# Dummy plugin to test out toml parsing happens properly +[[inputs.file]] + interval = "24h" + files = ["test.json"] + data_format = "json" + #Prometheus Custom Metrics [[inputs.prometheus]] interval = "$AZMON_TELEGRAF_CUSTOM_PROM_INTERVAL" diff --git a/build/linux/installer/conf/telegraf.conf b/build/linux/installer/conf/telegraf.conf index 5a5bb2d8c..8b6e2ad4b 100644 --- a/build/linux/installer/conf/telegraf.conf +++ b/build/linux/installer/conf/telegraf.conf @@ -120,7 +120,7 @@ ## more about them here: ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md data_format = "json" - namedrop = ["agent_telemetry"] + namedrop = ["agent_telemetry", "file"] #tagdrop = ["AgentVersion","AKS_RESOURCE_ID", "ACS_RESOURCE_NAME", "Region","ClusterName","ClusterType", "Computer", "ControllerType"] # Output to send MDM metrics to fluent bit and then route it to fluentD @@ -425,7 +425,11 @@ # fieldpass = ["usage_active","cluster","node","host","device"] # taginclude = ["cluster","cpu","node"] - +# Dummy plugin to test out toml parsing happens properly +[[inputs.file]] + interval = "24h" + files = ["test.json"] + data_format = "json" # Read metrics about disk usage by mount point [[inputs.disk]] diff --git a/build/linux/installer/conf/test.json b/build/linux/installer/conf/test.json new file mode 100644 index 000000000..9e26dfeeb --- /dev/null +++ b/build/linux/installer/conf/test.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/build/linux/installer/datafiles/base_container.data b/build/linux/installer/datafiles/base_container.data index 88c790be3..bdacf647d 100644 --- a/build/linux/installer/datafiles/base_container.data +++ b/build/linux/installer/datafiles/base_container.data @@ -36,6 +36,7 @@ MAINTAINER: 'Microsoft Corporation' 
/etc/opt/microsoft/docker-cimprov/td-agent-bit-rs.conf; build/linux/installer/conf/td-agent-bit-rs.conf; 644; root; root /etc/opt/microsoft/docker-cimprov/azm-containers-parser.conf; build/linux/installer/conf/azm-containers-parser.conf; 644; root; root /etc/opt/microsoft/docker-cimprov/out_oms.conf; build/linux/installer/conf/out_oms.conf; 644; root; root +/etc/opt/microsoft/docker-cimprov/test.json; build/linux/installer/conf/test.json; 644; root; root /etc/opt/microsoft/docker-cimprov/telegraf.conf; build/linux/installer/conf/telegraf.conf; 644; root; root /etc/opt/microsoft/docker-cimprov/telegraf-prom-side-car.conf; build/linux/installer/conf/telegraf-prom-side-car.conf; 644; root; root /etc/opt/microsoft/docker-cimprov/telegraf-rs.conf; build/linux/installer/conf/telegraf-rs.conf; 644; root; root @@ -53,6 +54,8 @@ MAINTAINER: 'Microsoft Corporation' /opt/ConfigParseErrorLogger.rb; build/common/installer/scripts/ConfigParseErrorLogger.rb; 755; root; root /opt/tomlparser-npm-config.rb; build/linux/installer/scripts/tomlparser-npm-config.rb; 755; root; root /opt/tomlparser-osm-config.rb; build/linux/installer/scripts/tomlparser-osm-config.rb; 755; root; root +/opt/test.json; build/linux/installer/conf/test.json; 644; root; root + /etc/opt/microsoft/docker-cimprov/health/healthmonitorconfig.json; build/linux/installer/conf/healthmonitorconfig.json; 644; root; root diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index ec348bba3..1a7034d4d 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -504,17 +504,19 @@ fi if [ ! -e "/etc/config/kube.conf" ]; then if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ] && [ -e "/opt/telegraf-test-prom-side-car.conf" ]; then echo "****************Start Telegraf in Test Mode**************************" - /opt/telegraf --config /opt/telegraf-test-prom-side-car.conf -test + /opt/telegraf --config /opt/telegraf-test-prom-side-car.conf --input-filter file -test if [ $? -eq 0 ]; then mv "/opt/telegraf-test-prom-side-car.conf" "/etc/opt/microsoft/docker-cimprov/telegraf-prom-side-car.conf" + echo "Moving test conf file to telegraf side-car conf since test run succeeded" fi echo "****************End Telegraf Run in Test Mode**************************" else if [ -e "/opt/telegraf-test.conf" ]; then echo "****************Start Telegraf in Test Mode**************************" - /opt/telegraf --config /opt/telegraf-test.conf -test + /opt/telegraf --config /opt/telegraf-test.conf --input-filter file -test if [ $? -eq 0 ]; then mv "/opt/telegraf-test.conf" "/etc/opt/microsoft/docker-cimprov/telegraf.conf" + echo "Moving test conf file to telegraf daemonset conf since test run succeeded" fi echo "****************End Telegraf Run in Test Mode**************************" fi @@ -522,9 +524,10 @@ if [ ! -e "/etc/config/kube.conf" ]; then else if [ -e "/opt/telegraf-test-rs.conf" ]; then echo "****************Start Telegraf in Test Mode**************************" - /opt/telegraf --config /opt/telegraf-test-rs.conf -test + /opt/telegraf --config /opt/telegraf-test-rs.conf --input-filter file -test if [ $? 
-eq 0 ]; then mv "/opt/telegraf-test-rs.conf" "/etc/opt/microsoft/docker-cimprov/telegraf-rs.conf" + echo "Moving test conf file to telegraf replicaset conf since test run succeeded" fi echo "****************End Telegraf Run in Test Mode**************************" fi From cd2275354aafb588f0ed74cf8d747e40226e4974 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Wed, 23 Jun 2021 13:17:56 -0700 Subject: [PATCH 120/194] revert fbit tail plugins defaults to std defaults (#586) --- .../installer/scripts/td-agent-bit-conf-customizer.rb | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/build/common/installer/scripts/td-agent-bit-conf-customizer.rb b/build/common/installer/scripts/td-agent-bit-conf-customizer.rb index 82c6c1d17..f29c87407 100644 --- a/build/common/installer/scripts/td-agent-bit-conf-customizer.rb +++ b/build/common/installer/scripts/td-agent-bit-conf-customizer.rb @@ -3,9 +3,7 @@ @td_agent_bit_conf_path = "/etc/opt/microsoft/docker-cimprov/td-agent-bit.conf" -@default_service_interval = "1" -@default_buffer_chunk_size = "1" -@default_buffer_max_size = "1" +@default_service_interval = "15" @default_mem_buf_limit = "10" def is_number?(value) @@ -25,9 +23,9 @@ def substituteFluentBitPlaceHolders serviceInterval = (!interval.nil? && is_number?(interval) && interval.to_i > 0 ) ? interval : @default_service_interval serviceIntervalSetting = "Flush " + serviceInterval - tailBufferChunkSize = (!bufferChunkSize.nil? && is_number?(bufferChunkSize) && bufferChunkSize.to_i > 0) ? bufferChunkSize : @default_buffer_chunk_size + tailBufferChunkSize = (!bufferChunkSize.nil? && is_number?(bufferChunkSize) && bufferChunkSize.to_i > 0) ? bufferChunkSize : nil - tailBufferMaxSize = (!bufferMaxSize.nil? && is_number?(bufferMaxSize) && bufferMaxSize.to_i > 0) ? bufferMaxSize : @default_buffer_max_size = "1" + tailBufferMaxSize = (!bufferMaxSize.nil? && is_number?(bufferMaxSize) && bufferMaxSize.to_i > 0) ? bufferMaxSize : nil if ((!tailBufferChunkSize.nil? && tailBufferMaxSize.nil?) || (!tailBufferChunkSize.nil? && !tailBufferMaxSize.nil? && tailBufferChunkSize.to_i > tailBufferMaxSize.to_i)) puts "config:warn buffer max size must be greater or equal to chunk size" From 8c41a42043a1cf7f5635e2521dfa7660d430dbeb Mon Sep 17 00:00:00 2001 From: David Michelman Date: Thu, 1 Jul 2021 13:40:41 -0700 Subject: [PATCH 121/194] fixed another bug (#593) --- source/plugins/ruby/in_kube_nodes.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index ffc11de55..ebfa903fd 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -559,10 +559,10 @@ def clean_cache() end end - nodes_to_remove.each do node_name + nodes_to_remove.each {|node_name| @cacheHash.delete(node_name) @timeAdded.delete(node_name) - end + } end end end # NodeCache From 00f1a0dbd0d835b085e4b1e0f55de22bc65545db Mon Sep 17 00:00:00 2001 From: bragi92 Date: Fri, 9 Jul 2021 16:46:41 +0000 Subject: [PATCH 122/194] feat: add new metrics to MDM for allocatable % calculation of cpu and memory usage (#584) * feat: allocatable cpu and memory % metrics for MDM * maybe * linux is working * windwos.... 
* some more * comment * better * syntax * ruby * revert omsagent.yaml * comments * pr feedback * pr feedback * testing msys2 version update * better --- kubernetes/windows/Dockerfile-dev-base-image | 2 +- .../plugins/ruby/CAdvisorMetricsAPIClient.rb | 5 + source/plugins/ruby/MdmMetricsGenerator.rb | 22 +++- source/plugins/ruby/constants.rb | 3 + source/plugins/ruby/filter_cadvisor2mdm.rb | 68 +++++++++-- source/plugins/ruby/kubelet_utils.rb | 108 ++++++++++++++++++ 6 files changed, 194 insertions(+), 14 deletions(-) diff --git a/kubernetes/windows/Dockerfile-dev-base-image b/kubernetes/windows/Dockerfile-dev-base-image index 9c6ae8db8..0081f9c53 100644 --- a/kubernetes/windows/Dockerfile-dev-base-image +++ b/kubernetes/windows/Dockerfile-dev-base-image @@ -8,7 +8,7 @@ LABEL vendor=Microsoft\ Corp \ RUN powershell -Command "Set-ExecutionPolicy Bypass -Scope Process -Force; iex ((New-Object System.Net.WebClient).DownloadString('https://chocolatey.org/install.ps1'))" # Fluentd depends on cool.io whose fat gem is only available for Ruby < 2.5, so need to specify --platform ruby when install Ruby > 2.5 and install msys2 to get dev tools RUN choco install -y ruby --version 2.6.5.1 --params "'/InstallDir:C:\ruby26'" \ -&& choco install -y msys2 --version 20200903.0.0 --params "'/NoPath /NoUpdate /InstallDir:C:\ruby26\msys64'" \ +&& choco install -y msys2 --version 20210604.0.0 --params "'/NoPath /NoUpdate /InstallDir:C:\ruby26\msys64'" \ && choco install -y vim # gangams - optional MSYS2 update via ridk failing in merged docker file so skipping that since we dont need optional update diff --git a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb index 10720752d..da6e94f5f 100644 --- a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb +++ b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb @@ -81,6 +81,11 @@ def getSummaryStatsFromCAdvisor(winNode) return getResponse(winNode, relativeUri) end + def getCongifzCAdvisor(winNode: nil) + relativeUri = "/configz" + return getResponse(winNode, relativeUri) + end + def getAllMetricsCAdvisor(winNode: nil) relativeUri = "/metrics/cadvisor" return getResponse(winNode, relativeUri) diff --git a/source/plugins/ruby/MdmMetricsGenerator.rb b/source/plugins/ruby/MdmMetricsGenerator.rb index 73cf19fac..0858990da 100644 --- a/source/plugins/ruby/MdmMetricsGenerator.rb +++ b/source/plugins/ruby/MdmMetricsGenerator.rb @@ -37,6 +37,12 @@ class MdmMetricsGenerator Constants::MEMORY_WORKING_SET_BYTES => Constants::MDM_NODE_MEMORY_WORKING_SET_PERCENTAGE, } + @@node_metric_name_metric_allocatable_percentage_name_hash = { + Constants::CPU_USAGE_MILLI_CORES => Constants::MDM_NODE_CPU_USAGE_ALLOCATABLE_PERCENTAGE, + Constants::MEMORY_RSS_BYTES => Constants::MDM_NODE_MEMORY_RSS_ALLOCATABLE_PERCENTAGE, + Constants::MEMORY_WORKING_SET_BYTES => Constants::MDM_NODE_MEMORY_WORKING_SET_ALLOCATABLE_PERCENTAGE, + } + @@container_metric_name_metric_percentage_name_hash = { Constants::CPU_USAGE_MILLI_CORES => Constants::MDM_CONTAINER_CPU_UTILIZATION_METRIC, Constants::CPU_USAGE_NANO_CORES => Constants::MDM_CONTAINER_CPU_UTILIZATION_METRIC, @@ -526,7 +532,7 @@ def getContainerResourceUtilizationThresholds return metric_threshold_hash end - def getNodeResourceMetricRecords(record, metric_name, metric_value, percentage_metric_value) + def getNodeResourceMetricRecords(record, metric_name, metric_value, percentage_metric_value, allocatable_percentage_metric_value) records = [] begin custommetricrecord = 
MdmAlertTemplates::Node_resource_metrics_template % { @@ -554,6 +560,20 @@ def getNodeResourceMetricRecords(record, metric_name, metric_value, percentage_m } records.push(Yajl::Parser.parse(StringIO.new(additional_record))) end + + if !allocatable_percentage_metric_value.nil? + additional_record = MdmAlertTemplates::Node_resource_metrics_template % { + timestamp: record["Timestamp"], + metricName: @@node_metric_name_metric_allocatable_percentage_name_hash[metric_name], + hostvalue: record["Host"], + objectnamevalue: record["ObjectName"], + instancenamevalue: record["InstanceName"], + metricminvalue: allocatable_percentage_metric_value, + metricmaxvalue: allocatable_percentage_metric_value, + metricsumvalue: allocatable_percentage_metric_value, + } + records.push(Yajl::Parser.parse(StringIO.new(additional_record))) + end rescue => errorStr @log.info "Error in getNodeResourceMetricRecords: #{errorStr}" ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index c037c99f6..c40d4c357 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -60,6 +60,9 @@ class Constants MDM_NODE_CPU_USAGE_PERCENTAGE = "cpuUsagePercentage" MDM_NODE_MEMORY_RSS_PERCENTAGE = "memoryRssPercentage" MDM_NODE_MEMORY_WORKING_SET_PERCENTAGE = "memoryWorkingSetPercentage" + MDM_NODE_CPU_USAGE_ALLOCATABLE_PERCENTAGE = "cpuUsageAllocatablePercentage" + MDM_NODE_MEMORY_RSS_ALLOCATABLE_PERCENTAGE = "memoryRssAllocatablePercentage" + MDM_NODE_MEMORY_WORKING_SET_ALLOCATABLE_PERCENTAGE = "memoryWorkingSetAllocatablePercentage" CONTAINER_TERMINATED_RECENTLY_IN_MINUTES = 5 OBJECT_NAME_K8S_CONTAINER = "K8SContainer" diff --git a/source/plugins/ruby/filter_cadvisor2mdm.rb b/source/plugins/ruby/filter_cadvisor2mdm.rb index 9c6b661b0..6bafa372a 100644 --- a/source/plugins/ruby/filter_cadvisor2mdm.rb +++ b/source/plugins/ruby/filter_cadvisor2mdm.rb @@ -66,8 +66,10 @@ def start # initialize cpu and memory limit if @process_incoming_stream @cpu_capacity = 0.0 + @cpu_allocatable = 0.0 @memory_capacity = 0.0 - ensure_cpu_memory_capacity_set + @memory_allocatable = 0.0 + ensure_cpu_memory_capacity_and_allocatable_set @containerCpuLimitHash = {} @containerMemoryLimitHash = {} @containerResourceDimensionHash = {} @@ -167,6 +169,7 @@ def filter(tag, time, record) counter_name = JSON.parse(record["json_Collections"])[0]["CounterName"] percentage_metric_value = 0.0 + allocatable_percentage_metric_value = 0.0 metric_value = JSON.parse(record["json_Collections"])[0]["Value"] if object_name == Constants::OBJECT_NAME_K8S_NODE && @metrics_to_collect_hash.key?(counter_name.downcase) @@ -176,39 +179,62 @@ def filter(tag, time, record) metric_value /= 1000000 #cadvisor record is in nanocores. 
Convert to mc if @@controller_type.downcase == "replicaset" target_node_cpu_capacity_mc = @NodeCache.cpu.get_capacity(record["Host"]) / 1000000 + target_node_cpu_allocatable_mc = 0.0 # We do not need this value in the replicaset else target_node_cpu_capacity_mc = @cpu_capacity + target_node_cpu_allocatable_mc = @cpu_allocatable end - @log.info "Metric_value: #{metric_value} CPU Capacity #{target_node_cpu_capacity_mc}" + @log.info "Metric_value: #{metric_value} CPU Capacity #{target_node_cpu_capacity_mc} CPU Allocatable #{target_node_cpu_allocatable_mc} " if target_node_cpu_capacity_mc != 0.0 percentage_metric_value = (metric_value) * 100 / target_node_cpu_capacity_mc end + if target_node_cpu_allocatable_mc != 0.0 + allocatable_percentage_metric_value = (metric_value) * 100 / target_node_cpu_allocatable_mc + else + allocatable_percentage_metric_value = 0.0 + end end if counter_name.start_with?("memory") metric_name = counter_name if @@controller_type.downcase == "replicaset" target_node_mem_capacity = @NodeCache.mem.get_capacity(record["Host"]) + target_node_mem_allocatable = 0.0 # We do not need this value in the replicaset else target_node_mem_capacity = @memory_capacity + target_node_mem_allocatable = @memory_allocatable end - @log.info "Metric_value: #{metric_value} Memory Capacity #{target_node_mem_capacity}" + + @log.info "Metric_value: #{metric_value} Memory Capacity #{target_node_mem_capacity} Memory Allocatable #{target_node_mem_allocatable}" if target_node_mem_capacity != 0.0 percentage_metric_value = metric_value * 100 / target_node_mem_capacity end + + if target_node_mem_allocatable != 0.0 + allocatable_percentage_metric_value = metric_value * 100 / target_node_mem_allocatable + else + allocatable_percentage_metric_value = 0.0 + end end - @log.info "percentage_metric_value for metric: #{metric_name} for instance: #{record["Host"]} percentage: #{percentage_metric_value}" + @log.info "percentage_metric_value for metric: #{metric_name} for instance: #{record["Host"]} percentage: #{percentage_metric_value} allocatable_percentage: #{allocatable_percentage_metric_value}" - # do some sanity checking. Do we want this? - if percentage_metric_value > 100.0 or percentage_metric_value < 0.0 + # do some sanity checking.
+ if percentage_metric_value > 100.0 telemetryProperties = {} telemetryProperties["Computer"] = record["Host"] telemetryProperties["MetricName"] = metric_name telemetryProperties["MetricPercentageValue"] = percentage_metric_value ApplicationInsightsUtility.sendCustomEvent("ErrorPercentageOutOfBounds", telemetryProperties) end + if allocatable_percentage_metric_value > 100.0 + telemetryProperties = {} + telemetryProperties["Computer"] = record["Host"] + telemetryProperties["MetricName"] = metric_name + telemetryProperties["MetricAllocatablePercentageValue"] = allocatable_percentage_metric_value + ApplicationInsightsUtility.sendCustomEvent("ErrorPercentageOutOfBounds", telemetryProperties) + end - return MdmMetricsGenerator.getNodeResourceMetricRecords(record, metric_name, metric_value, percentage_metric_value) + return MdmMetricsGenerator.getNodeResourceMetricRecords(record, metric_name, metric_value, percentage_metric_value, allocatable_percentage_metric_value) elsif object_name == Constants::OBJECT_NAME_K8S_CONTAINER && @metrics_to_collect_hash.key?(counter_name.downcase) instanceName = record["InstanceName"] metricName = counter_name @@ -304,13 +330,20 @@ def filterPVInsightsMetrics(record) end end - def ensure_cpu_memory_capacity_set - if @cpu_capacity != 0.0 && @memory_capacity != 0.0 - @log.info "CPU And Memory Capacity are already set" + def ensure_cpu_memory_capacity_and_allocatable_set + @@controller_type = ENV["CONTROLLER_TYPE"] + + if @cpu_capacity != 0.0 && @memory_capacity != 0.0 && @@controller_type.downcase == "replicaset" + @log.info "CPU And Memory Capacity are already set and their values are as follows @cpu_capacity : #{@cpu_capacity}, @memory_capacity: #{@memory_capacity}" + return + end + + if @@controller_type.downcase == "daemonset" && @cpu_capacity != 0.0 && @memory_capacity != 0.0 && @cpu_allocatable != 0.0 && @memory_allocatable != 0.0 + @log.info "CPU And Memory Capacity are already set and their values are as follows @cpu_capacity : #{@cpu_capacity}, @memory_capacity: #{@memory_capacity}" + @log.info "CPU And Memory Allocatable are already set and their values are as follows @cpu_allocatable : #{@cpu_allocatable}, @memory_allocatable: #{@memory_allocatable}" return end - @@controller_type = ENV["CONTROLLER_TYPE"] if @@controller_type.downcase == "replicaset" @log.info "ensure_cpu_memory_capacity_set @cpu_capacity #{@cpu_capacity} @memory_capacity #{@memory_capacity}" @@ -354,13 +387,24 @@ def ensure_cpu_memory_capacity_set # cpu_capacity and memory_capacity keep initialized value of 0.0 @log.error "Error getting capacity_from_kubelet: cpu_capacity and memory_capacity" end + + allocatable_from_kubelet = KubeletUtils.get_node_allocatable(@cpu_capacity, @memory_capacity) + + # Error handling in case /configz endpoint fails + if !allocatable_from_kubelet.nil? 
&& allocatable_from_kubelet.length > 1 + @cpu_allocatable = allocatable_from_kubelet[0] + @memory_allocatable = allocatable_from_kubelet[1] + else + # cpu_allocatable and memory_allocatable keep initialized value of 0.0 + @log.error "Error getting allocatable_from_kubelet: cpu_allocatable and memory_allocatable" + end end end def filter_stream(tag, es) new_es = Fluent::MultiEventStream.new begin - ensure_cpu_memory_capacity_set + ensure_cpu_memory_capacity_and_allocatable_set # Getting container limits hash if @process_incoming_stream @containerCpuLimitHash, @containerMemoryLimitHash, @containerResourceDimensionHash = KubeletUtils.get_all_container_limits diff --git a/source/plugins/ruby/kubelet_utils.rb b/source/plugins/ruby/kubelet_utils.rb index 22bc87c0e..e31407b54 100644 --- a/source/plugins/ruby/kubelet_utils.rb +++ b/source/plugins/ruby/kubelet_utils.rb @@ -41,6 +41,114 @@ def get_node_capacity end end + def get_node_allocatable(cpu_capacity, memory_capacity) + begin + if cpu_capacity == 0.0 || memory_capacity == 0.0 + @log.error "kubelet_utils.rb::get_node_allocatable - cpu_capacity or memory_capacity values not set. Hence we cannot calculate allocatable values" + end + + cpu_allocatable = 1.0 + memory_allocatable = 1.0 + + allocatable_response = CAdvisorMetricsAPIClient.getConfigzCAdvisor(winNode: nil) + parsed_response = JSON.parse(allocatable_response.body) + + begin + kubereserved_cpu = parsed_response["kubeletconfig"]["kubeReserved"]["cpu"] + if kubereserved_cpu.nil? || kubereserved_cpu == "" + kubereserved_cpu = "0" + end + @log.info "get_node_allocatable::kubereserved_cpu #{kubereserved_cpu}" + rescue => errorStr + @log.error "Error in get_node_allocatable::kubereserved_cpu: #{errorStr}" + kubereserved_cpu = "0" + ApplicationInsightsUtility.sendExceptionTelemetry("Error in get_node_allocatable::kubereserved_cpu: #{errorStr}") + end + + begin + kubereserved_memory = parsed_response["kubeletconfig"]["kubeReserved"]["memory"] + if kubereserved_memory.nil? || kubereserved_memory == "" + kubereserved_memory = "0" + end + @log.info "get_node_allocatable::kubereserved_memory #{kubereserved_memory}" + rescue => errorStr + @log.error "Error in get_node_allocatable::kubereserved_memory: #{errorStr}" + kubereserved_memory = "0" + ApplicationInsightsUtility.sendExceptionTelemetry("Error in get_node_allocatable::kubereserved_memory: #{errorStr}") + end + begin + systemReserved_cpu = parsed_response["kubeletconfig"]["systemReserved"]["cpu"] + if systemReserved_cpu.nil? || systemReserved_cpu == "" + systemReserved_cpu = "0" + end + @log.info "get_node_allocatable::systemReserved_cpu #{systemReserved_cpu}" + rescue => errorStr + # this will likely always reach this condition for AKS ~ only applicable for hybrid + MDM combination + @log.error "Error in get_node_allocatable::systemReserved_cpu: #{errorStr}" + systemReserved_cpu = "0" + ApplicationInsightsUtility.sendExceptionTelemetry("Error in get_node_allocatable::systemReserved_cpu: #{errorStr}") + end + + begin + explicitlyReserved_cpu = parsed_response["kubeletconfig"]["reservedCPUs"] + if explicitlyReserved_cpu.nil?
|| explicitlyReserved_cpu == "" + explicitlyReserved_cpu = "0" + end + @log.info "get_node_allocatable::explicitlyReserved_cpu #{explicitlyReserved_cpu}" + rescue => errorStr + # this will likely always reach this condition for AKS ~ only applicable for hybrid + MDM combination + @log.error "Error in get_node_allocatable::explicitlyReserved_cpu: #{errorStr}" + explicitlyReserved_cpu = "0" + ApplicationInsightsUtility.sendExceptionTelemetry("Error in get_node_allocatable::explicitlyReserved_cpu: #{errorStr}") + end + + begin + systemReserved_memory = parsed_response["kubeletconfig"]["systemReserved"]["memory"] + if systemReserved_memory.nil? || systemReserved_memory == "" + systemReserved_memory = "0" + end + @log.info "get_node_allocatable::systemReserved_memory #{systemReserved_memory}" + rescue => errorStr + @log.error "Error in get_node_allocatable::systemReserved_memory: #{errorStr}" + systemReserved_memory = "0" + ApplicationInsightsUtility.sendExceptionTelemetry("Error in get_node_allocatable::systemReserved_memory: #{errorStr}") + end + + begin + evictionHard_memory = parsed_response["kubeletconfig"]["evictionHard"]["memory.available"] + if evictionHard_memory.nil? || evictionHard_memory == "" + evictionHard_memory = "0" + end + @log.info "get_node_allocatable::evictionHard_memory #{evictionHard_memory}" + rescue => errorStr + @log.error "Error in get_node_allocatable::evictionHard_memory: #{errorStr}" + evictionHard_memory = "0" + ApplicationInsightsUtility.sendExceptionTelemetry("Error in get_node_allocatable::evictionHard_memory: #{errorStr}") + end + + # do calculation in nanocore since that's what KubernetesApiClient.getMetricNumericValue expects + cpu_capacity_number = cpu_capacity.to_i * 1000.0 ** 2 + # subtract to get allocatable. Formula : Allocatable = Capacity - ( kube reserved + system reserved + eviction threshold ) + # https://kubernetes.io/docs/tasks/administer-cluster/reserve-compute-resources/#node-allocatable + if KubernetesApiClient.getMetricNumericValue("cpu", explicitlyReserved_cpu) > 0 + cpu_allocatable = cpu_capacity_number - KubernetesApiClient.getMetricNumericValue("cpu", explicitlyReserved_cpu) + else + cpu_allocatable = cpu_capacity_number - (KubernetesApiClient.getMetricNumericValue("cpu", kubereserved_cpu) + KubernetesApiClient.getMetricNumericValue("cpu", systemReserved_cpu)) + end + # convert back to units similar to what we get for capacity + cpu_allocatable = cpu_allocatable / (1000.0 ** 2) + @log.info "CPU Allocatable #{cpu_allocatable}" + + memory_allocatable = memory_capacity - (KubernetesApiClient.getMetricNumericValue("memory", kubereserved_memory) + KubernetesApiClient.getMetricNumericValue("memory", systemReserved_memory) + KubernetesApiClient.getMetricNumericValue("memory", evictionHard_memory)) + @log.info "Memory Allocatable #{memory_allocatable}" + + return [cpu_allocatable, memory_allocatable] + rescue => errorStr + @log.info "Error get_node_allocatable: #{errorStr}" + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + end + def get_all_container_limits begin @log.info "in get_all_container_limits..."
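The allocatable computation added by patch 122 above follows the upstream formula Allocatable = Capacity - (kube reserved + system reserved + hard eviction threshold), with reservedCPUs taking precedence on the CPU side when it is set. A minimal sketch of the same arithmetic in Go, using made-up reservations for a hypothetical 4-core / 16 GiB node (the plugin itself parses these values out of the kubelet /configz response):

```go
package main

import "fmt"

func main() {
	// Capacities as the Ruby plugin sees them: CPU in millicores, memory in bytes.
	cpuCapacityMc := 4000.0
	memCapacityBytes := 16.0 * 1024 * 1024 * 1024

	// Hypothetical kubeletconfig reservations ("100m", "2Gi", "750Mi" after parsing).
	kubeReservedCPUNano := 100.0 * 1e6 // 100m expressed in nanocores
	systemReservedCPUNano := 0.0       // often unset, hence the "0" fallbacks in the patch
	kubeReservedMemBytes := 2.0 * 1024 * 1024 * 1024
	systemReservedMemBytes := 0.0
	evictionHardMemBytes := 750.0 * 1024 * 1024

	// Allocatable = Capacity - (kube reserved + system reserved + eviction threshold).
	// CPU math happens in nanocores and converts back to millicores, as in the patch;
	// if kubelet pins reservedCPUs explicitly, that value is subtracted instead.
	cpuCapacityNano := cpuCapacityMc * 1e6
	cpuAllocatableMc := (cpuCapacityNano - (kubeReservedCPUNano + systemReservedCPUNano)) / 1e6
	memAllocatableBytes := memCapacityBytes - (kubeReservedMemBytes + systemReservedMemBytes + evictionHardMemBytes)

	fmt.Printf("cpu allocatable: %.0f mc\n", cpuAllocatableMc)          // 3900 mc
	fmt.Printf("memory allocatable: %.0f bytes\n", memAllocatableBytes) // ~13.3 GiB
}
```

Note that the eviction threshold is subtracted only on the memory side, which matches the Ruby code: evictionHard_memory feeds the memory subtraction but not the CPU one.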
From e1f9978677fd37953aa4d5af24fb763010b0de05 Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Mon, 12 Jul 2021 12:46:23 -0700 Subject: [PATCH 123/194] update adx sdk for perf issue (#601) --- source/plugins/go/src/go.mod | 2 +- source/plugins/go/src/go.sum | 6 ++++++ source/plugins/go/src/oms.go | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/source/plugins/go/src/go.mod b/source/plugins/go/src/go.mod index 5b5c735e5..c3e6c2044 100644 --- a/source/plugins/go/src/go.mod +++ b/source/plugins/go/src/go.mod @@ -4,7 +4,7 @@ go 1.14 require ( code.cloudfoundry.org/clock v1.0.1-0.20200131002207-86534f4ca3a5 // indirect - github.com/Azure/azure-kusto-go v0.1.4-0.20200427191510-041d4ed55f86 + github.com/Azure/azure-kusto-go v0.3.2 github.com/Azure/go-autorest/autorest/azure/auth v0.4.2 github.com/fluent/fluent-bit-go v0.0.0-20171103221316-c4a158a6e3a7 github.com/ghodss/yaml v0.0.0-20150909031657-73d445a93680 // indirect diff --git a/source/plugins/go/src/go.sum b/source/plugins/go/src/go.sum index 64745749f..7e8b3d765 100644 --- a/source/plugins/go/src/go.sum +++ b/source/plugins/go/src/go.sum @@ -5,9 +5,13 @@ github.com/Azure/azure-kusto-go v0.1.3 h1:0u+YqfIvwj5PHd+moXwtlxVePt8xTLU1ixM8Q6 github.com/Azure/azure-kusto-go v0.1.3/go.mod h1:55hwXJ3PaahmWZFP7VC4+PlgsSUuetSA30rFtYFabfc= github.com/Azure/azure-kusto-go v0.1.4-0.20200427191510-041d4ed55f86 h1:vyhCediIKg1gZ9H/kMcutU8F8BFNhxLk76Gti8UAOzo= github.com/Azure/azure-kusto-go v0.1.4-0.20200427191510-041d4ed55f86/go.mod h1:55hwXJ3PaahmWZFP7VC4+PlgsSUuetSA30rFtYFabfc= +github.com/Azure/azure-kusto-go v0.3.2 h1:XpS9co6GvEDl2oICF9HsjEsQVwEpRK6wbNWb9Z+uqsY= +github.com/Azure/azure-kusto-go v0.3.2/go.mod h1:wd50n4qlsSxh+G4f80t+Fnl2ShK9AcXD+lMOstiKuYo= github.com/Azure/azure-pipeline-go v0.1.8/go.mod h1:XA1kFWRVhSK+KNFiOhfv83Fv8L9achrP7OxIzeTn1Yg= github.com/Azure/azure-pipeline-go v0.2.1 h1:OLBdZJ3yvOn2MezlWvbrBMTEUQC72zAftRZOMdj5HYo= github.com/Azure/azure-pipeline-go v0.2.1/go.mod h1:UGSo8XybXnIGZ3epmeBw7Jdz+HiUVpqIlpz/HKHylF4= +github.com/Azure/azure-sdk-for-go v44.1.0+incompatible h1:l1UGvaaoMCUwVGUauvHzeB4t+Y0yPX5iJwBhzc0LqyE= +github.com/Azure/azure-sdk-for-go v44.1.0+incompatible/go.mod h1:9XXNKU+eRnpl9moKnB4QOLf1HestfXbmab5FXxiDBjc= github.com/Azure/azure-storage-blob-go v0.8.0 h1:53qhf0Oxa0nOjgbDeeYPUeyiNmafAFEY95rZLK0Tj6o= github.com/Azure/azure-storage-blob-go v0.8.0/go.mod h1:lPI3aLPpuLTeUwh1sViKXFxwl2B6teiRqI0deQUvsw0= github.com/Azure/azure-storage-queue-go v0.0.0-20191125232315-636801874cdd h1:b3wyxBl3vvr15tUAziPBPK354y+LSdfPCpex5oBttHo= @@ -73,6 +77,7 @@ github.com/json-iterator/go v0.0.0-20180612202835-f2b4162afba3/go.mod h1:+SdeFBv github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= github.com/mattn/go-ieproxy v0.0.0-20190610004146-91bb50d98149 h1:HfxbT6/JcvIljmERptWhwa8XzP7H3T+Z2N26gTsaDaA= github.com/mattn/go-ieproxy v0.0.0-20190610004146-91bb50d98149/go.mod h1:31jz6HNzdxOmlERGGEc4v/dMssOfmp2p5bT/okiKFFc= @@ -116,6 +121,7 @@ golang.org/x/net v0.0.0-20170809000501-1c05540f6879 h1:0rFa7EaCGdQPmZVbo9F7MNF65 golang.org/x/net v0.0.0-20170809000501-1c05540f6879/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net 
v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200421231249-e086a090c8fd h1:QPwSajcTUrFriMF1nJ3XzgoqakqQEsnZf9LdXdi2nkI= golang.org/x/net v0.0.0-20200421231249-e086a090c8fd/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= diff --git a/source/plugins/go/src/oms.go b/source/plugins/go/src/oms.go index 25f364c55..217ba1efc 100644 --- a/source/plugins/go/src/oms.go +++ b/source/plugins/go/src/oms.go @@ -1285,7 +1285,7 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { //ADXFlushMutex.Lock() //defer ADXFlushMutex.Unlock() //MultiJSON support is not there yet - if ingestionErr := ADXIngestor.FromReader(ctx, r, ingest.IngestionMappingRef("ContainerLogV2Mapping", ingest.JSON), ingest.FileFormat(ingest.JSON)); ingestionErr != nil { + if _, ingestionErr := ADXIngestor.FromReader(ctx, r, ingest.IngestionMappingRef("ContainerLogV2Mapping", ingest.JSON), ingest.FileFormat(ingest.JSON)); ingestionErr != nil { Log("Error when streaming to ADX Ingestion: %s", ingestionErr.Error()) //ADXIngestor = nil //not required as per ADX team. Will keep it to indicate that we tried this approach From c9ade1ba51672292a052b5697664781de7bed2c0 Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Mon, 12 Jul 2021 20:10:03 -0700 Subject: [PATCH 124/194] remove md check --- .github/workflows/pr-checker.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/pr-checker.yml b/.github/workflows/pr-checker.yml index c75e6dc24..f3bdb27e8 100644 --- a/.github/workflows/pr-checker.yml +++ b/.github/workflows/pr-checker.yml @@ -5,8 +5,6 @@ on: branches: - ci_dev - ci_prod - paths-ignore: - - '**.md' jobs: LINUX-build-and-scan: runs-on: ubuntu-latest From 6e2732e2896e72efbd3948e217b34ee5bbc2aff0 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 13 Jul 2021 06:48:55 -0700 Subject: [PATCH 125/194] Gangams/release notes update for hotfix (#596) * release notes updates * release notes updates for ciprod06112021-1 --- ReleaseNotes.md | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/ReleaseNotes.md b/ReleaseNotes.md index 266dadf1c..423161236 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -11,21 +11,28 @@ additional questions or comments. 
Note : The agent version(s) below has dates (ciprod), which indicate the agent build dates (not release dates) +### 07/02/2021 - +##### Version microsoft/oms:ciprod06112021-1 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod06112021-1 (linux) +##### Version microsoft/oms:win-ciprod06112021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod06112021 (windows) +##### Code change log +- Hotfix for crash in clean_cache in in_kube_node_inventory plugin +- We didn't rebuild windows container, so the image version for windows container stays the same as last release (ciprod:win-ciprod06112021) before this hotfix + ### 06/11/2021 - ##### Version microsoft/oms:ciprod06112021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod06112021 (linux) ##### Version microsoft/oms:win-ciprod06112021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod06112021 (windows) - Linux Agent - - Removal of base omsagent dependency + - Removal of base omsagent dependency - Using MDSD version 1.10.1 as base agent for all the supported LA data types - Ruby version upgrade to 2.6 i.e. same version as windows agent - - Upgrade FluentD gem version to 1.12.2 + - Upgrade FluentD gem version to 1.12.2 - All the Ruby Fluentd Plugins upgraded to v1 as per Fluentd guidance - Fluent-bit tail plugin Mem_Buf_limit is configurable via ConfigMap - Windows Agent - CA cert changes for airgapped clouds - Send perf metrics to MDM from windows daemonset - FluentD gem version upgrade from 1.10.2 to 1.12.2 to make same version as Linux Agent - - Doc updates + - Doc updates - README updates related to OSM preview release for Arc K8s - README updates related to recommended alerts @@ -63,7 +70,7 @@ Note : The agent version(s) below has dates (ciprod), which indicate t ##### Version microsoft/oms:ciprod03262021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod03262021 (linux) ##### Version microsoft/oms:win-ciprod03262021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod03262021 (windows) ##### Code change log -- Started collecting new metric - kubelet running pods count +- Started collecting new metric - kubelet running pods count - Onboarding script fixes to add explicit json output - Proxy and token updates for ARC - Doc updates for Microsoft charts repo release @@ -94,13 +101,13 @@ Note : The agent version(s) below has dates (ciprod), which indicate t ##### Version microsoft/oms:win-ciprod01112021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod01112021 (windows) ##### Code change log - Fixes for Linux Agent Replicaset Pod OOMing issue -- Update fluentbit (1.14.2 to 1.6.8) for the Linux Daemonset +- Update fluentbit (1.14.2 to 1.6.8) for the Linux Daemonset - Make Fluentbit settings: log_flush_interval_secs, tail_buf_chunksize_megabytes and tail_buf_maxsize_megabytes configurable via configmap - Support for PV inventory collection - Removal of Custom metric region check for Public cloud regions and update to use cloud environment variable to determine the custom metric support - For daemonset pods, add the dnsconfig to use ndots: 3 from ndots:5 to optimize the number of DNS API calls made - Fix for inconsistency in the collection container environment variables for the pods which has high number of containers -- Fix for disabling of std{out;err} log_collection_settings via configmap issue in windows daemonset +- Fix for disabling of std{out;err} log_collection_settings via configmap issue 
in windows daemonset - Update to use workspace key from mount file rather than environment variable for windows daemonset agent - Remove per container info logs in the container inventory - Enable ADX route for windows container logs From 6df299f9658c8397ea48948b3c614de629acefb2 Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Wed, 14 Jul 2021 16:39:13 -0700 Subject: [PATCH 126/194] Cherry picking hotfix changes to ci_dev (#605) --- kubernetes/windows/Dockerfile | 2 +- kubernetes/windows/main.ps1 | 55 ++++++++++++++++++++--------------- 2 files changed, 32 insertions(+), 25 deletions(-) diff --git a/kubernetes/windows/Dockerfile b/kubernetes/windows/Dockerfile index 94be59644..0ba64cd75 100644 --- a/kubernetes/windows/Dockerfile +++ b/kubernetes/windows/Dockerfile @@ -3,7 +3,7 @@ MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="Azure Monitor for containers" -ARG IMAGE_TAG=win-ciprod06112021 +ARG IMAGE_TAG=win-ciprod06112021-2 # Do not split this into multiple RUN! # Docker creates a layer for every RUN-Statement diff --git a/kubernetes/windows/main.ps1 b/kubernetes/windows/main.ps1 index bc053b0d6..1bb9a3468 100644 --- a/kubernetes/windows/main.ps1 +++ b/kubernetes/windows/main.ps1 @@ -140,7 +140,7 @@ function Set-EnvironmentVariables { if ($aiKeyURl) { $aiKeyFetched = "" # retry up to 5 times - for( $i = 1; $i -le 4; $i++) { + for ( $i = 1; $i -le 4; $i++) { try { $response = Invoke-WebRequest -uri $aiKeyURl -UseBasicParsing -TimeoutSec 5 -ErrorAction:Stop @@ -229,6 +229,24 @@ function Set-EnvironmentVariables { Write-Host "Failed to set environment variable HOSTNAME for target 'machine' since it is either null or empty" } + $nodeIp = [System.Environment]::GetEnvironmentVariable("NODE_IP", "process") + if (![string]::IsNullOrEmpty($nodeIp)) { + [System.Environment]::SetEnvironmentVariable("NODE_IP", $nodeIp, "machine") + Write-Host "Successfully set environment variable NODE_IP - $($nodeIp) for target 'machine'..." + } + else { + Write-Host "Failed to set environment variable NODE_IP for target 'machine' since it is either null or empty" + } + + $agentVersion = [System.Environment]::GetEnvironmentVariable("AGENT_VERSION", "process") + if (![string]::IsNullOrEmpty($agentVersion)) { + [System.Environment]::SetEnvironmentVariable("AGENT_VERSION", $agentVersion, "machine") + Write-Host "Successfully set environment variable AGENT_VERSION - $($agentVersion) for target 'machine'..." 
+ } + else { + Write-Host "Failed to set environment variable AGENT_VERSION for target 'machine' since it is either null or empty" + } + # run config parser ruby /opt/omsagentwindows/scripts/ruby/tomlparser.rb .\setenv.ps1 @@ -364,13 +382,12 @@ function Start-Fluent-Telegraf { if (![string]::IsNullOrEmpty($containerRuntime) -and [string]$containerRuntime.StartsWith('docker') -eq $false) { # change parser from docker to cri if the container runtime is not docker Write-Host "changing parser from Docker to CRI since container runtime : $($containerRuntime) and which is non-docker" - (Get-Content -Path C:/etc/fluent/fluent.conf -Raw) -replace 'fluent-docker-parser.conf','fluent-cri-parser.conf' | Set-Content C:/etc/fluent/fluent.conf + (Get-Content -Path C:/etc/fluent/fluent.conf -Raw) -replace 'fluent-docker-parser.conf', 'fluent-cri-parser.conf' | Set-Content C:/etc/fluent/fluent.conf } # Start telegraf only in sidecar scraping mode $sidecarScrapingEnabled = [System.Environment]::GetEnvironmentVariable('SIDECAR_SCRAPING_ENABLED') - if (![string]::IsNullOrEmpty($sidecarScrapingEnabled) -and $sidecarScrapingEnabled.ToLower() -eq 'true') - { + if (![string]::IsNullOrEmpty($sidecarScrapingEnabled) -and $sidecarScrapingEnabled.ToLower() -eq 'true') { Write-Host "Starting telegraf..." Start-Telegraf } @@ -411,15 +428,6 @@ function Start-Telegraf { Write-Host "Failed to set environment variable KUBERNETES_SERVICE_PORT for target 'machine' since it is either null or empty" } - $nodeIp = [System.Environment]::GetEnvironmentVariable("NODE_IP", "process") - if (![string]::IsNullOrEmpty($nodeIp)) { - [System.Environment]::SetEnvironmentVariable("NODE_IP", $nodeIp, "machine") - Write-Host "Successfully set environment variable NODE_IP - $($nodeIp) for target 'machine'..." - } - else { - Write-Host "Failed to set environment variable NODE_IP for target 'machine' since it is either null or empty" - } - Write-Host "Installing telegraf service" C:\opt\telegraf\telegraf.exe --service install --config "C:\etc\telegraf\telegraf.conf" @@ -432,14 +440,15 @@ function Start-Telegraf { sc.exe \\$serverName config telegraf start= delayed-auto Write-Host "Successfully set delayed start for telegraf" - } else { + } + else { Write-Host "Failed to get environment variable PODNAME to set delayed telegraf start" } } catch { - $e = $_.Exception - Write-Host $e - Write-Host "exception occured in delayed telegraf start.. continuing without exiting" + $e = $_.Exception + Write-Host $e + Write-Host "exception occured in delayed telegraf start.. continuing without exiting" } Write-Host "Running telegraf service in test mode" C:\opt\telegraf\telegraf.exe --config "C:\etc\telegraf\telegraf.conf" --test @@ -448,8 +457,7 @@ function Start-Telegraf { # Trying to start telegraf again if it did not start due to fluent bit not being ready at startup Get-Service telegraf | findstr Running - if ($? -eq $false) - { + if ($? -eq $false) { Write-Host "trying to start telegraf in again in 30 seconds, since fluentbit might not have been ready..." 
Start-Sleep -s 30 C:\opt\telegraf\telegraf.exe --service start @@ -488,7 +496,7 @@ function Bootstrap-CACertificates { $certMountPath = "C:\ca" Get-ChildItem $certMountPath | Foreach-Object { - $absolutePath=$_.FullName + $absolutePath = $_.FullName Write-Host "cert path: $($absolutePath)" Import-Certificate -FilePath $absolutePath -CertStoreLocation 'Cert:\LocalMachine\Root' -Verbose } @@ -510,10 +518,9 @@ Start-FileSystemWatcher $aksResourceId = [System.Environment]::GetEnvironmentVariable("AKS_RESOURCE_ID") $requiresCertBootstrap = [System.Environment]::GetEnvironmentVariable("REQUIRES_CERT_BOOTSTRAP") if (![string]::IsNullOrEmpty($requiresCertBootstrap) -and ` - $requiresCertBootstrap.ToLower() -eq 'true' -and ` - ![string]::IsNullOrEmpty($aksResourceId) -and ` - $aksResourceId.ToLower().Contains("/microsoft.containerservice/managedclusters/")) -{ + $requiresCertBootstrap.ToLower() -eq 'true' -and ` + ![string]::IsNullOrEmpty($aksResourceId) -and ` + $aksResourceId.ToLower().Contains("/microsoft.containerservice/managedclusters/")) { Bootstrap-CACertificates } From 3b3833745a12b5c793dfc66186bc54db76aa77d7 Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Thu, 15 Jul 2021 12:25:25 -0700 Subject: [PATCH 127/194] release changes (#607) --- ReleaseNotes.md | 5 +++++ kubernetes/omsagent.yaml | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ReleaseNotes.md b/ReleaseNotes.md index 423161236..0c51b737c 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -11,6 +11,11 @@ additional questions or comments. Note : The agent version(s) below has dates (ciprod), which indicate the agent build dates (not release dates) +### 07/13/2021 - +##### Version microsoft/oms:win-ciprod06112021-2 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod06112021-2 (windows) +##### Code change log +- Hotfix for fixing NODE_IP environment variable not set issue for non sidecar mode + ### 07/02/2021 - ##### Version microsoft/oms:ciprod06112021-1 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod06112021-1 (linux) ##### Version microsoft/oms:win-ciprod06112021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod06112021 (windows) diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 617c81f38..855f3a8e1 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -760,7 +760,7 @@ spec: value: "3" containers: - name: omsagent-win - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod06112021" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod06112021-2" imagePullPolicy: IfNotPresent resources: limits: From bcea7fcfbc8c68bed62912c772d21bf2823a23e5 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 19 Jul 2021 14:43:57 -0700 Subject: [PATCH 128/194] Gangams/aad stage3 msi auth (#585) * changes related to aad msi auth feature * use existing envvars * fix imds token expiry interval * refactor the windows agent ingestion token code * code cleanup * fix build errors * code clean up * code clean up * code clean up * code clean up * more refactoring * fix bug * fix bug * add debug logs * add nil checks * revert changes * revert yaml change since this added in aks side * fix pr feedback * fix pr feedback * refine retry code * update mdsd env as per official build * cleanup * update env vars per mdsd * update with mdsd official build * skip cert gen & renewal incase of aad msi auth * add nil check * cherry windows agent nodeip issue * fix merge issue 
Co-authored-by: rashmichandrashekar --- .../installer/datafiles/base_container.data | 3 + .../in_heartbeat_request.rb | 20 +- kubernetes/linux/main.sh | 112 ++-- kubernetes/linux/setup.sh | 4 +- kubernetes/windows/main.ps1 | 72 ++- .../ci-extension-dcr-streams.md | 186 +++++++ scripts/dcr-onboarding/ci-extension-dcr.json | 59 ++ source/plugins/go/src/extension/extension.go | 101 ++++ source/plugins/go/src/extension/interfaces.go | 34 ++ .../plugins/go/src/extension/socket_writer.go | 85 +++ source/plugins/go/src/go.mod | 2 +- .../plugins/go/src/ingestion_token_utils.go | 516 ++++++++++++++++++ source/plugins/go/src/oms.go | 91 ++- source/plugins/go/src/utils.go | 43 +- .../ruby/ApplicationInsightsUtility.rb | 9 +- source/plugins/ruby/CustomMetricsUtils.rb | 4 +- source/plugins/ruby/constants.rb | 23 + .../ruby/filter_health_model_builder.rb | 26 +- source/plugins/ruby/in_cadvisor_perf.rb | 26 +- source/plugins/ruby/in_containerinventory.rb | 38 +- source/plugins/ruby/in_kube_events.rb | 22 +- source/plugins/ruby/in_kube_nodes.rb | 58 +- source/plugins/ruby/in_kube_podinventory.rb | 54 +- source/plugins/ruby/in_kube_pvinventory.rb | 25 +- .../plugins/ruby/in_kubestate_deployments.rb | 21 +- source/plugins/ruby/in_kubestate_hpa.rb | 18 +- source/plugins/ruby/in_win_cadvisor_perf.rb | 12 + source/plugins/ruby/out_mdm.rb | 27 +- source/plugins/utils/extension.rb | 77 +++ source/plugins/utils/extension_utils.rb | 27 + 30 files changed, 1612 insertions(+), 183 deletions(-) create mode 100644 scripts/dcr-onboarding/ci-extension-dcr-streams.md create mode 100644 scripts/dcr-onboarding/ci-extension-dcr.json create mode 100644 source/plugins/go/src/extension/extension.go create mode 100644 source/plugins/go/src/extension/interfaces.go create mode 100644 source/plugins/go/src/extension/socket_writer.go create mode 100644 source/plugins/go/src/ingestion_token_utils.go create mode 100644 source/plugins/utils/extension.rb create mode 100644 source/plugins/utils/extension_utils.rb diff --git a/build/linux/installer/datafiles/base_container.data b/build/linux/installer/datafiles/base_container.data index bdacf647d..b71cafd49 100644 --- a/build/linux/installer/datafiles/base_container.data +++ b/build/linux/installer/datafiles/base_container.data @@ -150,6 +150,9 @@ MAINTAINER: 'Microsoft Corporation' /etc/fluent/plugin/omslog.rb; source/plugins/utils/omslog.rb; 644; root; root /etc/fluent/plugin/oms_common.rb; source/plugins/utils/oms_common.rb; 644; root; root +/etc/fluent/plugin/extension.rb; source/plugins/utils/extension.rb; 644; root; root +/etc/fluent/plugin/extension_utils.rb; source/plugins/utils/extension_utils.rb; 644; root; root + /etc/fluent/kube.conf; build/linux/installer/conf/kube.conf; 644; root; root /etc/fluent/container.conf; build/linux/installer/conf/container.conf; 644; root; root diff --git a/build/windows/installer/scripts/rubyKeepCertificateAlive/in_heartbeat_request.rb b/build/windows/installer/scripts/rubyKeepCertificateAlive/in_heartbeat_request.rb index e255c4a71..e525d8681 100644 --- a/build/windows/installer/scripts/rubyKeepCertificateAlive/in_heartbeat_request.rb +++ b/build/windows/installer/scripts/rubyKeepCertificateAlive/in_heartbeat_request.rb @@ -36,14 +36,18 @@ def start def enumerate begin - puts "Calling certificate renewal code..." - maintenance = OMS::OnboardingHelper.new( - ENV["WSID"], - ENV["DOMAIN"], - ENV["CI_AGENT_GUID"] - ) - ret_code = maintenance.register_certs() - puts "Return code from register certs : #{ret_code}" + if !ENV["AAD_MSI_AUTH_MODE"].nil? 
&& !ENV["AAD_MSI_AUTH_MODE"].empty? && ENV["AAD_MSI_AUTH_MODE"].downcase == "true" + puts "skipping certificate renewal code since AAD MSI auth configured" + else + puts "Calling certificate renewal code..." + maintenance = OMS::OnboardingHelper.new( + ENV["WSID"], + ENV["DOMAIN"], + ENV["CI_AGENT_GUID"] + ) + ret_code = maintenance.register_certs() + puts "Return code from register certs : #{ret_code}" + end rescue => errorStr puts "in_heartbeat_request::enumerate:Failed in enumerate: #{errorStr}" # STDOUT telemetry should already be going to Traces in AI. diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index 1a7034d4d..428e6f35a 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -12,7 +12,7 @@ waitforlisteneronTCPport() { echo "${FUNCNAME[0]} called with incorrect arguments<$1 , $2>. Required arguments <#port, #wait-time-in-seconds>" return -1 else - + if [[ $port =~ $numeric ]] && [[ $waittimesecs =~ $numeric ]]; then #local varlistener=$(netstat -lnt | awk '$6 == "LISTEN" && $4 ~ ":25228$"') while true @@ -57,7 +57,11 @@ else export customResourceId=$AKS_RESOURCE_ID echo "export customResourceId=$AKS_RESOURCE_ID" >> ~/.bashrc source ~/.bashrc - echo "customResourceId:$customResourceId" + echo "customResourceId:$customResourceId" + export customRegion=$AKS_REGION + echo "export customRegion=$AKS_REGION" >> ~/.bashrc + source ~/.bashrc + echo "customRegion:$customRegion" fi #set agent config schema version @@ -194,9 +198,15 @@ fi if [ -z $domain ]; then CLOUD_ENVIRONMENT="unknown" elif [ $domain == "opinsights.azure.com" ]; then - CLOUD_ENVIRONMENT="public" -else - CLOUD_ENVIRONMENT="national" + CLOUD_ENVIRONMENT="azurepubliccloud" +elif [ $domain == "opinsights.azure.cn" ]; then + CLOUD_ENVIRONMENT="azurechinacloud" +elif [ $domain == "opinsights.azure.us" ]; then + CLOUD_ENVIRONMENT="azureusgovernmentcloud" +elif [ $domain == "opinsights.azure.eaglex.ic.gov" ]; then + CLOUD_ENVIRONMENT="usnat" +elif [ $domain == "opinsights.azure.microsoft.scloud" ]; then + CLOUD_ENVIRONMENT="ussec" fi export CLOUD_ENVIRONMENT=$CLOUD_ENVIRONMENT echo "export CLOUD_ENVIRONMENT=$CLOUD_ENVIRONMENT" >> ~/.bashrc @@ -233,9 +243,9 @@ if [ ${#APPLICATIONINSIGHTS_AUTH_URL} -ge 1 ]; then # (check if APPLICATIONINSI fi -aikey=$(echo $APPLICATIONINSIGHTS_AUTH | base64 --decode) -export TELEMETRY_APPLICATIONINSIGHTS_KEY=$aikey -echo "export TELEMETRY_APPLICATIONINSIGHTS_KEY=$aikey" >> ~/.bashrc +aikey=$(echo $APPLICATIONINSIGHTS_AUTH | base64 --decode) +export TELEMETRY_APPLICATIONINSIGHTS_KEY=$aikey +echo "export TELEMETRY_APPLICATIONINSIGHTS_KEY=$aikey" >> ~/.bashrc source ~/.bashrc @@ -421,7 +431,7 @@ export KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC="kubelet_docker_operations_error if [ "$CONTAINER_RUNTIME" != "docker" ]; then # these metrics are available only on k8s versions <1.18 and will get deprecated from 1.18 export KUBELET_RUNTIME_OPERATIONS_METRIC="kubelet_runtime_operations" - export KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC="kubelet_runtime_operations_errors" + export KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC="kubelet_runtime_operations_errors" fi echo "set caps for ruby process to read container env from proc" @@ -445,34 +455,56 @@ DOCKER_CIMPROV_VERSION=$(dpkg -l | grep docker-cimprov | awk '{print $3}') echo "DOCKER_CIMPROV_VERSION=$DOCKER_CIMPROV_VERSION" export DOCKER_CIMPROV_VERSION=$DOCKER_CIMPROV_VERSION echo "export DOCKER_CIMPROV_VERSION=$DOCKER_CIMPROV_VERSION" >> ~/.bashrc -echo "*** activating oneagent in legacy auth mode ***" -CIWORKSPACE_id="$(cat
/etc/omsagent-secret/WSID)" -#use the file path as its secure than env -CIWORKSPACE_keyFile="/etc/omsagent-secret/KEY" -cat /etc/mdsd.d/envmdsd | while read line; do - echo $line >> ~/.bashrc -done -source /etc/mdsd.d/envmdsd -echo "setting mdsd workspaceid & key for workspace:$CIWORKSPACE_id" -export CIWORKSPACE_id=$CIWORKSPACE_id -echo "export CIWORKSPACE_id=$CIWORKSPACE_id" >> ~/.bashrc -export CIWORKSPACE_keyFile=$CIWORKSPACE_keyFile -echo "export CIWORKSPACE_keyFile=$CIWORKSPACE_keyFile" >> ~/.bashrc -export OMS_TLD=$domain -echo "export OMS_TLD=$OMS_TLD" >> ~/.bashrc -export MDSD_FLUENT_SOCKET_PORT="29230" -echo "export MDSD_FLUENT_SOCKET_PORT=$MDSD_FLUENT_SOCKET_PORT" >> ~/.bashrc -#skip imds lookup since not used in legacy auth path +#skip imds lookup since not used either legacy or aad msi auth path export SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH="true" echo "export SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH=$SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH" >> ~/.bashrc - +# this used by mdsd to determine cloud specific LA endpoints +export OMS_TLD=$domain +echo "export OMS_TLD=$OMS_TLD" >> ~/.bashrc +cat /etc/mdsd.d/envmdsd | while read line; do + echo $line >> ~/.bashrc +done +source /etc/mdsd.d/envmdsd +MDSD_AAD_MSI_AUTH_ARGS="" +# check if its AAD Auth MSI mode via USING_AAD_MSI_AUTH +export AAD_MSI_AUTH_MODE=false +if [ "${USING_AAD_MSI_AUTH}" == "true" ]; then + echo "*** activating oneagent in aad auth msi mode ***" + # msi auth specific args + MDSD_AAD_MSI_AUTH_ARGS="-a -A" + export AAD_MSI_AUTH_MODE=true + echo "export AAD_MSI_AUTH_MODE=true" >> ~/.bashrc + # this used by mdsd to determine the cloud specific AMCS endpoints + export customEnvironment=$CLOUD_ENVIRONMENT + echo "export customEnvironment=$customEnvironment" >> ~/.bashrc + export MDSD_FLUENT_SOCKET_PORT="28230" + echo "export MDSD_FLUENT_SOCKET_PORT=$MDSD_FLUENT_SOCKET_PORT" >> ~/.bashrc + export ENABLE_MCS="true" + echo "export ENABLE_MCS=$ENABLE_MCS" >> ~/.bashrc + export MONITORING_USE_GENEVA_CONFIG_SERVICE="false" + echo "export MONITORING_USE_GENEVA_CONFIG_SERVICE=$MONITORING_USE_GENEVA_CONFIG_SERVICE" >> ~/.bashrc + export MDSD_USE_LOCAL_PERSISTENCY="false" + echo "export MDSD_USE_LOCAL_PERSISTENCY=$MDSD_USE_LOCAL_PERSISTENCY" >> ~/.bashrc +else + echo "*** activating oneagent in legacy auth mode ***" + CIWORKSPACE_id="$(cat /etc/omsagent-secret/WSID)" + #use the file path as its secure than env + CIWORKSPACE_keyFile="/etc/omsagent-secret/KEY" + echo "setting mdsd workspaceid & key for workspace:$CIWORKSPACE_id" + export CIWORKSPACE_id=$CIWORKSPACE_id + echo "export CIWORKSPACE_id=$CIWORKSPACE_id" >> ~/.bashrc + export CIWORKSPACE_keyFile=$CIWORKSPACE_keyFile + echo "export CIWORKSPACE_keyFile=$CIWORKSPACE_keyFile" >> ~/.bashrc + export MDSD_FLUENT_SOCKET_PORT="29230" + echo "export MDSD_FLUENT_SOCKET_PORT=$MDSD_FLUENT_SOCKET_PORT" >> ~/.bashrc +fi source ~/.bashrc dpkg -l | grep mdsd | awk '{print $2 " " $3}' -if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then - echo "starting mdsd with mdsd-port=26130, fluentport=26230 and influxport=26330 in legacy auth mode in sidecar container..." +if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then + echo "starting mdsd with mdsd-port=26130, fluentport=26230 and influxport=26330 in sidecar container..." 
#use tenant name to avoid unix socket conflict and different ports for port conflict #roleprefix to use container specific mdsd socket export TENANT_NAME="${CONTAINER_TYPE}" @@ -482,23 +514,23 @@ if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then source ~/.bashrc mkdir /var/run/mdsd-${CONTAINER_TYPE} # add -T 0xFFFF for full traces - mdsd -r ${MDSD_ROLE_PREFIX} -p 26130 -f 26230 -i 26330 -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos & -else - echo "starting mdsd in legacy auth mode in main container..." - # add -T 0xFFFF for full traces - mdsd -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos & + mdsd ${MDSD_AAD_MSI_AUTH_ARGS} -r ${MDSD_ROLE_PREFIX} -p 26130 -f 26230 -i 26330 -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos & +else + echo "starting mdsd mode in main container..." + # add -T 0xFFFF for full traces + mdsd ${MDSD_AAD_MSI_AUTH_ARGS} -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos & fi -# no dependency on fluentd for prometheus side car container -if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then +# no dependency on fluentd for prometheus side car container +if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then if [ ! -e "/etc/config/kube.conf" ]; then echo "*** starting fluentd v1 in daemonset" fluentd -c /etc/fluent/container.conf -o /var/opt/microsoft/docker-cimprov/log/fluentd.log --log-rotate-age 5 --log-rotate-size 20971520 & else echo "*** starting fluentd v1 in replicaset" fluentd -c /etc/fluent/kube.conf -o /var/opt/microsoft/docker-cimprov/log/fluentd.log --log-rotate-age 5 --log-rotate-size 20971520 & - fi -fi + fi +fi #If config parsing was successful, a copy of the conf file with replaced custom settings file is created if [ ! -e "/etc/config/kube.conf" ]; then @@ -635,7 +667,7 @@ echo "getting rsyslog status..." 
service rsyslog status shutdown() { - pkill -f mdsd + pkill -f mdsd } trap "shutdown" SIGTERM diff --git a/kubernetes/linux/setup.sh b/kubernetes/linux/setup.sh index ad7cc2232..933c14aed 100644 --- a/kubernetes/linux/setup.sh +++ b/kubernetes/linux/setup.sh @@ -9,8 +9,8 @@ sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen && \ dpkg-reconfigure --frontend=noninteractive locales && \ update-locale LANG=en_US.UTF-8 -#install oneagent - Official bits (05/17/2021) -wget https://github.com/microsoft/Docker-Provider/releases/download/05172021-oneagent/azure-mdsd_1.10.1-build.master.213_x86_64.deb +#install oneagent - Official bits (06/24/2021) +wget https://github.com/microsoft/Docker-Provider/releases/download/06242021-oneagent/azure-mdsd_1.10.3-build.master.241_x86_64.deb /usr/bin/dpkg -i $TMPDIR/azure-mdsd*.deb cp -f $TMPDIR/mdsd.xml /etc/mdsd.d diff --git a/kubernetes/windows/main.ps1 b/kubernetes/windows/main.ps1 index 1bb9a3468..3cbc11e20 100644 --- a/kubernetes/windows/main.ps1 +++ b/kubernetes/windows/main.ps1 @@ -43,17 +43,49 @@ function Start-FileSystemWatcher { function Set-EnvironmentVariables { $domain = "opinsights.azure.com" - $cloud_environment = "public" + $mcs_endpoint = "monitor.azure.com" + $cloud_environment = "azurepubliccloud" if (Test-Path /etc/omsagent-secret/DOMAIN) { # TODO: Change to omsagent-secret before merging $domain = Get-Content /etc/omsagent-secret/DOMAIN - $cloud_environment = "national" + if (![string]::IsNullOrEmpty($domain)) { + if ($domain -eq "opinsights.azure.com") { + $cloud_environment = "azurepubliccloud" + $mcs_endpoint = "monitor.azure.com" + } elseif ($domain -eq "opinsights.azure.cn") { + $cloud_environment = "azurechinacloud" + $mcs_endpoint = "monitor.azure.cn" + } elseif ($domain -eq "opinsights.azure.us") { + $cloud_environment = "azureusgovernmentcloud" + $mcs_endpoint = "monitor.azure.us" + } elseif ($domain -eq "opinsights.azure.eaglex.ic.gov") { + $cloud_environment = "usnat" + $mcs_endpoint = "monitor.azure.eaglex.ic.gov" + } elseif ($domain -eq "opinsights.azure.microsoft.scloud") { + $cloud_environment = "ussec" + $mcs_endpoint = "monitor.azure.microsoft.scloud" + } else { + Write-Host "Invalid or Unsupported domain name $($domain). EXITING....." + exit 1 + } + } else { + Write-Host "Domain name either null or empty. EXITING....." + exit 1 + } } + Write-Host "Log analytics domain: $($domain)" + Write-Host "MCS endpoint: $($mcs_endpoint)" + Write-Host "Cloud Environment: $($cloud_environment)" + # Set DOMAIN [System.Environment]::SetEnvironmentVariable("DOMAIN", $domain, "Process") [System.Environment]::SetEnvironmentVariable("DOMAIN", $domain, "Machine") + # Set MCS Endpoint + [System.Environment]::SetEnvironmentVariable("MCS_ENDPOINT", $mcs_endpoint, "Process") + [System.Environment]::SetEnvironmentVariable("MCS_ENDPOINT", $mcs_endpoint, "Machine") + # Set CLOUD_ENVIRONMENT [System.Environment]::SetEnvironmentVariable("CLOUD_ENVIRONMENT", $cloud_environment, "Process") [System.Environment]::SetEnvironmentVariable("CLOUD_ENVIRONMENT", $cloud_environment, "Machine") @@ -158,7 +190,7 @@ function Set-EnvironmentVariables { Write-Host $_.Exception } } - + # Check if the fetched IKey was properly encoded. 
if not then turn off telemetry if ($aiKeyFetched -match '^[A-Za-z0-9=]+$') { Write-Host "Using cloud-specific instrumentation key" @@ -229,6 +261,21 @@ function Set-EnvironmentVariables { Write-Host "Failed to set environment variable HOSTNAME for target 'machine' since it is either null or empty" } + # check if its AAD Auth MSI mode via USING_AAD_MSI_AUTH environment variable + $isAADMSIAuth = [System.Environment]::GetEnvironmentVariable("USING_AAD_MSI_AUTH", "process") + if (![string]::IsNullOrEmpty($isAADMSIAuth)) { + [System.Environment]::SetEnvironmentVariable("AAD_MSI_AUTH_MODE", $isAADMSIAuth, "Process") + [System.Environment]::SetEnvironmentVariable("AAD_MSI_AUTH_MODE", $isAADMSIAuth, "Machine") + Write-Host "Successfully set environment variable AAD_MSI_AUTH_MODE - $($isAADMSIAuth) for target 'machine'..." + } + + # check if use token proxy endpoint set via USE_IMDS_TOKEN_PROXY_END_POINT environment variable + $useIMDSTokenProxyEndpoint = [System.Environment]::GetEnvironmentVariable("USE_IMDS_TOKEN_PROXY_END_POINT", "process") + if (![string]::IsNullOrEmpty($useIMDSTokenProxyEndpoint)) { + [System.Environment]::SetEnvironmentVariable("USE_IMDS_TOKEN_PROXY_END_POINT", $useIMDSTokenProxyEndpoint, "Process") + [System.Environment]::SetEnvironmentVariable("USE_IMDS_TOKEN_PROXY_END_POINT", $useIMDSTokenProxyEndpoint, "Machine") + Write-Host "Successfully set environment variable USE_IMDS_TOKEN_PROXY_END_POINT - $($useIMDSTokenProxyEndpoint) for target 'machine'..." + } $nodeIp = [System.Environment]::GetEnvironmentVariable("NODE_IP", "process") if (![string]::IsNullOrEmpty($nodeIp)) { [System.Environment]::SetEnvironmentVariable("NODE_IP", $nodeIp, "machine") @@ -427,7 +474,15 @@ function Start-Telegraf { else { Write-Host "Failed to set environment variable KUBERNETES_SERVICE_PORT for target 'machine' since it is either null or empty" } - + $nodeIp = [System.Environment]::GetEnvironmentVariable("NODE_IP", "process") + if (![string]::IsNullOrEmpty($nodeIp)) { + [System.Environment]::SetEnvironmentVariable("NODE_IP", $nodeIp, "machine") + Write-Host "Successfully set environment variable NODE_IP - $($nodeIp) for target 'machine'..." + } + else { + Write-Host "Failed to set environment variable NODE_IP for target 'machine' since it is either null or empty" + } + Write-Host "Installing telegraf service" C:\opt\telegraf\telegraf.exe --service install --config "C:\etc\telegraf\telegraf.conf" @@ -524,8 +579,13 @@ if (![string]::IsNullOrEmpty($requiresCertBootstrap) -and ` Bootstrap-CACertificates } -Generate-Certificates -Test-CertificatePath +$isAADMSIAuth = [System.Environment]::GetEnvironmentVariable("USING_AAD_MSI_AUTH") +if (![string]::IsNullOrEmpty($isAADMSIAuth) -and $isAADMSIAuth.ToLower() -eq 'true') { + Write-Host "skipping agent onboarding via cert since AAD MSI Auth configured" +} else { + Generate-Certificates + Test-CertificatePath +} Start-Fluent-Telegraf # List all powershell processes running. 
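The two dcr-onboarding artifacts added below declare which streams the ContainerInsights extension emits; at runtime the new extension package (also added below) asks mdsd over its fluent socket which output stream id each data type was bound to. A hedged sketch of how a Go consumer might resolve a stream id — the module import path is illustrative, and this only works inside the agent container where the mdsd socket exists:

```go
package main

import (
	"log"
	"os"

	// hypothetical import path for the extension package added in this patch
	"github.com/microsoft/docker-provider/source/plugins/go/src/extension"
)

func main() {
	logger := log.New(os.Stderr, "", log.LstdFlags)

	// Empty container type selects the default /var/run/mdsd/default_fluent.socket;
	// "PrometheusSidecar" would route to the sidecar-specific socket instead.
	e := extension.GetInstance(logger, "")

	// Data type names mirror the data-type entries in ci-extension-dcr-streams.md.
	streamID := e.GetOutputStreamId("CONTAINERINSIGHTS_CONTAINERLOGV2")
	logger.Printf("tag ContainerLogV2 records with stream id %q", streamID)
}
```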
This should have main.ps1 and filesystemwatcher.ps1 diff --git a/scripts/dcr-onboarding/ci-extension-dcr-streams.md b/scripts/dcr-onboarding/ci-extension-dcr-streams.md new file mode 100644 index 000000000..cbac41838 --- /dev/null +++ b/scripts/dcr-onboarding/ci-extension-dcr-streams.md @@ -0,0 +1,186 @@ +# 1 - ContainerLogV2 +> Note- Please note, this table uses NG schema +``` +stream-id: Microsoft-ContainerLogV2 +data-type: CONTAINERINSIGHTS_CONTAINERLOGV2 +intelligence-pack: ContainerInsights +solutions: ContainerInsights +platform: Any +la-table-name: ContainerLogV2 +alias-stream-id: Microsoft-ContainerLogV2 +contact-alias: OMScontainers@microsoft.com +stage: to review +tags: agent +``` + +# 2 - InsightsMetrics +``` +stream-id: Microsoft-InsightsMetrics +data-type: INSIGHTS_METRICS_BLOB +intelligence-pack: ContainerInsights +solutions: ContainerInsights +platform: Any +la-table-name: InsightsMetrics +alias-stream-id: Microsoft-InsightsMetrics +contact-alias: OMScontainers@microsoft.com +stage: to review +tags: agent +``` + +# 3 - ContainerInventory + +``` +stream-id: Microsoft-ContainerInventory +data-type: CONTAINER_INVENTORY_BLOB +intelligence-pack: ContainerInsights +solutions: ContainerInsights +platform: Any +la-table-name: ContainerInventory +alias-stream-id: Microsoft-ContainerInventory +contact-alias: OMScontainers@microsoft.com +stage: to review +tags: agent +``` + +# 4 - ContainerLog + +``` +stream-id: Microsoft-ContainerLog +data-type: CONTAINER_LOG_BLOB +intelligence-pack: Containers +solutions: ContainerInsights +platform: Any +la-table-name: ContainerLog +alias-stream-id: Microsoft-ContainerLog +contact-alias: OMScontainers@microsoft.com +stage: to review +tags: agent +``` + +# 5 - ContainerNodeInventory + +``` +stream-id: Microsoft-ContainerNodeInventory +data-type: CONTAINER_NODE_INVENTORY_BLOB +intelligence-pack: ContainerInsights +solutions: ContainerInsights +platform: Any +la-table-name: ContainerNodeInventory +alias-stream-id: Microsoft-ContainerNodeInventory +contact-alias: OMScontainers@microsoft.com +stage: to review +tags: agent +``` + +# 6 - KubePodInventory +``` +stream-id: Microsoft-KubePodInventory +data-type: KUBE_POD_INVENTORY_BLOB +intelligence-pack: ContainerInsights +solutions: ContainerInsights +platform: Any +la-table-name: KubePodInventory +alias-stream-id: Microsoft-KubePodInventory +contact-alias: OMScontainers@microsoft.com +stage: to review +tags: agent +``` + +# 7 - KubeNodeInventory +``` +stream-id: Microsoft-KubeNodeInventory +data-type: KUBE_NODE_INVENTORY_BLOB +intelligence-pack: ContainerInsights +solutions: ContainerInsights +platform: Any +la-table-name: KubeNodeInventory +alias-stream-id: Microsoft-KubeNodeInventory +contact-alias: OMScontainers@microsoft.com +stage: to review +tags: agent +``` + +# 8 - KubePVInventory +``` +stream-id: Microsoft-KubePVInventory +data-type: KUBE_PV_INVENTORY_BLOB +intelligence-pack: ContainerInsights +solutions: ContainerInsights +platform: Any +la-table-name: KubePVInventory +alias-stream-id: Microsoft-KubePVInventory +contact-alias: OMScontainers@microsoft.com +stage: to review +tags: agent +``` + +# 9 - KubeEvents +``` +stream-id: Microsoft-KubeEvents +data-type: KUBE_EVENTS_BLOB +intelligence-pack: ContainerInsights +solutions: ContainerInsights +platform: Any +la-table-name: KubeEvents +alias-stream-id: Microsoft-KubeEvents +contact-alias: OMScontainers@microsoft.com +stage: to review +tags: agent +``` + +# 10 - KubeServices +``` +stream-id: Microsoft-KubeServices +data-type: KUBE_SERVICES_BLOB 
+intelligence-pack: ContainerInsights +solutions: ContainerInsights +platform: Any +la-table-name: KubeServices +alias-stream-id: Microsoft-KubeServices +contact-alias: OMScontainers@microsoft.com +stage: to review +tags: agent +``` + +# 11 - KubeMonAgentEvents +``` +stream-id: Microsoft-KubeMonAgentEvents +data-type: KUBE_MON_AGENT_EVENTS_BLOB +intelligence-pack: Containers +solutions: ContainerInsights +platform: Any +la-table-name: KubeMonAgentEvents +alias-stream-id: Microsoft-KubeMonAgentEvents +contact-alias: OMScontainers@microsoft.com +stage: to review +tags: agent +``` + +# 12 - KubeHealth +``` +stream-id: Microsoft-KubeHealth +data-type: KUBE_HEALTH_BLOB +intelligence-pack: ContainerInsights +solutions: ContainerInsights +platform: Any +la-table-name: KubeHealth +alias-stream-id: Microsoft-KubeHealth +contact-alias: OMScontainers@microsoft.com +stage: to review +tags: agent +``` + +# 13 - Perf +> Note - This stream already exists +``` +stream-id: Microsoft-Perf +data-type: LINUX_PERF_BLOB +intelligence-pack: LogManagement +solutions: ContainerInsights +platform: Any +la-table-name: LogManagement +alias-stream-id: Microsoft-Perf +contact-alias: OMScontainers@microsoft.com +stage: to review +tags: agent +``` diff --git a/scripts/dcr-onboarding/ci-extension-dcr.json b/scripts/dcr-onboarding/ci-extension-dcr.json new file mode 100644 index 000000000..f3fbec79b --- /dev/null +++ b/scripts/dcr-onboarding/ci-extension-dcr.json @@ -0,0 +1,59 @@ +{ + "location": "", + "properties": { + "dataSources": { + "extensions": [ + { + "name": "ContainerInsightsExtension", + "streams": [ + "Microsoft-Perf", + "Microsoft-ContainerInventory", + "Microsoft-ContainerLog", + "Microsoft-ContainerLogV2", + "Microsoft-ContainerNodeInventory", + "Microsoft-KubeEvents", + "Microsoft-KubeHealth", + "Microsoft-KubeMonAgentEvents", + "Microsoft-KubeNodeInventory", + "Microsoft-KubePodInventory", + "Microsoft-KubePVInventory", + "Microsoft-KubeServices", + "Microsoft-InsightsMetrics" + ], + "extensionName": "ContainerInsights" + } + ] + }, + "destinations": { + "logAnalytics": [ + { + "workspaceResourceId": "/subscriptions//resourcegroups//providers/microsoft.operationalinsights/workspaces/", + "name": "ciworkspace" + } + ] + }, + "dataFlows": [ + { + "streams": [ + "Microsoft-Perf", + "Microsoft-ContainerInventory", + "Microsoft-ContainerLog", + "Microsoft-ContainerLogV2", + "Microsoft-ContainerNodeInventory", + "Microsoft-KubeEvents", + "Microsoft-KubeHealth", + "Microsoft-KubeMonAgentEvents", + "Microsoft-KubeNodeInventory", + "Microsoft-KubePodInventory", + "Microsoft-KubePVInventory", + "Microsoft-KubeServices", + "Microsoft-InsightsMetrics" + ], + "destinations": [ + "ciworkspace" + ] + } + ] + } +} diff --git a/source/plugins/go/src/extension/extension.go b/source/plugins/go/src/extension/extension.go new file mode 100644 index 000000000..c68140ded --- /dev/null +++ b/source/plugins/go/src/extension/extension.go @@ -0,0 +1,101 @@ +package extension + +import ( + "encoding/json" + "fmt" + "log" + "strings" + "sync" + uuid "github.com/google/uuid" + "github.com/ugorji/go/codec" +) + +type Extension struct { + datatypeStreamIdMap map[string]string +} + +var singleton *Extension +var once sync.Once +var extensionconfiglock sync.Mutex +var logger *log.Logger +var containerType string + +func GetInstance(flbLogger *log.Logger, containertype string) *Extension { + once.Do(func() { + singleton = &Extension{make(map[string]string)} + flbLogger.Println("Extension Instance created") + }) + logger = flbLogger
containerType = containerType + return singleton +} + +func (e *Extension) GetOutputStreamId(datatype string) string { + extensionconfiglock.Lock() + defer extensionconfiglock.Unlock() + if len(e.datatypeStreamIdMap) > 0 && e.datatypeStreamIdMap[datatype] != "" { + message := fmt.Sprintf("OutputstreamId: %s for the datatype: %s", e.datatypeStreamIdMap[datatype], datatype) + logger.Printf(message) + return e.datatypeStreamIdMap[datatype] + } + var err error + e.datatypeStreamIdMap, err = getDataTypeToStreamIdMapping() + if err != nil { + message := fmt.Sprintf("Error getting datatype to streamid mapping: %s", err.Error()) + logger.Printf(message) + } + return e.datatypeStreamIdMap[datatype] +} + +func getDataTypeToStreamIdMapping() (map[string]string, error) { + logger.Printf("extensionconfig::getDataTypeToStreamIdMapping:: getting extension config from fluent socket - start") + guid := uuid.New() + datatypeOutputStreamMap := make(map[string]string) + + taggedData := map[string]interface{}{"Request": "AgentTaggedData", "RequestId": guid.String(), "Tag": "ContainerInsights", "Version": "1"} + jsonBytes, err := json.Marshal(taggedData) + + var data []byte + enc := codec.NewEncoderBytes(&data, new(codec.MsgpackHandle)) + if err := enc.Encode(string(jsonBytes)); err != nil { + return datatypeOutputStreamMap, err + } + + fs := &FluentSocketWriter{ } + fs.sockAddress = "/var/run/mdsd/default_fluent.socket" + if containerType != "" && strings.Compare(strings.ToLower(containerType), "prometheussidecar") == 0 { + fs.sockAddress = fmt.Sprintf("/var/run/mdsd-%s/default_fluent.socket", containerType) + } + responseBytes, err := fs.WriteAndRead(data) + defer fs.disConnect() + logger.Printf("Info::mdsd::Making call to FluentSocket: %s to write and read the config data", fs.sockAddress) + if err != nil { + return datatypeOutputStreamMap, err + } + response := string(responseBytes) + + var responseObjet AgentTaggedDataResponse + err = json.Unmarshal([]byte(response), &responseObjet) + if err != nil { + logger.Printf("Error::mdsd::Failed to unmarshal config data. 
Error message: %s", string(err.Error())) + return datatypeOutputStreamMap, err + } + + var extensionData TaggedData + json.Unmarshal([]byte(responseObjet.TaggedData), &extensionData) + + extensionConfigs := extensionData.ExtensionConfigs + logger.Printf("Info::mdsd::build the datatype and streamid map -- start") + for _, extensionConfig := range extensionConfigs { + outputStreams := extensionConfig.OutputStreams + for dataType, outputStreamID := range outputStreams { + logger.Printf("Info::mdsd::datatype: %s, outputstreamId: %s", dataType, outputStreamID) + datatypeOutputStreamMap[dataType] = outputStreamID.(string) + } + } + logger.Printf("Info::mdsd::build the datatype and streamid map -- end") + + logger.Printf("extensionconfig::getDataTypeToStreamIdMapping:: getting extension config from fluent socket-end") + + return datatypeOutputStreamMap, nil +} diff --git a/source/plugins/go/src/extension/interfaces.go b/source/plugins/go/src/extension/interfaces.go new file mode 100644 index 000000000..c70ef17b8 --- /dev/null +++ b/source/plugins/go/src/extension/interfaces.go @@ -0,0 +1,34 @@ +package extension + +// AgentTaggedDataResponse struct for response from AgentTaggedData request +type AgentTaggedDataResponse struct { + Request string `json:"Request"` + RequestID string `json:"RequestId"` + Version string `json:"Version"` + Success bool `json:"Success"` + Description string `json:"Description"` + TaggedData string `json:"TaggedData"` +} + +// TaggedData structure for respone +type TaggedData struct { + SchemaVersion int `json:"schemaVersion"` + Version int `json:"version"` + ExtensionName string `json:"extensionName"` + ExtensionConfigs []ExtensionConfig `json:"extensionConfigurations"` + OutputStreamDefinitions map[string]StreamDefinition `json:"outputStreamDefinitions"` +} + +// StreamDefinition structure for named pipes +type StreamDefinition struct { + NamedPipe string `json:"namedPipe"` +} + +// ExtensionConfig structure for extension definition in DCR +type ExtensionConfig struct { + ID string `json:"id"` + OriginIds []string `json:"originIds"` + ExtensionSettings map[string]interface{} `json:"extensionSettings"` + InputStreams map[string]interface{} `json:"inputStreams"` + OutputStreams map[string]interface{} `json:"outputStreams"` +} diff --git a/source/plugins/go/src/extension/socket_writer.go b/source/plugins/go/src/extension/socket_writer.go new file mode 100644 index 000000000..1b16b319c --- /dev/null +++ b/source/plugins/go/src/extension/socket_writer.go @@ -0,0 +1,85 @@ +package extension + +import ( + "net" +) + +//MaxRetries for trying to write data to the socket +const MaxRetries = 5 + +//ReadBufferSize for reading data from sockets +//Current CI extension config size is ~5KB and going with 20KB to handle any future scenarios +const ReadBufferSize = 20480 + +//FluentSocketWriter writes data to AMA's default fluent socket +type FluentSocketWriter struct { + socket net.Conn + sockAddress string +} + +func (fs *FluentSocketWriter) connect() error { + c, err := net.Dial("unix", fs.sockAddress) + if err != nil { + return err + } + fs.socket = c + return nil +} + +func (fs *FluentSocketWriter) disConnect() error { + if (fs.socket != nil) { + fs.socket.Close() + fs.socket = nil + } + return nil +} + +func (fs *FluentSocketWriter) writeWithRetries(data []byte) (int, error) { + var ( + err error + n int + ) + for i := 0; i < MaxRetries; i++ { + n, err = fs.socket.Write(data) + if err == nil { + return n, nil + } + } + if err, ok := err.(net.Error); !ok || !err.Temporary() { + // 
so that connect() is called next time if write fails + // this happens when mdsd is restarted + _ = fs.socket.Close() // no need to log the socket closing error + fs.socket = nil + } + return 0, err +} + +func (fs *FluentSocketWriter) read() ([]byte, error) { + buf := make([]byte, ReadBufferSize) + n, err := fs.socket.Read(buf) + if err != nil { + return nil, err + } + return buf[:n], nil + +} + +func (fs *FluentSocketWriter) Write(payload []byte) (int, error) { + if fs.socket == nil { + // previous write failed with permanent error and socket was closed. + if err := fs.connect(); err != nil { + return 0, err + } + } + + return fs.writeWithRetries(payload) +} + +//WriteAndRead writes data to the socket and sends the response back +func (fs *FluentSocketWriter) WriteAndRead(payload []byte) ([]byte, error) { + _, err := fs.Write(payload) + if err != nil { + return nil, err + } + return fs.read() +} diff --git a/source/plugins/go/src/go.mod b/source/plugins/go/src/go.mod index c3e6c2044..db29a0553 100644 --- a/source/plugins/go/src/go.mod +++ b/source/plugins/go/src/go.mod @@ -23,7 +23,7 @@ require ( github.com/philhofer/fwd v1.0.0 // indirect github.com/satori/go.uuid v1.2.1-0.20181028125025-b2ce2384e17b // indirect github.com/tinylib/msgp v1.1.2 - github.com/ugorji/go v1.1.2-0.20180813092308-00b869d2f4a5 // indirect + github.com/ugorji/go v1.1.2-0.20180813092308-00b869d2f4a5 golang.org/x/net v0.0.0-20200421231249-e086a090c8fd // indirect golang.org/x/time v0.0.0-20161028155119-f51c12702a4d // indirect gopkg.in/inf.v0 v0.9.0 // indirect diff --git a/source/plugins/go/src/ingestion_token_utils.go b/source/plugins/go/src/ingestion_token_utils.go new file mode 100644 index 000000000..c96685042 --- /dev/null +++ b/source/plugins/go/src/ingestion_token_utils.go @@ -0,0 +1,516 @@ +package main + +import ( + "encoding/json" + "errors" + "fmt" + "io/ioutil" + "net/http" + "net/url" + "os" + "regexp" + "strconv" + "strings" + "time" +) + +const IMDSTokenPathForWindows = "c:/etc/imds-access-token/token" // only used in windows +const AMCSAgentConfigAPIVersion = "2020-08-01-preview" +const AMCSIngestionTokenAPIVersion = "2020-04-01-preview" +const MaxRetries = 3 + +var IMDSToken string +var IMDSTokenExpiration int64 + +var ConfigurationId string +var ChannelId string + +var IngestionAuthToken string +var IngestionAuthTokenExpiration int64 + +type IMDSResponse struct { + AccessToken string `json:"access_token"` + ClientID string `json:"client_id"` + ExpiresIn string `json:"expires_in"` + ExpiresOn string `json:"expires_on"` + ExtExpiresIn string `json:"ext_expires_in"` + NotBefore string `json:"not_before"` + Resource string `json:"resource"` + TokenType string `json:"token_type"` +} + +type AgentConfiguration struct { + Configurations []struct { + Configurationid string `json:"configurationId"` + Etag string `json:"eTag"` + Op string `json:"op"` + Content struct { + Datasources []struct { + Configuration struct { + Extensionname string `json:"extensionName"` + } `json:"configuration"` + ID string `json:"id"` + Kind string `json:"kind"` + Streams []struct { + Stream string `json:"stream"` + Solution string `json:"solution"` + Extensionoutputstream string `json:"extensionOutputStream"` + } `json:"streams"` + Sendtochannels []string `json:"sendToChannels"` + } `json:"dataSources"` + Channels []struct { + Endpoint string `json:"endpoint"` + ID string `json:"id"` + Protocol string `json:"protocol"` + } `json:"channels"` + Extensionconfigurations struct { + Containerinsights []struct { + ID string 
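Note: the `go.mod` change above promotes `github.com/ugorji/go` from an indirect to a direct dependency because the new `extension.go` msgpack-encodes its config request before writing it to mdsd's fluent socket. A minimal sketch of that wire format, assuming the same request shape and msgpack handle used by `getDataTypeToStreamIdMapping` (illustrative only, not part of this patch):

```go
package main

import (
	"encoding/json"
	"fmt"

	uuid "github.com/google/uuid"
	"github.com/ugorji/go/codec"
)

func main() {
	// The request is a plain JSON object...
	req := map[string]interface{}{
		"Request":   "AgentTaggedData",
		"RequestId": uuid.New().String(),
		"Tag":       "ContainerInsights",
		"Version":   "1",
	}
	jsonBytes, err := json.Marshal(req)
	if err != nil {
		panic(err)
	}
	// ...which is wrapped as a single msgpack string before being written
	// to the unix socket (/var/run/mdsd/default_fluent.socket by default).
	var payload []byte
	enc := codec.NewEncoderBytes(&payload, new(codec.MsgpackHandle))
	if err := enc.Encode(string(jsonBytes)); err != nil {
		panic(err)
	}
	fmt.Printf("% x\n", payload)
}
```

The response travels back the same way: a msgpack-wrapped JSON document that `extension.go` unmarshals into `AgentTaggedDataResponse`.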
diff --git a/source/plugins/go/src/ingestion_token_utils.go b/source/plugins/go/src/ingestion_token_utils.go
new file mode 100644
index 000000000..c96685042
--- /dev/null
+++ b/source/plugins/go/src/ingestion_token_utils.go
@@ -0,0 +1,516 @@
+package main
+
+import (
+	"encoding/json"
+	"errors"
+	"fmt"
+	"io/ioutil"
+	"net/http"
+	"net/url"
+	"os"
+	"regexp"
+	"strconv"
+	"strings"
+	"time"
+)
+
+const IMDSTokenPathForWindows = "c:/etc/imds-access-token/token" // only used on windows
+const AMCSAgentConfigAPIVersion = "2020-08-01-preview"
+const AMCSIngestionTokenAPIVersion = "2020-04-01-preview"
+const MaxRetries = 3
+
+var IMDSToken string
+var IMDSTokenExpiration int64
+
+var ConfigurationId string
+var ChannelId string
+
+var IngestionAuthToken string
+var IngestionAuthTokenExpiration int64
+
+type IMDSResponse struct {
+	AccessToken  string `json:"access_token"`
+	ClientID     string `json:"client_id"`
+	ExpiresIn    string `json:"expires_in"`
+	ExpiresOn    string `json:"expires_on"`
+	ExtExpiresIn string `json:"ext_expires_in"`
+	NotBefore    string `json:"not_before"`
+	Resource     string `json:"resource"`
+	TokenType    string `json:"token_type"`
+}
+
+type AgentConfiguration struct {
+	Configurations []struct {
+		Configurationid string `json:"configurationId"`
+		Etag            string `json:"eTag"`
+		Op              string `json:"op"`
+		Content         struct {
+			Datasources []struct {
+				Configuration struct {
+					Extensionname string `json:"extensionName"`
+				} `json:"configuration"`
+				ID      string `json:"id"`
+				Kind    string `json:"kind"`
+				Streams []struct {
+					Stream                string `json:"stream"`
+					Solution              string `json:"solution"`
+					Extensionoutputstream string `json:"extensionOutputStream"`
+				} `json:"streams"`
+				Sendtochannels []string `json:"sendToChannels"`
+			} `json:"dataSources"`
+			Channels []struct {
+				Endpoint string `json:"endpoint"`
+				ID       string `json:"id"`
+				Protocol string `json:"protocol"`
+			} `json:"channels"`
+			Extensionconfigurations struct {
+				Containerinsights []struct {
+					ID            string   `json:"id"`
+					Originids     []string `json:"originIds"`
+					Outputstreams struct {
+						LinuxPerfBlob                   string `json:"LINUX_PERF_BLOB"`
+						ContainerInventoryBlob          string `json:"CONTAINER_INVENTORY_BLOB"`
+						ContainerLogBlob                string `json:"CONTAINER_LOG_BLOB"`
+						ContainerinsightsContainerlogv2 string `json:"CONTAINERINSIGHTS_CONTAINERLOGV2"`
+						ContainerNodeInventoryBlob      string `json:"CONTAINER_NODE_INVENTORY_BLOB"`
+						KubeEventsBlob                  string `json:"KUBE_EVENTS_BLOB"`
+						KubeHealthBlob                  string `json:"KUBE_HEALTH_BLOB"`
+						KubeMonAgentEventsBlob          string `json:"KUBE_MON_AGENT_EVENTS_BLOB"`
+						KubeNodeInventoryBlob           string `json:"KUBE_NODE_INVENTORY_BLOB"`
+						KubePodInventoryBlob            string `json:"KUBE_POD_INVENTORY_BLOB"`
+						KubePvInventoryBlob             string `json:"KUBE_PV_INVENTORY_BLOB"`
+						KubeServicesBlob                string `json:"KUBE_SERVICES_BLOB"`
+						InsightsMetricsBlob             string `json:"INSIGHTS_METRICS_BLOB"`
+					} `json:"outputStreams"`
+				} `json:"ContainerInsights"`
+			} `json:"extensionConfigurations"`
+		} `json:"content"`
+	} `json:"configurations"`
+}
+
+type IngestionTokenResponse struct {
+	Configurationid    string `json:"configurationId"`
+	Ingestionauthtoken string `json:"ingestionAuthToken"`
+}
+
+func getAccessTokenFromIMDS() (string, int64, error) {
+	Log("Info getAccessTokenFromIMDS: start")
+	useIMDSTokenProxyEndPoint := os.Getenv("USE_IMDS_TOKEN_PROXY_END_POINT")
+	imdsAccessToken := ""
+	var responseBytes []byte
+	var err error
+
+	if useIMDSTokenProxyEndPoint != "" && strings.Compare(strings.ToLower(useIMDSTokenProxyEndPoint), "true") == 0 {
+		Log("Info Reading IMDS Access Token from IMDS Token proxy endpoint")
+		mcsEndpoint := os.Getenv("MCS_ENDPOINT")
+		msi_endpoint_string := fmt.Sprintf("http://169.254.169.254/metadata/identity/oauth2/token?api-version=2018-02-01&resource=https://%s/", mcsEndpoint)
+		var msi_endpoint *url.URL
+		msi_endpoint, err := url.Parse(msi_endpoint_string)
+		if err != nil {
+			Log("getAccessTokenFromIMDS: Error creating IMDS endpoint URL: %s", err.Error())
+			return imdsAccessToken, 0, err
+		}
+		req, err := http.NewRequest("GET", msi_endpoint.String(), nil)
+		if err != nil {
+			Log("getAccessTokenFromIMDS: Error creating HTTP request: %s", err.Error())
+			return imdsAccessToken, 0, err
+		}
+		req.Header.Add("Metadata", "true")
+
+		// The IMDS endpoint is not routable and requests to it don't go through the proxy, hence a dedicated HTTP client
+		httpClient := &http.Client{Timeout: 30 * time.Second}
+
+		// Call managed services for Azure resources token endpoint
+		var resp *http.Response = nil
+		IsSuccess := false
+		for retryCount := 0; retryCount < MaxRetries; retryCount++ {
+			resp, err = httpClient.Do(req)
+			if err != nil {
+				message := fmt.Sprintf("getAccessTokenFromIMDS: Error calling token endpoint: %s, retryCount: %d", err.Error(), retryCount)
+				Log(message)
+				SendException(message)
+				continue
+			}
+
+			if resp != nil && resp.Body != nil {
+				defer resp.Body.Close()
+			}
+
+			Log("getAccessTokenFromIMDS: IMDS Response Status: %d, retryCount: %d", resp.StatusCode, retryCount)
+			if IsRetriableError(resp.StatusCode) {
+				message := fmt.Sprintf("getAccessTokenFromIMDS: IMDS Request failed with an error code: %d, retryCount: %d", resp.StatusCode, retryCount)
+				Log(message)
+				retryDelay := time.Duration((retryCount+1)*100) * time.Millisecond
+				if resp.StatusCode == 429 {
+					if resp != nil && resp.Header.Get("Retry-After") != "" {
+						after, err := strconv.ParseInt(resp.Header.Get("Retry-After"), 10, 64)
+						if err == nil && after > 0 { // honor Retry-After only when it parsed successfully
+							retryDelay = time.Duration(after) * time.Second
+						}
+					}
+				}
+				time.Sleep(retryDelay)
+				continue
+			} else if resp.StatusCode != 200 {
+				message := fmt.Sprintf("getAccessTokenFromIMDS: IMDS Request failed with nonretryable error code: %d, retryCount: %d", resp.StatusCode, retryCount)
+				Log(message)
+				SendException(message)
+				return imdsAccessToken, 0, err
+			}
+			IsSuccess = true
+			break // call succeeded, don't retry any more
+		}
+		if !IsSuccess || resp == nil || resp.Body == nil {
+			Log("getAccessTokenFromIMDS: IMDS Request ran out of retries")
+			return imdsAccessToken, 0, err
+		}
+
+		// Pull out response body
+		responseBytes, err = ioutil.ReadAll(resp.Body)
+		if err != nil {
+			Log("getAccessTokenFromIMDS: Error reading response body: %s", err.Error())
+			return imdsAccessToken, 0, err
+		}
+
+	} else {
+		Log("Info Reading IMDS Access Token from file : %s", IMDSTokenPathForWindows)
+		if _, err = os.Stat(IMDSTokenPathForWindows); os.IsNotExist(err) {
+			Log("getAccessTokenFromIMDS: IMDS token file doesn't exist: %s", err.Error())
+			return imdsAccessToken, 0, err
+		}
+		// retry in case the token file is read while it is still being written
+		for retryCount := 0; retryCount < MaxRetries; retryCount++ {
+			responseBytes, err = ioutil.ReadFile(IMDSTokenPathForWindows)
+			if err != nil {
+				Log("getAccessTokenFromIMDS: Could not read IMDS token from file: %s, retryCount: %d", err.Error(), retryCount)
+				time.Sleep(time.Duration((retryCount+1)*100) * time.Millisecond)
+				continue
+			}
+			break
+		}
+	}
+
+	if responseBytes == nil {
+		Log("getAccessTokenFromIMDS: Error responseBytes is nil")
+		return imdsAccessToken, 0, err
+	}
+
+	// Unmarshall response body into struct
+	var imdsResponse IMDSResponse
+	err = json.Unmarshal(responseBytes, &imdsResponse)
+	if err != nil {
+		Log("getAccessTokenFromIMDS: Error unmarshalling the response: %s", err.Error())
+		return imdsAccessToken, 0, err
+	}
+	imdsAccessToken = imdsResponse.AccessToken
+
+	expiration, err := strconv.ParseInt(imdsResponse.ExpiresOn, 10, 64)
+	if err != nil {
+		Log("getAccessTokenFromIMDS: Error parsing ExpiresOn field from IMDS response: %s", err.Error())
+		return imdsAccessToken, 0, err
+	}
+	Log("Info getAccessTokenFromIMDS: end")
+	return imdsAccessToken, expiration, nil
+}
+
+func getAgentConfiguration(imdsAccessToken string) (configurationId string, channelId string, err error) {
+	Log("Info getAgentConfiguration: start")
+	configurationId = ""
+	channelId = ""
+	var amcs_endpoint *url.URL
+	osType := os.Getenv("OS_TYPE")
+	resourceId := os.Getenv("AKS_RESOURCE_ID")
+	resourceRegion := os.Getenv("AKS_REGION")
+	mcsEndpoint := os.Getenv("MCS_ENDPOINT")
+	amcs_endpoint_string := fmt.Sprintf("https://%s.handler.control.%s%s/agentConfigurations?platform=%s&api-version=%s", resourceRegion, mcsEndpoint, resourceId, osType, AMCSAgentConfigAPIVersion)
+	amcs_endpoint, err = url.Parse(amcs_endpoint_string)
+	if err != nil {
+		Log("getAgentConfiguration: Error creating AMCS endpoint URL: %s", err.Error())
+		return configurationId, channelId, err
+	}
+
+	var bearer = "Bearer " + imdsAccessToken
+	// Create a new request using http
+	req, err := http.NewRequest("GET", amcs_endpoint.String(), nil)
+	if err != nil {
+		message := fmt.Sprintf("getAgentConfiguration: Error creating HTTP request for AMCS endpoint: %s", err.Error())
+		Log(message)
+		return configurationId, channelId, err
+	}
+	req.Header.Set("Authorization", bearer)
+
+	var resp *http.Response = nil
+	IsSuccess := false
+	for retryCount := 0; retryCount < MaxRetries; retryCount++ {
+		resp, err = HTTPClient.Do(req)
+		if err != nil {
+			message := fmt.Sprintf("getAgentConfiguration: Error calling AMCS endpoint: %s", err.Error())
+			Log(message)
+			SendException(message)
+			continue
+		}
+		if resp != nil && resp.Body != nil {
+			defer resp.Body.Close()
+		}
+		Log("getAgentConfiguration Response Status: %d", resp.StatusCode)
+		if IsRetriableError(resp.StatusCode) {
+			message := fmt.Sprintf("getAgentConfiguration: Request failed with an error code: %d, retryCount: %d", resp.StatusCode, retryCount)
+			Log(message)
+			retryDelay := time.Duration((retryCount+1)*100) * time.Millisecond
+			if resp.StatusCode == 429 {
+				if resp != nil && resp.Header.Get("Retry-After") != "" {
+					after, err := strconv.ParseInt(resp.Header.Get("Retry-After"), 10, 64)
+					if err == nil && after > 0 { // honor Retry-After only when it parsed successfully
+						retryDelay = time.Duration(after) * time.Second
+					}
+				}
+			}
+			time.Sleep(retryDelay)
+			continue
+		} else if resp.StatusCode != 200 {
+			message := fmt.Sprintf("getAgentConfiguration: Request failed with nonretryable error code: %d, retryCount: %d", resp.StatusCode, retryCount)
+			Log(message)
+			SendException(message)
+			return configurationId, channelId, err
+		}
+		IsSuccess = true
+		break // call succeeded, don't retry any more
+	}
+	if !IsSuccess || resp == nil || resp.Body == nil {
+		message := "getAgentConfiguration: Request ran out of retries"
+		Log(message)
+		SendException(message)
+		return configurationId, channelId, err
+	}
+	responseBytes, err := ioutil.ReadAll(resp.Body)
+	if err != nil {
+		Log("getAgentConfiguration: Error reading response body from AMCS API call: %s", err.Error())
+		return configurationId, channelId, err
+	}
+
+	// Unmarshall response body into struct
+	var agentConfiguration AgentConfiguration
+	err = json.Unmarshal(responseBytes, &agentConfiguration)
+	if err != nil {
+		message := fmt.Sprintf("getAgentConfiguration: Error unmarshalling the response: %s", err.Error())
+		Log(message)
+		SendException(message)
+		return configurationId, channelId, err
+	}
+
+	if len(agentConfiguration.Configurations) == 0 {
+		message := "getAgentConfiguration: Received empty agentConfiguration.Configurations array"
+		Log(message)
+		SendException(message)
+		return configurationId, channelId, err
+	}
+
+	if len(agentConfiguration.Configurations[0].Content.Channels) == 0 {
+		message := "getAgentConfiguration: Received empty agentConfiguration.Configurations[0].Content.Channels"
+		Log(message)
+		SendException(message)
+		return configurationId, channelId, err
+	}
+
+	configurationId = agentConfiguration.Configurations[0].Configurationid
+	channelId = agentConfiguration.Configurations[0].Content.Channels[0].ID
+
+	Log("getAgentConfiguration: obtained configurationId: %s, channelId: %s", configurationId, channelId)
+	Log("Info getAgentConfiguration: end")
+
+	return configurationId, channelId, nil
+}
+
+func getIngestionAuthToken(imdsAccessToken string, configurationId string, channelId string) (ingestionAuthToken string, refreshInterval int64, err error) {
+	Log("Info getIngestionAuthToken: start")
+	ingestionAuthToken = ""
+	refreshInterval = 0
+	var amcs_endpoint *url.URL
+	osType := os.Getenv("OS_TYPE")
+	resourceId := os.Getenv("AKS_RESOURCE_ID")
+	resourceRegion := os.Getenv("AKS_REGION")
+	mcsEndpoint := os.Getenv("MCS_ENDPOINT")
+	amcs_endpoint_string := fmt.Sprintf("https://%s.handler.control.%s%s/agentConfigurations/%s/channels/%s/issueIngestionToken?platform=%s&api-version=%s", resourceRegion, mcsEndpoint, resourceId, configurationId, channelId, osType, AMCSIngestionTokenAPIVersion)
+	amcs_endpoint, err = url.Parse(amcs_endpoint_string)
+	if err != nil {
+		Log("getIngestionAuthToken: Error creating AMCS endpoint URL: %s", err.Error())
+		return ingestionAuthToken, refreshInterval, err
+	}
+
+	var bearer = "Bearer " + imdsAccessToken
+	// Create a new request using http
+	req, err := http.NewRequest("GET", amcs_endpoint.String(), nil)
+	if err != nil {
+		Log("getIngestionAuthToken: Error creating HTTP request for AMCS endpoint: %s", err.Error())
+		return ingestionAuthToken, refreshInterval, err
+	}
+
+	// add authorization header to the req
+	req.Header.Add("Authorization", bearer)
+
+	var resp *http.Response = nil
+	IsSuccess := false
+	for retryCount := 0; retryCount < MaxRetries; retryCount++ {
+		// Call managed services for Azure resources token endpoint
+		resp, err = HTTPClient.Do(req)
+		if err != nil {
+			message := fmt.Sprintf("getIngestionAuthToken: Error calling AMCS endpoint for ingestion auth token: %s", err.Error())
+			Log(message)
+			SendException(message)
+			resp = nil
+			continue
+		}
+
+		if resp != nil && resp.Body != nil {
+			defer resp.Body.Close()
+		}
+
+		Log("getIngestionAuthToken Response Status: %d", resp.StatusCode)
+		if IsRetriableError(resp.StatusCode) {
+			message := fmt.Sprintf("getIngestionAuthToken: Request failed with an error code: %d, retryCount: %d", resp.StatusCode, retryCount)
+			Log(message)
+			retryDelay := time.Duration((retryCount+1)*100) * time.Millisecond
+			if resp.StatusCode == 429 {
+				if resp != nil && resp.Header.Get("Retry-After") != "" {
+					after, err := strconv.ParseInt(resp.Header.Get("Retry-After"), 10, 64)
+					if err == nil && after > 0 { // honor Retry-After only when it parsed successfully
+						retryDelay = time.Duration(after) * time.Second
+					}
+				}
+			}
+			time.Sleep(retryDelay)
+			continue
+		} else if resp.StatusCode != 200 {
+			message := fmt.Sprintf("getIngestionAuthToken: Request failed with nonretryable error code: %d, retryCount: %d", resp.StatusCode, retryCount)
+			Log(message)
+			SendException(message)
+			return ingestionAuthToken, refreshInterval, err
+		}
+		IsSuccess = true
+		break
+	}
+
+	if !IsSuccess || resp == nil || resp.Body == nil {
+		message := "getIngestionAuthToken: ran out of retries calling AMCS for ingestion token"
+		Log(message)
+		SendException(message)
+		return ingestionAuthToken, refreshInterval, err
+	}
+
+	// Pull out response body
+	responseBytes, err := ioutil.ReadAll(resp.Body)
+	if err != nil {
+		Log("getIngestionAuthToken: Error reading response body from AMCS Ingestion API call : %s", err.Error())
+		return ingestionAuthToken, refreshInterval, err
+	}
+
+	// Unmarshall response body into struct
+	var ingestionTokenResponse IngestionTokenResponse
+	err = json.Unmarshal(responseBytes, &ingestionTokenResponse)
+	if err != nil {
+		Log("getIngestionAuthToken: Error unmarshalling the response: %s", err.Error())
+		return ingestionAuthToken, refreshInterval, err
+	}
+
+	ingestionAuthToken = ingestionTokenResponse.Ingestionauthtoken
+
+	refreshInterval, err = getTokenRefreshIntervalFromAmcsResponse(resp.Header)
+	if err != nil {
+		Log("getIngestionAuthToken: Error failed to parse max-age response header")
+		return ingestionAuthToken, refreshInterval, err
+	}
+	Log("getIngestionAuthToken: refresh interval %d seconds", refreshInterval)
+
+	Log("Info getIngestionAuthToken: end")
+	return ingestionAuthToken, refreshInterval, nil
+}
+
+var cacheControlHeaderRegex = regexp.MustCompile(`max-age=([0-9]+)`)
+
+func getTokenRefreshIntervalFromAmcsResponse(header http.Header) (refreshInterval int64, err error) {
+	cacheControlHeader, valueInMap := header["Cache-Control"]
+	if !valueInMap {
+		return 0, errors.New("getTokenRefreshIntervalFromAmcsResponse: Cache-Control not in passed header")
+	}
+
+	for _, entry := range cacheControlHeader {
+		match := cacheControlHeaderRegex.FindStringSubmatch(entry)
+		if len(match) == 2 {
+			interval := 0
+			interval, err = strconv.Atoi(match[1])
+			if err != nil {
+				Log("getTokenRefreshIntervalFromAmcsResponse: error getting timeout from auth token. Header: " + strings.Join(cacheControlHeader, ","))
+				return 0, err
+			}
+			refreshInterval = int64(interval)
+			return refreshInterval, nil
+		}
+	}
+
+	return 0, errors.New("getTokenRefreshIntervalFromAmcsResponse: didn't find max-age in response header")
+}
+
+func refreshIngestionAuthToken() {
+	for ; true; <-IngestionAuthTokenRefreshTicker.C {
+		if IMDSToken == "" || IMDSTokenExpiration <= (time.Now().Unix()+60*60) { // token is valid for 24 hrs; refresh it 1 hr before expiry
+			imdsToken, imdsTokenExpiry, err := getAccessTokenFromIMDS()
+			if err != nil {
+				message := fmt.Sprintf("refreshIngestionAuthToken: Error on getAccessTokenFromIMDS %s \n", err.Error())
+				Log(message)
+				SendException(message)
+			} else {
+				IMDSToken = imdsToken
+				IMDSTokenExpiration = imdsTokenExpiry
+			}
+		}
+		if IMDSToken == "" {
+			message := "refreshIngestionAuthToken: IMDSToken is empty"
+			Log(message)
+			SendException(message)
+			continue
+		}
+		var err error
+		// ignore agent configuration expiry: the configuration and channel IDs never change without an agent restart
+		if ConfigurationId == "" || ChannelId == "" {
+			ConfigurationId, ChannelId, err = getAgentConfiguration(IMDSToken)
+			if err != nil {
+				message := fmt.Sprintf("refreshIngestionAuthToken: Error getAgentConfiguration %s \n", err.Error())
+				Log(message)
+				SendException(message)
+				continue
+			}
+		}
+		if IMDSToken == "" || ConfigurationId == "" || ChannelId == "" {
+			message := "refreshIngestionAuthToken: IMDSToken or ConfigurationId or ChannelId empty"
+			Log(message)
+			SendException(message)
+			continue
+		}
+		ingestionAuthToken, refreshIntervalInSeconds, err := getIngestionAuthToken(IMDSToken, ConfigurationId, ChannelId)
+		if err != nil {
+			message := fmt.Sprintf("refreshIngestionAuthToken: Error getIngestionAuthToken %s \n", err.Error())
+			Log(message)
+			SendException(message)
+			continue
+		}
+		IngestionAuthTokenUpdateMutex.Lock()
+		ODSIngestionAuthToken = ingestionAuthToken
+		IngestionAuthTokenUpdateMutex.Unlock()
+		if refreshIntervalInSeconds > 0 && refreshIntervalInSeconds != defaultIngestionAuthTokenRefreshIntervalSeconds {
+			//TODO - use Ticker.Reset, which is cleaner, once the Go version is upgraded to 1.15 or later, rather than Stop() and NewTicker
+			//IngestionAuthTokenRefreshTicker.Reset(time.Second * time.Duration(refreshIntervalInSeconds))
+			IngestionAuthTokenRefreshTicker.Stop()
+			IngestionAuthTokenRefreshTicker = time.NewTicker(time.Second * time.Duration(refreshIntervalInSeconds))
+		}
+	}
+}
+
+func IsRetriableError(httpStatusCode int) bool {
+	retryableStatusCodes := [5]int{408, 429, 502, 503, 504}
+	for _, code := range retryableStatusCodes {
+		if code == httpStatusCode {
+			return true
+		}
+	}
+	return false
+}
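Note: the three HTTP call sites above (IMDS, agent configuration, ingestion token) share one backoff pattern: a linear delay of (retryCount+1)*100 ms, overridden by the `Retry-After` header on a 429, where the header is honored only when it parses successfully (`err == nil`). Distilled into a standalone helper for clarity, this is an editorial sketch rather than code from this patch:

```go
package main

import (
	"net/http"
	"strconv"
	"time"
)

// retryDelayFor computes the wait before the next attempt: linear
// backoff (100ms, 200ms, ...) by default, overridden by a valid
// Retry-After header when the service answers 429.
func retryDelayFor(resp *http.Response, retryCount int) time.Duration {
	delay := time.Duration((retryCount+1)*100) * time.Millisecond
	if resp != nil && resp.StatusCode == 429 {
		if ra := resp.Header.Get("Retry-After"); ra != "" {
			// Use the parsed value only when parsing succeeded.
			if after, err := strconv.ParseInt(ra, 10, 64); err == nil && after > 0 {
				delay = time.Duration(after) * time.Second
			}
		}
	}
	return delay
}

func main() {
	resp := &http.Response{StatusCode: 429, Header: http.Header{"Retry-After": []string{"5"}}}
	_ = retryDelayFor(resp, 0) // 5s instead of the default 100ms
}
```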
diff --git a/source/plugins/go/src/oms.go b/source/plugins/go/src/oms.go
index 217ba1efc..0761ef664 100644
--- a/source/plugins/go/src/oms.go
+++ b/source/plugins/go/src/oms.go
@@ -22,6 +22,7 @@ import (
 	"github.com/tinylib/msgp/msgp"
 	lumberjack "gopkg.in/natefinch/lumberjack.v2"
 
+	"Docker-Provider/source/plugins/go/src/extension"
 	"github.com/Azure/azure-kusto-go/kusto/ingest"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 
@@ -88,6 +89,7 @@ const IPName = 
"ContainerInsights" const defaultContainerInventoryRefreshInterval = 60 const kubeMonAgentConfigEventFlushInterval = 60 +const defaultIngestionAuthTokenRefreshIntervalSeconds = 3600 //Eventsource name in mdsd const MdsdContainerLogSourceName = "ContainerLogSource" @@ -106,6 +108,11 @@ const ContainerLogsV1Route = "v1" //container logs schema (v2=ContainerLogsV2 table in LA, anything else ContainerLogs table in LA. This is applicable only if Container logs route is NOT ADX) const ContainerLogV2SchemaVersion = "v2" +//env variable for AAD MSI Auth mode +const AADMSIAuthMode = "AAD_MSI_AUTH_MODE" + +// Tag prefix of mdsd output streamid for AMA in MSI auth mode +const MdsdOutputStreamIdTagPrefix = "dcr-" //env variable to container type const ContainerTypeEnv = "CONTAINER_TYPE" @@ -168,7 +175,9 @@ var ( // flag to check if its Windows OS IsWindows bool // container type - ContainerType string + ContainerType string + // flag to check whether LA AAD MSI Auth Enabled or not + IsAADMSIAuthMode bool ) var ( @@ -194,6 +203,10 @@ var ( EventHashUpdateMutex = &sync.Mutex{} // parent context used by ADX uploader ParentContext = context.Background() + // IngestionAuthTokenUpdateMutex read and write mutex access for ODSIngestionAuthToken + IngestionAuthTokenUpdateMutex = &sync.Mutex{} + // ODSIngestionAuthToken for windows agent AAD MSI Auth + ODSIngestionAuthToken string ) var ( @@ -201,6 +214,8 @@ var ( ContainerImageNameRefreshTicker *time.Ticker // KubeMonAgentConfigEventsSendTicker to send config events every hour KubeMonAgentConfigEventsSendTicker *time.Ticker + // IngestionAuthTokenRefreshTicker to refresh ingestion token + IngestionAuthTokenRefreshTicker *time.Ticker ) var ( @@ -702,7 +717,11 @@ func flushKubeMonAgentEventRecords() { } } } - if (IsWindows == false && len(msgPackEntries) > 0) { //for linux, mdsd route + if (IsWindows == false && len(msgPackEntries) > 0) { //for linux, mdsd route + if IsAADMSIAuthMode == true && strings.HasPrefix(MdsdKubeMonAgentEventsTagName, MdsdOutputStreamIdTagPrefix) == false { + Log("Info::mdsd::obtaining output stream id for data type: %s", KubeMonAgentEventDataType) + MdsdKubeMonAgentEventsTagName = extension.GetInstance(FLBLogger, ContainerType).GetOutputStreamId(KubeMonAgentEventDataType) + } Log("Info::mdsd:: using mdsdsource name for KubeMonAgentEvents: %s", MdsdKubeMonAgentEventsTagName) msgpBytes := convertMsgPackEntriesToMsgpBytes(MdsdKubeMonAgentEventsTagName, msgPackEntries) if MdsdKubeMonMsgpUnixSocketClient == nil { @@ -760,6 +779,16 @@ func flushKubeMonAgentEventRecords() { req.Header.Set("x-ms-AzureResourceId", ResourceID) } + if IsAADMSIAuthMode == true { + IngestionAuthTokenUpdateMutex.Lock() + ingestionAuthToken := ODSIngestionAuthToken + IngestionAuthTokenUpdateMutex.Unlock() + if ingestionAuthToken == "" { + Log("Error::ODS Ingestion Auth Token is empty. 
Please check error log.") + } + req.Header.Set("Authorization", "Bearer "+ingestionAuthToken) + } + resp, err := HTTPClient.Do(req) elapsed = time.Since(start) @@ -904,7 +933,11 @@ func PostTelegrafMetricsToLA(telegrafRecords []map[interface{}]interface{}) int } } } - if (len(msgPackEntries) > 0) { + if (len(msgPackEntries) > 0) { + if IsAADMSIAuthMode == true && (strings.HasPrefix(MdsdInsightsMetricsTagName, MdsdOutputStreamIdTagPrefix) == false) { + Log("Info::mdsd::obtaining output stream id for InsightsMetricsDataType since Log Analytics AAD MSI Auth Enabled") + MdsdInsightsMetricsTagName = extension.GetInstance(FLBLogger, ContainerType).GetOutputStreamId(InsightsMetricsDataType) + } msgpBytes := convertMsgPackEntriesToMsgpBytes(MdsdInsightsMetricsTagName, msgPackEntries) if MdsdInsightsMetricsMsgpUnixSocketClient == nil { Log("Error::mdsd::mdsd connection does not exist. re-connecting ...") @@ -979,6 +1012,18 @@ func PostTelegrafMetricsToLA(telegrafRecords []map[interface{}]interface{}) int if ResourceCentric == true { req.Header.Set("x-ms-AzureResourceId", ResourceID) } + if IsAADMSIAuthMode == true { + IngestionAuthTokenUpdateMutex.Lock() + ingestionAuthToken := ODSIngestionAuthToken + IngestionAuthTokenUpdateMutex.Unlock() + if ingestionAuthToken == "" { + message := "Error::ODS Ingestion Auth Token is empty. Please check error log." + Log(message) + return output.FLB_RETRY + } + // add authorization header to the req + req.Header.Set("Authorization", "Bearer "+ingestionAuthToken) + } start := time.Now() resp, err := HTTPClient.Do(req) @@ -1184,6 +1229,16 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { if len(msgPackEntries) > 0 && ContainerLogsRouteV2 == true { //flush to mdsd + if IsAADMSIAuthMode == true && strings.HasPrefix(MdsdContainerLogTagName, MdsdOutputStreamIdTagPrefix) == false { + Log("Info::mdsd::obtaining output stream id") + if ContainerLogSchemaV2 == true { + MdsdContainerLogTagName = extension.GetInstance(FLBLogger, ContainerType).GetOutputStreamId(ContainerLogV2DataType) + } else { + MdsdContainerLogTagName = extension.GetInstance(FLBLogger, ContainerType).GetOutputStreamId(ContainerLogDataType) + } + Log("Info::mdsd:: using mdsdsource name: %s", MdsdContainerLogTagName) + } + fluentForward := MsgPackForward{ Tag: MdsdContainerLogTagName, Entries: msgPackEntries, @@ -1343,6 +1398,18 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { req.Header.Set("x-ms-AzureResourceId", ResourceID) } + if IsAADMSIAuthMode == true { + IngestionAuthTokenUpdateMutex.Lock() + ingestionAuthToken := ODSIngestionAuthToken + IngestionAuthTokenUpdateMutex.Unlock() + if ingestionAuthToken == "" { + Log("Error::ODS Ingestion Auth Token is empty. 
Please check error log.") + return output.FLB_RETRY + } + // add authorization header to the req + req.Header.Set("Authorization", "Bearer "+ingestionAuthToken) + } + resp, err := HTTPClient.Do(req) elapsed = time.Since(start) @@ -1439,8 +1506,7 @@ func GetContainerIDK8sNamespacePodNameFromFileName(filename string) (string, str } // InitializePlugin reads and populates plugin configuration -func InitializePlugin(pluginConfPath string, agentVersion string) { - +func InitializePlugin(pluginConfPath string, agentVersion string) { go func() { isTest := os.Getenv("ISTEST") if strings.Compare(strings.ToLower(strings.TrimSpace(isTest)), "true") == 0 { @@ -1541,6 +1607,11 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { } Log("OMSEndpoint %s", OMSEndpoint) + IsAADMSIAuthMode = false + if strings.Compare(strings.ToLower(os.Getenv(AADMSIAuthMode)), "true") == 0 { + IsAADMSIAuthMode = true + Log("AAD MSI Auth Mode Configured") + } ResourceID = os.Getenv(envAKSResourceID) if len(ResourceID) > 0 { @@ -1712,5 +1783,11 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { } MdsdInsightsMetricsTagName = MdsdInsightsMetricsSourceName - MdsdKubeMonAgentEventsTagName = MdsdKubeMonAgentEventsSourceName -} \ No newline at end of file + MdsdKubeMonAgentEventsTagName = MdsdKubeMonAgentEventsSourceName + Log("ContainerLogsRouteADX: %v, IsWindows: %v, IsAADMSIAuthMode = %v \n", ContainerLogsRouteADX, IsWindows, IsAADMSIAuthMode) + if !ContainerLogsRouteADX && IsWindows && IsAADMSIAuthMode { + Log("defaultIngestionAuthTokenRefreshIntervalSeconds = %d \n", defaultIngestionAuthTokenRefreshIntervalSeconds) + IngestionAuthTokenRefreshTicker = time.NewTicker(time.Second * time.Duration(defaultIngestionAuthTokenRefreshIntervalSeconds)) + go refreshIngestionAuthToken() + } +} diff --git a/source/plugins/go/src/utils.go b/source/plugins/go/src/utils.go index 3fe5c6d0e..02d30607e 100644 --- a/source/plugins/go/src/utils.go +++ b/source/plugins/go/src/utils.go @@ -63,27 +63,32 @@ func ReadConfiguration(filename string) (map[string]string, error) { // CreateHTTPClient used to create the client for sending post requests to OMSEndpoint func CreateHTTPClient() { - certFilePath := PluginConfiguration["cert_file_path"] - keyFilePath := PluginConfiguration["key_file_path"] - if IsWindows == false { - certFilePath = fmt.Sprintf(certFilePath, WorkspaceID) - keyFilePath = fmt.Sprintf(keyFilePath, WorkspaceID) - } - cert, err := tls.LoadX509KeyPair(certFilePath, keyFilePath) - if err != nil { - message := fmt.Sprintf("Error when loading cert %s", err.Error()) - SendException(message) - time.Sleep(30 * time.Second) - Log(message) - log.Fatalf("Error when loading cert %s", err.Error()) - } + var transport *http.Transport + if IsAADMSIAuthMode { + transport = &http.Transport{} + } else { + certFilePath := PluginConfiguration["cert_file_path"] + keyFilePath := PluginConfiguration["key_file_path"] + if IsWindows == false { + certFilePath = fmt.Sprintf(certFilePath, WorkspaceID) + keyFilePath = fmt.Sprintf(keyFilePath, WorkspaceID) + } + cert, err := tls.LoadX509KeyPair(certFilePath, keyFilePath) + if err != nil { + message := fmt.Sprintf("Error when loading cert %s", err.Error()) + SendException(message) + time.Sleep(30 * time.Second) + Log(message) + log.Fatalf("Error when loading cert %s", err.Error()) + } - tlsConfig := &tls.Config{ - Certificates: []tls.Certificate{cert}, - } + tlsConfig := &tls.Config{ + Certificates: []tls.Certificate{cert}, + } - tlsConfig.BuildNameToCertificate() - transport := 
&http.Transport{TLSClientConfig: tlsConfig} + tlsConfig.BuildNameToCertificate() + transport = &http.Transport{TLSClientConfig: tlsConfig} + } // set the proxy if the proxy configured if ProxyEndpoint != "" { proxyEndpointUrl, err := url.Parse(ProxyEndpoint) diff --git a/source/plugins/ruby/ApplicationInsightsUtility.rb b/source/plugins/ruby/ApplicationInsightsUtility.rb index 31f9503cd..eaa1d903d 100644 --- a/source/plugins/ruby/ApplicationInsightsUtility.rb +++ b/source/plugins/ruby/ApplicationInsightsUtility.rb @@ -21,6 +21,8 @@ class ApplicationInsightsUtility @@EnvApplicationInsightsEndpoint = "APPLICATIONINSIGHTS_ENDPOINT" @@EnvControllerType = "CONTROLLER_TYPE" @@EnvContainerRuntime = "CONTAINER_RUNTIME" + @@EnvAADMSIAuthMode = "AAD_MSI_AUTH_MODE" + @@isWindows = false @@hostName = (OMS::Common.get_hostname) @@os_type = ENV["OS_TYPE"] @@ -82,7 +84,12 @@ def initializeUtility() isProxyConfigured = false $log.info("proxy is not configured") end - + aadAuthMSIMode = ENV[@@EnvAADMSIAuthMode] + if !aadAuthMSIMode.nil? && !aadAuthMSIMode.empty? && aadAuthMSIMode.downcase == "true".downcase + @@CustomProperties["aadAuthMSIMode"] = "true" + else + @@CustomProperties["aadAuthMSIMode"] = "false" + end #Check if telemetry is turned off telemetryOffSwitch = ENV["DISABLE_TELEMETRY"] if telemetryOffSwitch && !telemetryOffSwitch.nil? && !telemetryOffSwitch.empty? && telemetryOffSwitch.downcase == "true".downcase diff --git a/source/plugins/ruby/CustomMetricsUtils.rb b/source/plugins/ruby/CustomMetricsUtils.rb index 220313e6b..fd9290b78 100644 --- a/source/plugins/ruby/CustomMetricsUtils.rb +++ b/source/plugins/ruby/CustomMetricsUtils.rb @@ -13,8 +13,8 @@ def check_custom_metrics_availability if aks_region.to_s.empty? || aks_resource_id.to_s.empty? return false # This will also take care of AKS-Engine Scenario. AKS_REGION/AKS_RESOURCE_ID is not set for AKS-Engine. 
Only ACS_RESOURCE_NAME is set end - - return aks_cloud_environment.to_s.downcase == 'public' + + return aks_cloud_environment.to_s.downcase == 'azurepubliccloud' end end end \ No newline at end of file diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index c40d4c357..40fa80c14 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -106,5 +106,28 @@ class Constants #Pod Statuses POD_STATUS_TERMINATING = "Terminating" + # Data type ids + CONTAINER_INVENTORY_DATA_TYPE = "CONTAINER_INVENTORY_BLOB" + CONTAINER_NODE_INVENTORY_DATA_TYPE = "CONTAINER_NODE_INVENTORY_BLOB" + PERF_DATA_TYPE = "LINUX_PERF_BLOB" + INSIGHTS_METRICS_DATA_TYPE = "INSIGHTS_METRICS_BLOB" + KUBE_SERVICES_DATA_TYPE = "KUBE_SERVICES_BLOB" + KUBE_POD_INVENTORY_DATA_TYPE = "KUBE_POD_INVENTORY_BLOB" + KUBE_NODE_INVENTORY_DATA_TYPE = "KUBE_NODE_INVENTORY_BLOB" + KUBE_PV_INVENTORY_DATA_TYPE = "KUBE_PV_INVENTORY_BLOB" + KUBE_EVENTS_DATA_TYPE = "KUBE_EVENTS_BLOB" + KUBE_MON_AGENT_EVENTS_DATA_TYPE = "KUBE_MON_AGENT_EVENTS_BLOB" + KUBE_HEALTH_DATA_TYPE = "KUBE_HEALTH_BLOB" + CONTAINERLOGV2_DATA_TYPE = "CONTAINERINSIGHTS_CONTAINERLOGV2" + CONTAINERLOG_DATA_TYPE = "CONTAINER_LOG_BLOB" + + #ContainerInsights Extension (AMCS) + CI_EXTENSION_NAME = "ContainerInsights" + CI_EXTENSION_VERSION = "1" + #Current CI extension config size is ~5KB and going with 20KB to handle any future scenarios + CI_EXTENSION_CONFIG_MAX_BYTES = 20480 + ONEAGENT_FLUENT_SOCKET_NAME = "/var/run/mdsd/default_fluent.socket" + #Tag prefix for output stream + EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX = "dcr-" end diff --git a/source/plugins/ruby/filter_health_model_builder.rb b/source/plugins/ruby/filter_health_model_builder.rb index d491f17c2..9decda881 100644 --- a/source/plugins/ruby/filter_health_model_builder.rb +++ b/source/plugins/ruby/filter_health_model_builder.rb @@ -4,11 +4,12 @@ require 'fluent/plugin/filter' -module Fluent::Plugin +module Fluent::Plugin + require_relative 'extension_utils' require 'logger' require 'yajl/json_gem' Dir[File.join(__dir__, './health', '*.rb')].each { |file| require file } - + class FilterHealthModelBuilder < Filter include HealthModel @@ -22,7 +23,7 @@ class FilterHealthModelBuilder < Filter attr_reader :buffer, :model_builder, :health_model_definition, :monitor_factory, :state_finalizers, :monitor_set, :model_builder, :hierarchy_builder, :resources, :kube_api_down_handler, :provider, :reducer, :state, :generator, :telemetry - + @@cluster_id = KubernetesApiClient.getClusterId @@token_file_path = "/var/run/secrets/kubernetes.io/serviceaccount/token" @@cert_file_path = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" @@ -56,7 +57,7 @@ def initialize deserialized_state_info = @cluster_health_state.get_state @state.initialize_state(deserialized_state_info) end - + rescue => e ApplicationInsightsUtility.sendExceptionTelemetry(e, {"FeatureArea" => "Health"}) end @@ -90,7 +91,14 @@ def filter_stream(tag, es) end begin new_es = Fluent::MultiEventStream.new - time = Time.now + time = Time.now + if ExtensionUtils.isAADMSIAuthMode() + $log.info("filter_health_model_builder::enumerate: AAD AUTH MSI MODE") + if @rewrite_tag.nil? 
|| !@rewrite_tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @rewrite_tag = ExtensionUtils.getOutputStreamId(Constants::KUBE_HEALTH_DATA_TYPE) + end + $log.info("filter_health_model_builder::filter_stream: using tag -#{@rewrite_tag} @ #{Time.now.utc.iso8601}") + end if tag.start_with?("kubehealth.DaemonSet.Node") node_records = [] @@ -222,7 +230,7 @@ def filter_stream(tag, es) @log.info "after optimizing health signals all_monitors.size #{all_monitors.size}" - + # for each key in monitor.keys, # get the state from health_monitor_state # generate the record to send @@ -245,7 +253,7 @@ def filter_stream(tag, es) @cluster_new_state = new_state end end - end + end new_es.add(emit_time, record) } @@ -261,7 +269,7 @@ def filter_stream(tag, es) @telemetry.send # return an empty event stream, else the match will throw a NoMethodError return Fluent::MultiEventStream.new - elsif tag.start_with?(@rewrite_tag) + elsif tag.start_with?(@rewrite_tag) # this filter also acts as a pass through as we are rewriting the tag and emitting to the fluent stream es else @@ -273,6 +281,6 @@ def filter_stream(tag, es) @log.warn "Message: #{e.message} Backtrace: #{e.backtrace}" return nil end - end + end end end diff --git a/source/plugins/ruby/in_cadvisor_perf.rb b/source/plugins/ruby/in_cadvisor_perf.rb index b3f9bd08b..862e88e44 100644 --- a/source/plugins/ruby/in_cadvisor_perf.rb +++ b/source/plugins/ruby/in_cadvisor_perf.rb @@ -20,7 +20,8 @@ def initialize require_relative "CAdvisorMetricsAPIClient" require_relative "oms_common" require_relative "omslog" - require_relative "constants" + require_relative "constants" + require_relative "extension_utils" end config_param :run_interval, :time, :default => 60 @@ -61,13 +62,24 @@ def enumerate() batchTime = currentTime.utc.iso8601 @@istestvar = ENV["ISTEST"] begin - eventStream = Fluent::MultiEventStream.new + eventStream = Fluent::MultiEventStream.new insightsMetricsEventStream = Fluent::MultiEventStream.new metricData = CAdvisorMetricsAPIClient.getMetrics(winNode: nil, metricTime: batchTime ) - metricData.each do |record| - eventStream.add(time, record) if record - end - + metricData.each do |record| + eventStream.add(time, record) if record + end + + if ExtensionUtils.isAADMSIAuthMode() + $log.info("in_cadvisor_perf::enumerate: AAD AUTH MSI MODE") + if @tag.nil? || !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @tag = ExtensionUtils.getOutputStreamId(Constants::PERF_DATA_TYPE) + end + if @insightsmetricstag.nil? 
|| !@insightsmetricstag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @insightsmetricstag = ExtensionUtils.getOutputStreamId(Constants::INSIGHTS_METRICS_DATA_TYPE) + end + $log.info("in_cadvisor_perf::enumerate: using perf tag -#{@tag} @ #{Time.now.utc.iso8601}") + $log.info("in_cadvisor_perf::enumerate: using insightsmetrics tag -#{@insightsmetricstag} @ #{Time.now.utc.iso8601}") + end router.emit_stream(@tag, eventStream) if eventStream router.emit_stream(@mdmtag, eventStream) if eventStream router.emit_stream(@containerhealthtag, eventStream) if eventStream @@ -136,6 +148,6 @@ def run_periodic @mutex.lock end @mutex.unlock - end + end end # CAdvisor_Perf_Input end # module diff --git a/source/plugins/ruby/in_containerinventory.rb b/source/plugins/ruby/in_containerinventory.rb index eebf422d6..9fcb7ab90 100644 --- a/source/plugins/ruby/in_containerinventory.rb +++ b/source/plugins/ruby/in_containerinventory.rb @@ -7,17 +7,18 @@ module Fluent::Plugin class Container_Inventory_Input < Input Fluent::Plugin.register_input("containerinventory", self) - @@PluginName = "ContainerInventory" + @@PluginName = "ContainerInventory" def initialize super require "yajl/json_gem" - require "time" + require "time" require_relative "ContainerInventoryState" require_relative "ApplicationInsightsUtility" require_relative "omslog" require_relative "CAdvisorMetricsAPIClient" - require_relative "kubernetes_container_inventory" + require_relative "kubernetes_container_inventory" + require_relative "extension_utils" end config_param :run_interval, :time, :default => 60 @@ -47,21 +48,28 @@ def shutdown @thread.join super # This super must be at the end of shutdown method end - end - + end + def enumerate - currentTime = Time.now + currentTime = Time.now batchTime = currentTime.utc.iso8601 emitTime = Fluent::Engine.now containerInventory = Array.new eventStream = Fluent::MultiEventStream.new hostName = "" - $log.info("in_container_inventory::enumerate : Begin processing @ #{Time.now.utc.iso8601}") + $log.info("in_container_inventory::enumerate : Begin processing @ #{Time.now.utc.iso8601}") + if ExtensionUtils.isAADMSIAuthMode() + $log.info("in_container_inventory::enumerate: AAD AUTH MSI MODE") + if @tag.nil? || !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @tag = ExtensionUtils.getOutputStreamId(Constants::CONTAINER_INVENTORY_DATA_TYPE) + end + $log.info("in_container_inventory::enumerate: using tag -#{@tag} @ #{Time.now.utc.iso8601}") + end begin containerRuntimeEnv = ENV["CONTAINER_RUNTIME"] $log.info("in_container_inventory::enumerate : container runtime : #{containerRuntimeEnv}") clusterCollectEnvironmentVar = ENV["AZMON_CLUSTER_COLLECT_ENV_VAR"] - $log.info("in_container_inventory::enumerate : using cadvisor apis") + $log.info("in_container_inventory::enumerate : using cadvisor apis") containerIds = Array.new response = CAdvisorMetricsAPIClient.getPodsFromCAdvisor(winNode: nil) if !response.nil? && !response.body.nil? @@ -76,10 +84,10 @@ def enumerate end containerIds.push containerRecord["InstanceID"] containerInventory.push containerRecord - end + end end - end - end + end + end # Update the state for deleted containers deletedContainers = ContainerInventoryState.getDeletedContainers(containerIds) if !deletedContainers.nil? && !deletedContainers.empty? @@ -87,13 +95,13 @@ def enumerate container = ContainerInventoryState.readContainerState(deletedContainer) if !container.nil? 
container.each { |k, v| container[k] = v } - container["State"] = "Deleted" + container["State"] = "Deleted" KubernetesContainerInventory.deleteCGroupCacheEntryForDeletedContainer(container["InstanceID"]) containerInventory.push container end end - end - containerInventory.each do |record| + end + containerInventory.each do |record| eventStream.add(emitTime, record) if record end router.emit_stream(@tag, eventStream) if eventStream @@ -148,6 +156,6 @@ def run_periodic @mutex.lock end @mutex.unlock - end + end end # Container_Inventory_Input end # module diff --git a/source/plugins/ruby/in_kube_events.rb b/source/plugins/ruby/in_kube_events.rb index 6f65dab92..deeae6e14 100644 --- a/source/plugins/ruby/in_kube_events.rb +++ b/source/plugins/ruby/in_kube_events.rb @@ -3,7 +3,7 @@ require 'fluent/plugin/input' -module Fluent::Plugin +module Fluent::Plugin class Kube_Event_Input < Input Fluent::Plugin.register_input("kube_events", self) @@KubeEventsStateFile = "/var/opt/microsoft/docker-cimprov/state/KubeEventQueryState.yaml" @@ -18,6 +18,7 @@ def initialize require_relative "oms_common" require_relative "omslog" require_relative "ApplicationInsightsUtility" + require_relative "extension_utils" # refer tomlparser-agent-config for defaults # this configurable via configmap @@ -37,7 +38,7 @@ def configure(conf) super end - def start + def start if @run_interval super if !ENV["EVENTS_CHUNK_SIZE"].nil? && !ENV["EVENTS_CHUNK_SIZE"].empty? && ENV["EVENTS_CHUNK_SIZE"].to_i > 0 @@ -84,8 +85,15 @@ def enumerate batchTime = currentTime.utc.iso8601 eventQueryState = getEventQueryState newEventQueryState = [] - @eventsCount = 0 - + @eventsCount = 0 + + if ExtensionUtils.isAADMSIAuthMode() + $log.info("in_kube_events::enumerate: AAD AUTH MSI MODE") + if @tag.nil? || !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @tag = ExtensionUtils.getOutputStreamId(Constants::KUBE_EVENTS_DATA_TYPE) + end + $log.info("in_kube_events::enumerate: using kubeevents tag -#{@tag} @ #{Time.now.utc.iso8601}") + end # Initializing continuation token to nil continuationToken = nil $log.info("in_kube_events::enumerate : Getting events from Kube API @ #{Time.now.utc.iso8601}") @@ -131,8 +139,8 @@ def enumerate end # end enumerate def parse_and_emit_records(events, eventQueryState, newEventQueryState, batchTime = Time.utc.iso8601) - currentTime = Time.now - emitTime = Fluent::Engine.now + currentTime = Time.now + emitTime = Fluent::Engine.now @@istestvar = ENV["ISTEST"] begin eventStream = Fluent::MultiEventStream.new @@ -166,7 +174,7 @@ def parse_and_emit_records(events, eventQueryState, newEventQueryState, batchTim record["Count"] = items["count"] record["Computer"] = nodeName record["ClusterName"] = KubernetesApiClient.getClusterName - record["ClusterId"] = KubernetesApiClient.getClusterId + record["ClusterId"] = KubernetesApiClient.getClusterId eventStream.add(emitTime, record) if record @eventsCount += 1 end diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index ebfa903fd..bc62756a1 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -6,12 +6,12 @@ module Fluent::Plugin class Kube_nodeInventory_Input < Input Fluent::Plugin.register_input("kube_nodes", self) - + @@configMapMountPath = "/etc/config/settings/log-data-collection-settings" @@promConfigMountPath = "/etc/config/settings/prometheus-data-collection-settings" @@osmConfigMountPath = "/etc/config/osm-settings/osm-metric-collection-configuration" @@AzStackCloudFileName 
= "/etc/kubernetes/host/azurestackcloud.json" - + @@rsPromInterval = ENV["TELEMETRY_RS_PROM_INTERVAL"] @@rsPromFieldPassCount = ENV["TELEMETRY_RS_PROM_FIELDPASS_LENGTH"] @@ -35,11 +35,12 @@ def initialize require_relative "KubernetesApiClient" require_relative "ApplicationInsightsUtility" require_relative "oms_common" - require_relative "omslog" + require_relative "omslog" + require_relative "extension_utils" - @ContainerNodeInventoryTag = "oneagent.containerInsights.CONTAINER_NODE_INVENTORY_BLOB" - @insightsMetricsTag = "oneagent.containerInsights.INSIGHTS_METRICS_BLOB" - @MDMKubeNodeInventoryTag = "mdm.kubenodeinventory" + @ContainerNodeInventoryTag = "oneagent.containerInsights.CONTAINER_NODE_INVENTORY_BLOB" + @insightsMetricsTag = "oneagent.containerInsights.INSIGHTS_METRICS_BLOB" + @MDMKubeNodeInventoryTag = "mdm.kubenodeinventory" @kubeperfTag = "oneagent.containerInsights.LINUX_PERF_BLOB" # refer tomlparser-agent-config for the defaults @@ -60,7 +61,7 @@ def configure(conf) super end - def start + def start if @run_interval super if !ENV["NODES_CHUNK_SIZE"].nil? && !ENV["NODES_CHUNK_SIZE"].empty? && ENV["NODES_CHUNK_SIZE"].to_i > 0 @@ -109,8 +110,27 @@ def enumerate @nodesAPIE2ELatencyMs = 0 @nodeInventoryE2EProcessingLatencyMs = 0 - nodeInventoryStartTime = (Time.now.to_f * 1000).to_i - + nodeInventoryStartTime = (Time.now.to_f * 1000).to_i + + if ExtensionUtils.isAADMSIAuthMode() + $log.info("in_kube_nodes::enumerate: AAD AUTH MSI MODE") + if @kubeperfTag.nil? || !@kubeperfTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @kubeperfTag = ExtensionUtils.getOutputStreamId(Constants::PERF_DATA_TYPE) + end + if @insightsMetricsTag.nil? || !@insightsMetricsTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @insightsMetricsTag = ExtensionUtils.getOutputStreamId(Constants::INSIGHTS_METRICS_DATA_TYPE) + end + if @ContainerNodeInventoryTag.nil? || !@ContainerNodeInventoryTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @ContainerNodeInventoryTag = ExtensionUtils.getOutputStreamId(Constants::CONTAINER_NODE_INVENTORY_DATA_TYPE) + end + if @tag.nil? 
|| !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @tag = ExtensionUtils.getOutputStreamId(Constants::KUBE_NODE_INVENTORY_DATA_TYPE) + end + $log.info("in_kube_nodes::enumerate: using perf tag -#{@kubeperfTag} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_nodes::enumerate: using insightsmetrics tag -#{@insightsMetricsTag} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_nodes::enumerate: using containernodeinventory tag -#{@ContainerNodeInventoryTag} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_nodes::enumerate: using kubenodeinventory tag -#{@tag} @ #{Time.now.utc.iso8601}") + end nodesAPIChunkStartTime = (Time.now.to_f * 1000).to_i # Initializing continuation token to nil @@ -161,19 +181,19 @@ def enumerate def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) begin - currentTime = Time.now - emitTime = Fluent::Engine.now + currentTime = Time.now + emitTime = Fluent::Engine.now telemetrySent = false eventStream = Fluent::MultiEventStream.new containerNodeInventoryEventStream = Fluent::MultiEventStream.new insightsMetricsEventStream = Fluent::MultiEventStream.new - kubePerfEventStream = Fluent::MultiEventStream.new + kubePerfEventStream = Fluent::MultiEventStream.new @@istestvar = ENV["ISTEST"] #get node inventory nodeInventory["items"].each do |item| # node inventory nodeInventoryRecord = getNodeInventoryRecord(item, batchTime) - eventStream.add(emitTime, nodeInventoryRecord) if nodeInventoryRecord + eventStream.add(emitTime, nodeInventoryRecord) if nodeInventoryRecord if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && eventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE $log.info("in_kube_node::parse_and_emit_records: number of node inventory records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") router.emit_stream(@tag, eventStream) if eventStream @@ -186,7 +206,7 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) end # container node inventory - containerNodeInventoryRecord = getContainerNodeInventoryRecord(item, batchTime) + containerNodeInventoryRecord = getContainerNodeInventoryRecord(item, batchTime) containerNodeInventoryEventStream.add(emitTime, containerNodeInventoryRecord) if containerNodeInventoryRecord if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && containerNodeInventoryEventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE @@ -235,7 +255,7 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) @NodeCache.mem.set_capacity(nodeMetricRecord["Host"], metricVal) end end - nodeMetricRecords.each do |metricRecord| + nodeMetricRecords.each do |metricRecord| kubePerfEventStream.add(emitTime, metricRecord) if metricRecord end if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && kubePerfEventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE @@ -265,7 +285,7 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) if !insightsMetricsRecord.nil? && !insightsMetricsRecord.empty? nodeGPUInsightsMetricsRecords.push(insightsMetricsRecord) end - nodeGPUInsightsMetricsRecords.each do |insightsMetricsRecord| + nodeGPUInsightsMetricsRecords.each do |insightsMetricsRecord| insightsMetricsEventStream.add(emitTime, insightsMetricsRecord) if insightsMetricsRecord end if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && insightsMetricsEventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE @@ -335,7 +355,7 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) $log.info("kubeNodeInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") end - eventStream = nil + eventStream = nil end if containerNodeInventoryEventStream.count > 0 $log.info("in_kube_node::parse_and_emit_records: number of container node inventory records emitted #{containerNodeInventoryEventStream.count} @ #{Time.now.utc.iso8601}") @@ -507,7 +527,7 @@ def getNodeTelemetryProps(item) $log.warn "in_kube_nodes::getContainerNodeIngetNodeTelemetryPropsventoryRecord:Failed: #{errorStr}" end return properties - end + end end # Kube_Node_Input class NodeStatsCache # inner class for caching implementation (CPU and memory caching is handled the exact same way, so logic to do so is moved to a private inner class) @@ -578,5 +598,5 @@ def cpu() def mem() return @@memCache end - end + end end # module diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 5598602cd..3f5f4f1cc 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -11,7 +11,7 @@ class Kube_PodInventory_Input < Input @@MDMKubePodInventoryTag = "mdm.kubepodinventory" @@hostName = (OMS::Common.get_hostname) - + def initialize super @@ -27,6 +27,7 @@ def initialize require_relative "oms_common" require_relative "omslog" require_relative "constants" + require_relative "extension_utils" # refer tomlparser-agent-config for updating defaults # this configurable via configmap @@ -39,12 +40,12 @@ def initialize @winContainerCount = 0 @controllerData = {} @podInventoryE2EProcessingLatencyMs = 0 - @podsAPIE2ELatencyMs = 0 - + @podsAPIE2ELatencyMs = 0 + @kubeperfTag = "oneagent.containerInsights.LINUX_PERF_BLOB" @kubeservicesTag = "oneagent.containerInsights.KUBE_SERVICES_BLOB" @containerInventoryTag = "oneagent.containerInsights.CONTAINER_INVENTORY_BLOB" - @insightsMetricsTag = "oneagent.containerInsights.INSIGHTS_METRICS_BLOB" + @insightsMetricsTag = "oneagent.containerInsights.INSIGHTS_METRICS_BLOB" end config_param :run_interval, :time, :default => 60 @@ -55,7 +56,7 @@ def configure(conf) @inventoryToMdmConvertor = Inventory2MdmConvertor.new() end - def start + def start if @run_interval super if !ENV["PODS_CHUNK_SIZE"].nil? && !ENV["PODS_CHUNK_SIZE"].empty? && ENV["PODS_CHUNK_SIZE"].to_i > 0 @@ -107,7 +108,30 @@ def enumerate(podList = nil) batchTime = currentTime.utc.iso8601 serviceRecords = [] @podInventoryE2EProcessingLatencyMs = 0 - podInventoryStartTime = (Time.now.to_f * 1000).to_i + podInventoryStartTime = (Time.now.to_f * 1000).to_i + if ExtensionUtils.isAADMSIAuthMode() + $log.info("in_kube_podinventory::enumerate: AAD AUTH MSI MODE") + if @kubeperfTag.nil? || !@kubeperfTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @kubeperfTag = ExtensionUtils.getOutputStreamId(Constants::PERF_DATA_TYPE) + end + if @kubeservicesTag.nil? || !@kubeservicesTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @kubeservicesTag = ExtensionUtils.getOutputStreamId(Constants::KUBE_SERVICES_DATA_TYPE) + end + if @containerInventoryTag.nil? || !@containerInventoryTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @containerInventoryTag = ExtensionUtils.getOutputStreamId(Constants::CONTAINER_INVENTORY_DATA_TYPE) + end + if @insightsMetricsTag.nil? || !@insightsMetricsTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @insightsMetricsTag = ExtensionUtils.getOutputStreamId(Constants::INSIGHTS_METRICS_DATA_TYPE) + end + if @tag.nil? 
|| !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @tag = ExtensionUtils.getOutputStreamId(Constants::KUBE_POD_INVENTORY_DATA_TYPE) + end + $log.info("in_kube_podinventory::enumerate: using perf tag -#{@kubeperfTag} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::enumerate: using kubeservices tag -#{@kubeservicesTag} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::enumerate: using containerinventory tag -#{@containerInventoryTag} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::enumerate: using insightsmetrics tag -#{@insightsMetricsTag} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::enumerate: using kubepodinventory tag -#{@tag} @ #{Time.now.utc.iso8601}") + end # Get services first so that we dont need to make a call for very chunk $log.info("in_kube_podinventory::enumerate : Getting services from Kube API @ #{Time.now.utc.iso8601}") @@ -197,8 +221,8 @@ def enumerate(podList = nil) end def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batchTime = Time.utc.iso8601) - currentTime = Time.now - emitTime = Fluent::Engine.now + currentTime = Time.now + emitTime = Fluent::Engine.now #batchTime = currentTime.utc.iso8601 eventStream = Fluent::MultiEventStream.new containerInventoryStream = Fluent::MultiEventStream.new @@ -214,8 +238,8 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc podInventoryRecords = getPodInventoryRecords(item, serviceRecords, batchTime) podInventoryRecords.each do |record| if !record.nil? - eventStream.add(emitTime, record) if record - @inventoryToMdmConvertor.process_pod_inventory_record(record) + eventStream.add(emitTime, record) if record + @inventoryToMdmConvertor.process_pod_inventory_record(record) end end # Setting this flag to true so that we can send ContainerInventory records for containers @@ -232,7 +256,7 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc # Send container inventory records for containers on windows nodes @winContainerCount += containerInventoryRecords.length containerInventoryRecords.each do |cirecord| - if !cirecord.nil? + if !cirecord.nil? 
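
The nil-or-prefix guard above is repeated verbatim for every tag across in_kube_nodes, in_kube_podinventory, in_kube_pvinventory, in_kubestate_deployments, in_kubestate_hpa and in_win_cadvisor_perf. A minimal sketch of a shared helper that would collapse that repetition; swap_to_output_stream_id is a hypothetical name, and only the Constants/ExtensionUtils identifiers come from the patch:

```ruby
# Hypothetical helper, not part of the patch: leave the tag alone once it has
# already been swapped to an extension output stream id, otherwise look it up.
def swap_to_output_stream_id(current_tag, data_type)
  if current_tag.nil? || !current_tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX)
    return ExtensionUtils.getOutputStreamId(data_type)
  end
  current_tag
end

# usage inside enumerate, with the same constants as above:
# @kubeperfTag = swap_to_output_stream_id(@kubeperfTag, Constants::PERF_DATA_TYPE)
# @tag         = swap_to_output_stream_id(@tag, Constants::KUBE_POD_INVENTORY_DATA_TYPE)
```
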
containerInventoryStream.add(emitTime, cirecord) if cirecord end end @@ -255,7 +279,7 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "limits", "cpu", "cpuLimitNanoCores", batchTime)) containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "limits", "memory", "memoryLimitBytes", batchTime)) - containerMetricDataItems.each do |record| + containerMetricDataItems.each do |record| kubePerfEventStream.add(emitTime, record) if record end @@ -274,7 +298,7 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "limits", "nvidia.com/gpu", "containerGpuLimits", batchTime)) containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "requests", "amd.com/gpu", "containerGpuRequests", batchTime)) containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "limits", "amd.com/gpu", "containerGpuLimits", batchTime)) - containerGPUInsightsMetricsDataItems.each do |insightsMetricsRecord| + containerGPUInsightsMetricsDataItems.each do |insightsMetricsRecord| insightsMetricsEventStream.add(emitTime, insightsMetricsRecord) if insightsMetricsRecord end @@ -341,7 +365,7 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc if !kubeServiceRecord.nil? # adding before emit to reduce memory foot print kubeServiceRecord["ClusterId"] = KubernetesApiClient.getClusterId - kubeServiceRecord["ClusterName"] = KubernetesApiClient.getClusterName + kubeServiceRecord["ClusterName"] = KubernetesApiClient.getClusterName kubeServicesEventStream.add(emitTime, kubeServiceRecord) if kubeServiceRecord if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && kubeServicesEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE $log.info("in_kube_podinventory::parse_and_emit_records: number of service records emitted #{@PODS_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") @@ -648,6 +672,6 @@ def getServiceNameFromLabels(namespace, labels, serviceRecords) ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) end return serviceName - end + end end # Kube_Pod_Input end # module diff --git a/source/plugins/ruby/in_kube_pvinventory.rb b/source/plugins/ruby/in_kube_pvinventory.rb index 40eebac8a..fccfd459d 100644 --- a/source/plugins/ruby/in_kube_pvinventory.rb +++ b/source/plugins/ruby/in_kube_pvinventory.rb @@ -20,6 +20,7 @@ def initialize require_relative "oms_common" require_relative "omslog" require_relative "constants" + require_relative "extension_utils" # Response size is around 1500 bytes per PV @PV_CHUNK_SIZE = "5000" @@ -33,7 +34,7 @@ def configure(conf) super end - def start + def start if @run_interval super @finished = false @@ -61,7 +62,13 @@ def enumerate telemetryFlush = false @pvTypeToCountHash = {} currentTime = Time.now - batchTime = currentTime.utc.iso8601 + batchTime = currentTime.utc.iso8601 + if ExtensionUtils.isAADMSIAuthMode() + $log.info("in_kube_pvinventory::enumerate: AAD AUTH MSI MODE") + if @tag.nil? 
|| !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @tag = ExtensionUtils.getOutputStreamId(Constants::KUBE_PV_INVENTORY_DATA_TYPE) + end + end continuationToken = nil $log.info("in_kube_pvinventory::enumerate : Getting PVs from Kube API @ #{Time.now.utc.iso8601}") @@ -93,7 +100,7 @@ def enumerate if (timeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) telemetryFlush = true end - + # Flush AppInsights telemetry once all the processing is done if telemetryFlush == true telemetryProperties = {} @@ -110,8 +117,8 @@ def enumerate end # end enumerate def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) - currentTime = Time.now - emitTime = Fluent::Engine.now + currentTime = Time.now + emitTime = Fluent::Engine.now eventStream = Fluent::MultiEventStream.new @@istestvar = ENV["ISTEST"] begin @@ -152,8 +159,8 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) end records.each do |record| - if !record.nil? - eventStream.add(emitTime, record) + if !record.nil? + eventStream.add(emitTime, record) end end @@ -191,7 +198,7 @@ def getTypeInfo(item) begin if !item["spec"].nil? (Constants::PV_TYPES).each do |pvType| - + # PV is this type if !item["spec"][pvType].nil? @@ -252,6 +259,6 @@ def run_periodic @mutex.lock end @mutex.unlock - end + end end # Kube_PVInventory_Input end # module diff --git a/source/plugins/ruby/in_kubestate_deployments.rb b/source/plugins/ruby/in_kubestate_deployments.rb index 182c3ffc1..0b563a890 100644 --- a/source/plugins/ruby/in_kubestate_deployments.rb +++ b/source/plugins/ruby/in_kubestate_deployments.rb @@ -22,6 +22,7 @@ def initialize require_relative "omslog" require_relative "ApplicationInsightsUtility" require_relative "constants" + require_relative "extension_utils" # refer tomlparser-agent-config for defaults # this configurable via configmap @@ -44,7 +45,7 @@ def configure(conf) super end - def start + def start if @run_interval super if !ENV["DEPLOYMENTS_CHUNK_SIZE"].nil? && !ENV["DEPLOYMENTS_CHUNK_SIZE"].empty? && ENV["DEPLOYMENTS_CHUNK_SIZE"].to_i > 0 @@ -55,11 +56,11 @@ def start @DEPLOYMENTS_CHUNK_SIZE = 500 end $log.info("in_kubestate_deployments::start : DEPLOYMENTS_CHUNK_SIZE @ #{@DEPLOYMENTS_CHUNK_SIZE}") - + @finished = false @condition = ConditionVariable.new @mutex = Mutex.new - @thread = Thread.new(&method(:run_periodic)) + @thread = Thread.new(&method(:run_periodic)) end end @@ -81,8 +82,14 @@ def enumerate batchTime = currentTime.utc.iso8601 #set the running total for this batch to 0 - @deploymentsRunningTotal = 0 - + @deploymentsRunningTotal = 0 + + if ExtensionUtils.isAADMSIAuthMode() + $log.info("in_kubestate_deployments::enumerate: AAD AUTH MSI MODE") + if @tag.nil? 
|| !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @tag = ExtensionUtils.getOutputStreamId(Constants::INSIGHTS_METRICS_DATA_TYPE) + end + end # Initializing continuation token to nil continuationToken = nil $log.info("in_kubestate_deployments::enumerate : Getting deployments from Kube API @ #{Time.now.utc.iso8601}") @@ -186,7 +193,7 @@ def parse_and_emit_records(deployments, batchTime = Time.utc.iso8601) end time = Fluent::Engine.now - metricItems.each do |insightsMetricsRecord| + metricItems.each do |insightsMetricsRecord| insightsMetricsEventStream.add(time, insightsMetricsRecord) if insightsMetricsRecord end @@ -233,6 +240,6 @@ def run_periodic @mutex.lock end @mutex.unlock - end + end end end diff --git a/source/plugins/ruby/in_kubestate_hpa.rb b/source/plugins/ruby/in_kubestate_hpa.rb index 8f60bfb72..178f7944f 100644 --- a/source/plugins/ruby/in_kubestate_hpa.rb +++ b/source/plugins/ruby/in_kubestate_hpa.rb @@ -18,7 +18,8 @@ def initialize require_relative "oms_common" require_relative "omslog" require_relative "ApplicationInsightsUtility" - require_relative "constants" + require_relative "constants" + require_relative "extension_utils" # refer tomlparser-agent-config for defaults # this configurable via configmap @@ -41,7 +42,7 @@ def configure(conf) super end - def start + def start if @run_interval super if !ENV["HPA_CHUNK_SIZE"].nil? && !ENV["HPA_CHUNK_SIZE"].empty? && ENV["HPA_CHUNK_SIZE"].to_i > 0 @@ -78,7 +79,14 @@ def enumerate batchTime = currentTime.utc.iso8601 @hpaCount = 0 - + + if ExtensionUtils.isAADMSIAuthMode() + $log.info("in_kubestate_hpa::enumerate: AAD AUTH MSI MODE") + if @tag.nil? || !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @tag = ExtensionUtils.getOutputStreamId(Constants::INSIGHTS_METRICS_DATA_TYPE) + end + $log.info("in_kubestate_hpa::enumerate: using tag -#{@tag} @ #{Time.now.utc.iso8601}") + end # Initializing continuation token to nil continuationToken = nil $log.info("in_kubestate_hpa::enumerate : Getting HPAs from Kube API @ #{Time.now.utc.iso8601}") @@ -186,7 +194,7 @@ def parse_and_emit_records(hpas, batchTime = Time.utc.iso8601) end time = Fluent::Engine.now - metricItems.each do |insightsMetricsRecord| + metricItems.each do |insightsMetricsRecord| insightsMetricsEventStream.add(time, insightsMetricsRecord) if insightsMetricsRecord end @@ -231,6 +239,6 @@ def run_periodic @mutex.lock end @mutex.unlock - end + end end end diff --git a/source/plugins/ruby/in_win_cadvisor_perf.rb b/source/plugins/ruby/in_win_cadvisor_perf.rb index 9ab2474b1..dd462fdf2 100644 --- a/source/plugins/ruby/in_win_cadvisor_perf.rb +++ b/source/plugins/ruby/in_win_cadvisor_perf.rb @@ -20,6 +20,7 @@ def initialize require_relative "oms_common" require_relative "omslog" require_relative "constants" + require_relative "extension_utils" @insightsMetricsTag = "oneagent.containerInsights.INSIGHTS_METRICS_BLOB" end @@ -58,6 +59,17 @@ def enumerate() timeDifference = (DateTime.now.to_time.to_i - @@winNodeQueryTimeTracker).abs timeDifferenceInMinutes = timeDifference / 60 @@istestvar = ENV["ISTEST"] + if ExtensionUtils.isAADMSIAuthMode() + $log.info("in_win_cadvisor_perf::enumerate: AAD AUTH MSI MODE") + if @tag.nil? || !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @tag = ExtensionUtils.getOutputStreamId(Constants::PERF_DATA_TYPE) + end + if @insightsMetricsTag.nil? 
|| !@insightsMetricsTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX)
+        @insightsMetricsTag = ExtensionUtils.getOutputStreamId(Constants::INSIGHTS_METRICS_DATA_TYPE)
+      end
+      $log.info("in_win_cadvisor_perf::enumerate: using perf tag -#{@kubeperfTag} @ #{Time.now.utc.iso8601}")
+      $log.info("in_win_cadvisor_perf::enumerate: using insightsmetrics tag -#{@insightsMetricsTag} @ #{Time.now.utc.iso8601}")
+    end
 
     #Resetting this cache so that it is populated with the current set of containers with every call
     CAdvisorMetricsAPIClient.resetWinContainerIdCache()
diff --git a/source/plugins/ruby/out_mdm.rb b/source/plugins/ruby/out_mdm.rb
index 8e80fb753..82d6e07db 100644
--- a/source/plugins/ruby/out_mdm.rb
+++ b/source/plugins/ruby/out_mdm.rb
@@ -21,6 +21,9 @@ def initialize
       require_relative "proxy_utils"
 
       @@token_resource_url = "https://monitoring.azure.com/"
+      # AAD auth is supported only in public cloud; handle other clouds when enabled
+      # this is the unified new token audience for LA AAD MSI auth & metrics
+      @@token_resource_audience = "https://monitor.azure.com/"
       @@grant_type = "client_credentials"
       @@azure_json_path = "/etc/kubernetes/host/azure.json"
       @@post_request_url_template = "https://%{aks_region}.monitoring.azure.com%{aks_resource_id}/metrics"
@@ -28,6 +31,8 @@ def initialize
       # msiEndpoint is the well known endpoint for getting MSI authentications tokens
       @@msi_endpoint_template = "http://169.254.169.254/metadata/identity/oauth2/token?api-version=2018-02-01&client_id=%{user_assigned_client_id}&resource=%{resource}"
+      # IMDS msiEndpoint for AAD MSI Auth is the proxy endpoint which serves the MSI auth tokens with the resource claim
+      @@imds_msi_endpoint_template = "http://169.254.169.254/metadata/identity/oauth2/token?api-version=2018-02-01&resource=%{resource}"
       @@user_assigned_client_id = ENV["USER_ASSIGNED_IDENTITY_CLIENT_ID"]
 
       @@plugin_name = "AKSCustomMetricsMDM"
@@ -46,6 +51,7 @@ def initialize
       @last_telemetry_sent_time = nil
       # Setting useMsi to false by default
       @useMsi = false
+      @isAADMSIAuth = false
       @metrics_flushed_count = 0
 
       @cluster_identity = nil
@@ -124,7 +130,14 @@ def start
           @parsed_token_uri = URI.parse(aad_token_url)
         else
           @useMsi = true
-          msi_endpoint = @@msi_endpoint_template % { user_assigned_client_id: @@user_assigned_client_id, resource: @@token_resource_url }
+          if !@@user_assigned_client_id.nil? && !@@user_assigned_client_id.empty?
+            msi_endpoint = @@msi_endpoint_template % { user_assigned_client_id: @@user_assigned_client_id, resource: @@token_resource_url }
+          else
+            # in case of aad msi auth user_assigned_client_id will be empty
+            @log.info "using aad msi auth"
+            @isAADMSIAuth = true
+            msi_endpoint = @@imds_msi_endpoint_template % { resource: @@token_resource_audience }
+          end
           @parsed_token_uri = URI.parse(msi_endpoint)
         end
 
@@ -148,8 +161,14 @@ def get_access_token
       @log.info "Refreshing access token for out_mdm plugin.."
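
Before the get_access_token hunk continues, a minimal sketch of the request that the new @@imds_msi_endpoint_template points at. The Metadata header and the access_token response field come from the documented IMDS contract rather than from this patch, and the resource value simply mirrors @@token_resource_audience:

```ruby
require "uri"
require "net/http"
require "json"

# Hedged sketch: fetch an MSI token from IMDS for the monitor.azure.com audience.
url = "http://169.254.169.254/metadata/identity/oauth2/token" \
      "?api-version=2018-02-01&resource=https://monitor.azure.com/"
uri = URI.parse(url)
req = Net::HTTP::Get.new(uri.request_uri)
req["Metadata"] = "true" # IMDS rejects requests that lack this header
res = Net::HTTP.start(uri.host, uri.port, :use_ssl => false) { |http| http.request(req) }
access_token = JSON.parse(res.body)["access_token"] if res.is_a?(Net::HTTPSuccess)
```
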
if (!!@useMsi) - @log.info "Using msi to get the token to post MDM data" - ApplicationInsightsUtility.sendCustomEvent("AKSCustomMetricsMDMToken-MSI", {}) + properties = {} + if (!!@isAADMSIAuth) + @log.info "Using aad msi auth to get the token to post MDM data" + properties["aadAuthMSIMode"] = "true" + else + @log.info "Using msi to get the token to post MDM data" + end + ApplicationInsightsUtility.sendCustomEvent("AKSCustomMetricsMDMToken-MSI", properties) @log.info "Opening TCP connection" http_access_token = Net::HTTP.start(@parsed_token_uri.host, @parsed_token_uri.port, :use_ssl => false) # http_access_token.use_ssl = false @@ -320,7 +339,7 @@ def send_to_mdm(post_body) ApplicationInsightsUtility.sendCustomEvent("AKSCustomMetricsMDMSendSuccessful", {}) @last_telemetry_sent_time = Time.now end - rescue Net::HTTPClientException => e # see https://docs.ruby-lang.org/en/2.6.0/NEWS.html about deprecating HTTPServerException and adding HTTPClientException + rescue Net::HTTPClientException => e # see https://docs.ruby-lang.org/en/2.6.0/NEWS.html about deprecating HTTPServerException and adding HTTPClientException if !response.nil? && !response.body.nil? #body will have actual error @log.info "Failed to Post Metrics to MDM : #{e} Response.body: #{response.body}" else diff --git a/source/plugins/utils/extension.rb b/source/plugins/utils/extension.rb new file mode 100644 index 000000000..78236fe15 --- /dev/null +++ b/source/plugins/utils/extension.rb @@ -0,0 +1,77 @@ +require "socket" +require "msgpack" +require "securerandom" +require "singleton" +require_relative "omslog" +require_relative "constants" +require_relative "ApplicationInsightsUtility" + + +class Extension + include Singleton + + def initialize + @cache = {} + @cache_lock = Mutex.new + $log.info("Extension::initialize complete") + end + + def get_output_stream_id(datatypeId) + @cache_lock.synchronize { + if @cache.has_key?(datatypeId) + return @cache[datatypeId] + else + @cache = get_config() + return @cache[datatypeId] + end + } + end + + private + def get_config() + extConfig = Hash.new + $log.info("Extension::get_config start ...") + begin + clientSocket = UNIXSocket.open(Constants::ONEAGENT_FLUENT_SOCKET_NAME) + requestId = SecureRandom.uuid.to_s + requestBodyJSON = { "Request" => "AgentTaggedData", "RequestId" => requestId, "Tag" => Constants::CI_EXTENSION_NAME, "Version" => Constants::CI_EXTENSION_VERSION }.to_json + $log.info("Extension::get_config::sending request with request body: #{requestBodyJSON}") + requestBodyMsgPack = requestBodyJSON.to_msgpack + clientSocket.write(requestBodyMsgPack) + clientSocket.flush + $log.info("reading the response from fluent socket: #{Constants::ONEAGENT_FLUENT_SOCKET_NAME}") + resp = clientSocket.recv(Constants::CI_EXTENSION_CONFIG_MAX_BYTES) + if !resp.nil? && !resp.empty? + $log.info("Extension::get_config::successfully read the extension config from fluentsocket and number of bytes read is #{resp.length}") + respJSON = JSON.parse(resp) + taggedData = respJSON["TaggedData"] + if !taggedData.nil? && !taggedData.empty? + taggedAgentData = JSON.parse(taggedData) + extensionConfigurations = taggedAgentData["extensionConfigurations"] + if !extensionConfigurations.nil? && !extensionConfigurations.empty? + extensionConfigurations.each do |extensionConfig| + outputStreams = extensionConfig["outputStreams"] + if !outputStreams.nil? && !outputStreams.empty? 
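
The outputStreams map iterated just below comes out of a doubly JSON-encoded response: TaggedData arrives as a JSON string inside the outer JSON document, which is why get_config parses twice. A hedged reconstruction of the shape, with made-up placeholder ids:

```ruby
require "json"

# Illustrative payload only; the datatype and stream ids are invented
# placeholders inferred from the parsing logic, not captured values.
response_body = {
  "RequestId"  => "00000000-0000-0000-0000-000000000000",
  "TaggedData" => {
    "extensionConfigurations" => [
      { "outputStreams" => { "SOME_DATA_TYPE" => "some-output-stream-id" } },
    ],
  }.to_json, # inner document is a string, hence the second JSON.parse
}.to_json
```
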
+ outputStreams.each do |datatypeId, streamId| + $log.info("Extension::get_config datatypeId:#{datatypeId}, streamId: #{streamId}") + extConfig[datatypeId] = streamId + end + else + $log.warn("Extension::get_config::received outputStreams is either nil or empty") + end + end + else + $log.warn("Extension::get_config::received extensionConfigurations from fluentsocket is either nil or empty") + end + end + end + rescue => errorStr + $log.warn("Extension::get_config failed: #{errorStr}") + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + ensure + clientSocket.close unless clientSocket.nil? + end + $log.info("Extension::get_config complete ...") + return extConfig + end +end diff --git a/source/plugins/utils/extension_utils.rb b/source/plugins/utils/extension_utils.rb new file mode 100644 index 000000000..5d439c6b2 --- /dev/null +++ b/source/plugins/utils/extension_utils.rb @@ -0,0 +1,27 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +#!/usr/local/bin/ruby +# frozen_string_literal: true + +require_relative "extension" + +class ExtensionUtils + class << self + def getOutputStreamId(dataType) + outputStreamId = "" + begin + if !dataType.nil? && !dataType.empty? + outputStreamId = Extension.instance.get_output_stream_id(dataType) + $log.info("ExtensionUtils::getOutputStreamId: got streamid: #{outputStreamId} for datatype: #{dataType}") + else + $log.warn("ExtensionUtils::getOutputStreamId: dataType shouldnt be nil or empty") + end + rescue => errorStr + $log.warn("ExtensionUtils::getOutputStreamId: failed with an exception: #{errorStr}") + end + return outputStreamId + end + def isAADMSIAuthMode() + return !ENV["AAD_MSI_AUTH_MODE"].nil? && !ENV["AAD_MSI_AUTH_MODE"].empty? && ENV["AAD_MSI_AUTH_MODE"].downcase == "true" + end + end +end From 13eb3a640ac094888648048e07eb01eb76a1d286 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 20 Jul 2021 11:20:34 -0700 Subject: [PATCH 129/194] Gangams/remove chart version dependency (#589) * remove chart version dependency * remove unused code * fix resource type * fix * handle weird cli chars * update release process --- ReleaseProcess.md | 29 ++++++------ .../onboarding/managed/enable-monitoring.ps1 | 23 ++++------ .../onboarding/managed/enable-monitoring.sh | 42 +++++++++++------- .../onboarding/managed/upgrade-monitoring.sh | 44 ++++++++++++------- 4 files changed, 76 insertions(+), 62 deletions(-) diff --git a/ReleaseProcess.md b/ReleaseProcess.md index 8ec91546c..09de5e84f 100644 --- a/ReleaseProcess.md +++ b/ReleaseProcess.md @@ -39,48 +39,49 @@ Image automatically synched to MCR CN from Public cloud MCR. Make PR against [AKS-Engine](https://github.com/Azure/aks-engine). Refer PR https://github.com/Azure/aks-engine/pull/2318 -## Arc for Kubernetes +## Arc for Kubernetes -Ev2 pipeline used to deploy the chart of the Arc K8s Container Insights Extension as per Safe Deployment Process. +Ev2 pipeline used to deploy the chart of the Arc K8s Container Insights Extension as per Safe Deployment Process. Here is the high level process ``` 1. Specify chart version of the release candidate and trigger [container-insights-arc-k8s-extension-ci_prod-release](https://github-private.visualstudio.com/microsoft/_release?_a=releases&view=all) 2. Get the approval from one of team member for the release - 3. Once the approved, release should be triggered automatically + 3. Once the approved, release should be triggered automatically 4. use `cimon-arck8s-eastus2euap` for validating latest release in canary region 5. 
TBD - Notify vendor team for the validation on all Arc K8s supported platforms ``` ## Microsoft Charts Repo release for On-prem K8s +> Note: This chart repo being used in the ARO v4 onboarding script as well. -Since HELM charts repo being deprecated, Microsoft charts repo being used for HELM chart release of on-prem K8s clusters. -To make chart release PR, fork [Microsoft-charts-repo]([https://github.com/microsoft/charts/tree/gh-pages) and make the PR against `gh-pages` branch of the upstream repo. +Since HELM charts repo being deprecated, Microsoft charts repo being used for HELM chart release of on-prem K8s clusters. +To make chart release PR, fork [Microsoft-charts-repo]([https://github.com/microsoft/charts/tree/gh-pages) and make the PR against `gh-pages` branch of the upstream repo. Refer PR - https://github.com/microsoft/charts/pull/23 for example. Once the PR merged, latest version of HELM chart should be available in couple of mins in https://microsoft.github.io/charts/repo and https://artifacthub.io/. Instructions to create PR ``` -# 1. create helm package for the release candidate +# 1. create helm package for the release candidate git clone git@github.com:microsoft/Docker-Provider.git git checkout ci_prod cd ~/Docker-Provider/charts/azuremonitor-containers # this path based on where you have cloned the repo - helm package . + helm package . -# 2. clone your fork repo and checkout gh_pages branch # gh_pages branch used as release branch - cd ~ +# 2. clone your fork repo and checkout gh_pages branch # gh_pages branch used as release branch + cd ~ git clone cd ~/charts # assumed the root dir of the clone is charts git checkout gh_pages -# 3. copy release candidate helm package - cd ~/charts/repo/azuremonitor-containers +# 3. copy release candidate helm package + cd ~/charts/repo/azuremonitor-containers # update chart version value with the version of chart being released - cp ~/Docker-Provider/charts/azuremonitor-containers/azuremonitor-containers-.tgz . + cp ~/Docker-Provider/charts/azuremonitor-containers/azuremonitor-containers-.tgz . cd ~/charts/repo - # update repo index file + # update repo index file helm repo index . - + # 4. Review the changes and make PR. Please note, you may need to revert unrelated changes automatically added by `helm repo index .` command ``` diff --git a/scripts/onboarding/managed/enable-monitoring.ps1 b/scripts/onboarding/managed/enable-monitoring.ps1 index 828d061ac..e79ef2138 100644 --- a/scripts/onboarding/managed/enable-monitoring.ps1 +++ b/scripts/onboarding/managed/enable-monitoring.ps1 @@ -62,11 +62,10 @@ $isArcK8sCluster = $false $isAksCluster = $false $isUsingServicePrincipal = $false -# released chart version in mcr -$mcr = "mcr.microsoft.com" -$mcrChartVersion = "2.8.3" -$mcrChartRepoPath = "azuremonitor/containerinsights/preview/azuremonitor-containers" -$helmLocalRepoName = "." 
+# microsoft helm chart repo +$microsoftHelmRepo="https://microsoft.github.io/charts/repo" +$microsoftHelmRepoName="microsoft" + $omsAgentDomainName="opinsights.azure.com" if ([string]::IsNullOrEmpty($azureCloudName) -eq $true) { @@ -547,16 +546,12 @@ Write-Host "Helm version" : $helmVersion Write-Host("Installing or upgrading if exists, Azure Monitor for containers HELM chart ...") try { - Write-Host("pull the chart from mcr.microsoft.com") - [System.Environment]::SetEnvironmentVariable("HELM_EXPERIMENTAL_OCI", 1, "Process") - - Write-Host("pull the chart from mcr.microsoft.com") - helm chart pull ${mcr}/${mcrChartRepoPath}:${mcrChartVersion} - - Write-Host("export the chart from local cache to current directory") - helm chart export ${mcr}/${mcrChartRepoPath}:${mcrChartVersion} --destination . + Write-Host("Add helm chart repo- ${microsoftHelmRepoName} with repo path: ${microsoftHelmRepo}") + helm repo add ${microsoftHelmRepoName} ${microsoftHelmRepo} + Write-Host("Updating the helm chart repo- ${microsoftHelmRepoName} to get latest chart versions") + helm repo update ${microsoftHelmRepoName} - $helmChartRepoPath = "${helmLocalRepoName}" + "/" + "${helmChartName}" + $helmChartRepoPath = "${microsoftHelmRepoName}" + "/" + "${helmChartName}" Write-Host("helmChartRepoPath is : ${helmChartRepoPath}") diff --git a/scripts/onboarding/managed/enable-monitoring.sh b/scripts/onboarding/managed/enable-monitoring.sh index f27f944fd..588d193a3 100644 --- a/scripts/onboarding/managed/enable-monitoring.sh +++ b/scripts/onboarding/managed/enable-monitoring.sh @@ -43,11 +43,9 @@ defaultAzureCloud="AzureCloud" # default domain will be for public cloud omsAgentDomainName="opinsights.azure.com" -# released chart version in mcr -mcrChartVersion="2.8.3" -mcr="mcr.microsoft.com" -mcrChartRepoPath="azuremonitor/containerinsights/preview/azuremonitor-containers" -helmLocalRepoName="." 
+# microsoft helm chart repo
+microsoftHelmRepo="https://microsoft.github.io/charts/repo"
+microsoftHelmRepoName="microsoft"
 helmChartName="azuremonitor-containers"
 
 # default release name used during onboarding
@@ -435,9 +433,10 @@ create_default_log_analytics_workspace() {
   workspaceResourceGroup="DefaultResourceGroup-"$workspaceRegionCode
 
   isRGExists=$(az group exists -g $workspaceResourceGroup)
+  isRGExists=$(echo $isRGExists | tr -d '"\r\n')
   workspaceName="DefaultWorkspace-"$subscriptionId"-"$workspaceRegionCode
 
-  if $isRGExists; then
+  if [ "${isRGExists}" == "true" ]; then
     echo "using existing default resource group:"$workspaceResourceGroup
   else
     echo "creating resource group: $workspaceResourceGroup in region: $workspaceRegion"
@@ -455,7 +454,7 @@ create_default_log_analytics_workspace() {
   fi
 
   workspaceResourceId=$(az resource show -g $workspaceResourceGroup -n $workspaceName --resource-type $workspaceResourceProvider --query id -o json)
-  workspaceResourceId=$(echo $workspaceResourceId | tr -d '"')
+  workspaceResourceId=$(echo $workspaceResourceId | tr -d '"' | tr -d '"\r\n')
   echo "workspace resource Id: ${workspaceResourceId}"
 }
 
@@ -495,10 +494,16 @@ install_helm_chart() {
     adminUserName=$(az aro list-credentials -g $clusterResourceGroup -n $clusterName --query 'kubeadminUsername' -o tsv)
     adminPassword=$(az aro list-credentials -g $clusterResourceGroup -n $clusterName --query 'kubeadminPassword' -o tsv)
     apiServer=$(az aro show -g $clusterResourceGroup -n $clusterName --query apiserverProfile.url -o tsv)
+    # certain az cli versions add \r\n so trim them
+    adminUserName=$(echo $adminUserName | tr -d '"\r\n')
+    adminPassword=$(echo $adminPassword | tr -d '"\r\n')
+    apiServer=$(echo $apiServer | tr -d '"\r\n')
     echo "login to the cluster via oc login"
     oc login $apiServer -u $adminUserName -p $adminPassword
-    echo "creating project azure-monitor-for-containers"
+    echo "creating project: azure-monitor-for-containers"
     oc new-project $openshiftProjectName
+    echo "switching to project: azure-monitor-for-containers"
+    oc project $openshiftProjectName
     echo "getting config-context of aro v4 cluster"
     kubeconfigContext=$(oc config current-context)
   fi
@@ -513,15 +518,7 @@ install_helm_chart() {
   clusterRegion=$(az resource show --ids ${clusterResourceId} --query location -o tsv)
   echo "cluster region is : ${clusterRegion}"
 
-  echo "pull the chart version ${mcrChartVersion} from ${mcr}/${mcrChartRepoPath}"
-  export HELM_EXPERIMENTAL_OCI=1
-  helm chart pull $mcr/$mcrChartRepoPath:$mcrChartVersion
-
-  echo "export the chart from local cache to current directory"
-  helm chart export $mcr/$mcrChartRepoPath:$mcrChartVersion --destination .
-
-  helmChartRepoPath=$helmLocalRepoName/$helmChartName
-
+  helmChartRepoPath=$microsoftHelmRepoName/$helmChartName
  echo "helm chart repo path: ${helmChartRepoPath}"
 
  if [ ! 
-z "$proxyEndpoint" ]; then @@ -581,6 +578,14 @@ enable_aks_monitoring_addon() { echo "status after enabling of aks monitoringa addon:$status" } +# add helm chart repo and update repo to get latest chart version +add_and_update_helm_chart_repo() { + echo "adding helm repo: ${microsoftHelmRepoName} with repo path: ${microsoftHelmRepo}" + helm repo add ${microsoftHelmRepoName} ${microsoftHelmRepo} + echo "updating helm repo: ${microsoftHelmRepoName} to get local charts updated with latest ones" + helm repo update +} + # parse and validate args parse_args $@ @@ -644,6 +649,9 @@ else attach_monitoring_tags fi +# add helm repo & update to get the latest chart version +add_and_update_helm_chart_repo + # install helm chart install_helm_chart diff --git a/scripts/onboarding/managed/upgrade-monitoring.sh b/scripts/onboarding/managed/upgrade-monitoring.sh index 5456a7072..83643f3fa 100644 --- a/scripts/onboarding/managed/upgrade-monitoring.sh +++ b/scripts/onboarding/managed/upgrade-monitoring.sh @@ -19,14 +19,14 @@ set -e set -o pipefail -# released chart version for Azure Arc enabled Kubernetes public preview -mcrChartVersion="2.8.3" -mcr="mcr.microsoft.com" -mcrChartRepoPath="azuremonitor/containerinsights/preview/azuremonitor-containers" - +# microsoft helm chart repo +microsoftHelmRepo="https://microsoft.github.io/charts/repo" +microsoftHelmRepoName="microsoft" # default to public cloud since only supported cloud is azure public clod defaultAzureCloud="AzureCloud" -helmLocalRepoName="." +# microsoft helm chart repo +microsoftHelmRepo="https://microsoft.github.io/charts/repo" +microsoftHelmRepoName="microsoft" helmChartName="azuremonitor-containers" # default release name used during onboarding @@ -38,6 +38,9 @@ arcK8sResourceProvider="Microsoft.Kubernetes/connectedClusters" # default of resourceProvider is Azure Arc enabled Kubernetes and this will get updated based on the provider cluster resource resourceProvider="Microsoft.Kubernetes/connectedClusters" +# resource provider for azure redhat openshift v4 cluster +aroV4ResourceProvider="Microsoft.RedHatOpenShift/OpenShiftClusters" + # Azure Arc enabled Kubernetes cluster resource isArcK8sCluster=false @@ -235,10 +238,14 @@ upgrade_helm_chart_release() { adminUserName=$(az aro list-credentials -g $clusterResourceGroup -n $clusterName --query 'kubeadminUsername' -o tsv) adminPassword=$(az aro list-credentials -g $clusterResourceGroup -n $clusterName --query 'kubeadminPassword' -o tsv) apiServer=$(az aro show -g $clusterResourceGroup -n $clusterName --query apiserverProfile.url -o tsv) + # certain az cli versions adds /r/n so trim them + adminUserName=$(echo $adminUserName |tr -d '"\r\n') + adminPassword=$(echo $adminPassword |tr -d '"\r\n') + apiServer=$(echo $apiServer |tr -d '"\r\n') echo "login to the cluster via oc login" oc login $apiServer -u $adminUserName -p $adminPassword - echo "creating project azure-monitor-for-containers" - oc new-project $openshiftProjectName + echo "switching to project azure-monitor-for-containers" + oc project $openshiftProjectName echo "getting config-context of aro v4 cluster" kubeconfigContext=$(oc config current-context) fi @@ -249,15 +256,7 @@ upgrade_helm_chart_release() { echo "installing Azure Monitor for containers HELM chart on to the cluster with kubecontext:${kubeconfigContext} ..." 
fi - export HELM_EXPERIMENTAL_OCI=1 - - echo "pull the chart from ${mcr}/${mcrChartRepoPath}:${mcrChartVersion}" - helm chart pull ${mcr}/${mcrChartRepoPath}:${mcrChartVersion} - - echo "export the chart from local cache to current directory" - helm chart export ${mcr}/${mcrChartRepoPath}:${mcrChartVersion} --destination . - - helmChartRepoPath=$helmLocalRepoName/$helmChartName + helmChartRepoPath=$microsoftHelmRepoName/$helmChartName echo "upgrading the release: $releaseName to chart version : ${mcrChartVersion}" helm get values $releaseName -o yaml | helm upgrade --install $releaseName $helmChartRepoPath -f - @@ -296,6 +295,14 @@ validate_and_configure_supported_cloud() { fi } +# add helm chart repo and update repo to get latest chart version +add_and_update_helm_chart_repo() { + echo "adding helm repo: ${microsoftHelmRepoName} with repo path: ${microsoftHelmRepo}" + helm repo add ${microsoftHelmRepoName} ${microsoftHelmRepo} + echo "updating helm repo: ${microsoftHelmRepoName} to get local charts updated with latest ones" + helm repo update +} + # parse and validate args parse_args $@ @@ -322,6 +329,9 @@ fi # validate the cluster has monitoring tags validate_monitoring_tags +# add helm repo & update to get the latest chart version +add_and_update_helm_chart_repo + # upgrade helm chart release upgrade_helm_chart_release From 63f22d93aa509f270459764f306372226e813926 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Thu, 22 Jul 2021 18:23:13 -0700 Subject: [PATCH 130/194] Gangams/july 2021 release tasks 3 (#613) * use artifact and pipeline creds for image push * minor update * add vuln fix here so that pr can be merged --- ...l.all_tag.all_phase.all_config.ci_prod.yml | 3 +- .pipelines/pipeline.user.linux.yml | 1 + ...l.all_tag.all_phase.all_config.ci_prod.yml | 1 + .pipelines/pipeline.user.windows.yml | 1 + .pipelines/release-agent.sh | 74 +++++++++++++++++++ kubernetes/linux/setup.sh | 3 + 6 files changed, 82 insertions(+), 1 deletion(-) create mode 100644 .pipelines/release-agent.sh diff --git a/.pipelines/pipeline.user.linux.official.all_tag.all_phase.all_config.ci_prod.yml b/.pipelines/pipeline.user.linux.official.all_tag.all_phase.all_config.ci_prod.yml index d47a60ffe..1e9909ee8 100644 --- a/.pipelines/pipeline.user.linux.official.all_tag.all_phase.all_config.ci_prod.yml +++ b/.pipelines/pipeline.user.linux.official.all_tag.all_phase.all_config.ci_prod.yml @@ -40,5 +40,6 @@ package: # to be named differently. Defaults to Dockerfile. # In effect, the -f option value passed to docker build will be repository_checkout_folder/src/DockerFinal/Foo.dockerfile. repository_name: 'cdpxlinux' # only supported ones are cdpx acr repos - tag: 'ciprod' # OPTIONAL: Defaults to latest. The tag for the built image. Final tag will be 1.0.0alpha, 1.0.0-timestamp-commitID. + tag: 'ciprod' # OPTIONAL: Defaults to latest. The tag for the built image. Final tag will be 1.0.0alpha, 1.0.0-timestamp-commitID. latest: false # OPTIONAL: Defaults to false. If tag is not set to latest and this flag is set, then tag as latest as well and push latest as well. + export_to_artifact_path: 'agentimage.tar.gz' # path for exported image and use this instead of fixed tag diff --git a/.pipelines/pipeline.user.linux.yml b/.pipelines/pipeline.user.linux.yml index 565661d64..9977e7a1a 100644 --- a/.pipelines/pipeline.user.linux.yml +++ b/.pipelines/pipeline.user.linux.yml @@ -47,3 +47,4 @@ package: repository_name: 'cdpxlinux' # only supported ones are cdpx acr repos tag: 'cidev' # OPTIONAL: Defaults to latest. 
The tag for the built image. Final tag will be 1.0.0alpha, 1.0.0-timestamp-commitID.
     latest: false # OPTIONAL: Defaults to false. If tag is not set to latest and this flag is set, then tag as latest as well and push latest as well.
+    export_to_artifact_path: 'agentimage.tar.gz' # path for exported image and use this instead of fixed tag
diff --git a/.pipelines/pipeline.user.windows.official.all_tag.all_phase.all_config.ci_prod.yml b/.pipelines/pipeline.user.windows.official.all_tag.all_phase.all_config.ci_prod.yml
index e0286fbd6..8462f8e40 100644
--- a/.pipelines/pipeline.user.windows.official.all_tag.all_phase.all_config.ci_prod.yml
+++ b/.pipelines/pipeline.user.windows.official.all_tag.all_phase.all_config.ci_prod.yml
@@ -53,3 +53,4 @@ package:
     repository_name: 'cdpxwin1809' # only supported ones are cdpx acr repos
     tag: 'win-ciprod' # OPTIONAL: Defaults to latest. The tag for the built image. Final tag will be 1.0.0alpha, 1.0.0-timestamp-commitID.
     latest: false # OPTIONAL: Defaults to false. If tag is not set to latest and this flag is set, then tag as latest as well and push latest as well.
+    export_to_artifact_path: 'agentimage.tar.gz' # path for exported image and use this instead of fixed tag
diff --git a/.pipelines/pipeline.user.windows.yml b/.pipelines/pipeline.user.windows.yml
index 2b7a54ae9..1690ad700 100644
--- a/.pipelines/pipeline.user.windows.yml
+++ b/.pipelines/pipeline.user.windows.yml
@@ -53,3 +53,4 @@ package:
     repository_name: 'cdpxwin1809' # only supported ones are cdpx acr repos
     tag: 'win-cidev' # OPTIONAL: Defaults to latest. The tag for the built image. Final tag will be 1.0.0alpha, 1.0.0-timestamp-commitID.
     latest: false # OPTIONAL: Defaults to false. If tag is not set to latest and this flag is set, then tag as latest as well and push latest as well.
+    export_to_artifact_path: 'agentimage.tar.gz' # path for exported image and use this instead of fixed tag
diff --git a/.pipelines/release-agent.sh b/.pipelines/release-agent.sh
new file mode 100644
index 000000000..b34dd9995
--- /dev/null
+++ b/.pipelines/release-agent.sh
@@ -0,0 +1,74 @@
+#!/bin/bash
+
+# Note - This script is used in the pipeline as an inline script
+
+# These are plain pipeline variables which can be modified by anyone in the team
+# AGENT_RELEASE=cidev
+# AGENT_IMAGE_TAG_SUFFIX=07222021
+
+#Name of the ACR for ciprod & cidev images
+ACR_NAME=containerinsightsprod.azurecr.io
+AGENT_IMAGE_FULL_PATH=${ACR_NAME}/public/azuremonitor/containerinsights/${AGENT_RELEASE}:${AGENT_RELEASE}${AGENT_IMAGE_TAG_SUFFIX}
+AGENT_IMAGE_TAR_FILE_NAME=agentimage.tar.gz
+
+if [ -z $AGENT_IMAGE_TAG_SUFFIX ]; then
+    echo "-e error value of AGENT_IMAGE_TAG_SUFFIX variable shouldn't be empty"
+    exit 1
+fi
+
+if [ -z $AGENT_RELEASE ]; then
+    echo "-e error AGENT_RELEASE shouldn't be empty"
+    exit 1
+fi
+
+echo "ACR NAME - ${ACR_NAME}"
+echo "AGENT RELEASE - ${AGENT_RELEASE}"
+echo "AGENT IMAGE TAG SUFFIX - ${AGENT_IMAGE_TAG_SUFFIX}"
+echo "AGENT IMAGE FULL PATH - ${AGENT_IMAGE_FULL_PATH}"
+echo "AGENT IMAGE TAR FILE PATH - ${AGENT_IMAGE_TAR_FILE_NAME}"
+
+echo "loading linuxagent image tarball"
+IMAGE_NAME=$(docker load -i ${AGENT_IMAGE_TAR_FILE_NAME})
+echo IMAGE_NAME: $IMAGE_NAME
+if [ $? 
-ne 0 ]; then
+    echo "-e error, on loading linux agent tarball from ${AGENT_IMAGE_TAR_FILE_NAME}"
+    echo "** Please check if this was caused by a build error **"
+    exit 1
+else
+    echo "successfully loaded linux agent image tarball"
+fi
+# IMAGE_ID=$(docker images $IMAGE_NAME | awk '{print $3 }' | tail -1)
+# echo "Image Id is : ${IMAGE_ID}"
+prefix="Loadedimage:"
+IMAGE_NAME=$(echo $IMAGE_NAME | tr -d '"' | tr -d "[:space:]")
+IMAGE_NAME=${IMAGE_NAME/#$prefix}
+echo "*** trimmed image name-:${IMAGE_NAME}"
+echo "tagging the image $IMAGE_NAME as ${AGENT_IMAGE_FULL_PATH}"
+# docker tag $IMAGE_NAME ${AGENT_IMAGE_FULL_PATH}
+docker tag $IMAGE_NAME $AGENT_IMAGE_FULL_PATH
+
+if [ $? -ne 0 ]; then
+    echo "-e error tagging the image $IMAGE_NAME as ${AGENT_IMAGE_FULL_PATH}"
+    exit 1
+else
+    echo "successfully tagged the image $IMAGE_NAME as ${AGENT_IMAGE_FULL_PATH}"
+fi
+
+# use the pipeline identity to push the image to the ciprod acr
+echo "logging in to acr: ${ACR_NAME}"
+az acr login --name ${ACR_NAME}
+if [ $? -ne 0 ]; then
+    echo "-e error login to acr failed: ${ACR_NAME}"
+    exit 1
+else
+    echo "successfully logged into acr:${ACR_NAME}"
+fi
+
+echo "pushing ${AGENT_IMAGE_FULL_PATH}"
+docker push ${AGENT_IMAGE_FULL_PATH}
+if [ $? -ne 0 ]; then
+    echo "-e error on pushing the image ${AGENT_IMAGE_FULL_PATH}"
+    exit 1
+else
+    echo "Successfully pushed the image ${AGENT_IMAGE_FULL_PATH}"
+fi
diff --git a/kubernetes/linux/setup.sh b/kubernetes/linux/setup.sh
index 933c14aed..623f33cea 100644
--- a/kubernetes/linux/setup.sh
+++ b/kubernetes/linux/setup.sh
@@ -20,6 +20,9 @@ cp -f $TMPDIR/envmdsd /etc/mdsd.d
 sudo apt-get update
 sudo apt-get install inotify-tools -y
 
+#upgrade libsystemd0 to address CVE-2021-33910
+apt-get upgrade libsystemd0 -y
+
 #used to parse response of kubelet apis
 #ref: https://packages.ubuntu.com/search?keywords=jq
 sudo apt-get install jq=1.5+dfsg-2 -y
From 902c939562d4b573f00c7be5c8f1b5a126ca59bb Mon Sep 17 00:00:00 2001
From: Vishwanath
Date: Thu, 22 Jul 2021 21:10:30 -0700
Subject: [PATCH 131/194] remove un-used output plugin (#614)

---
 build/linux/installer/conf/telegraf-rs.conf | 20 --------------------
 build/linux/installer/conf/telegraf.conf    | 20 --------------------
 2 files changed, 40 deletions(-)

diff --git a/build/linux/installer/conf/telegraf-rs.conf b/build/linux/installer/conf/telegraf-rs.conf
index 0ca07f7e5..5de35d82c 100644
--- a/build/linux/installer/conf/telegraf-rs.conf
+++ b/build/linux/installer/conf/telegraf-rs.conf
@@ -124,26 +124,6 @@ namedrop = ["agent_telemetry", "file"]
   #tagdrop = ["AgentVersion","AKS_RESOURCE_ID", "ACS_RESOURCE_NAME", "Region","ClusterName","ClusterType", "Computer", "ControllerType"]
 
-[[outputs.application_insights]]
-  ## Instrumentation key of the Application Insights resource.
-  instrumentation_key = "$TELEMETRY_APPLICATIONINSIGHTS_KEY"
-
-  ## Timeout for closing (default: 5s).
-  # timeout = "5s"
-
-  ## Enable additional diagnostic logging.
-  # enable_diagnostic_logging = false
-
-  ## Context Tag Sources add Application Insights context tags to a tag value. 
- ## - ## For list of allowed context tag keys see: - ## https://github.com/Microsoft/ApplicationInsights-Go/blob/master/appinsights/contracts/contexttagkeys.go - # [outputs.application_insights.context_tag_sources] - # "ai.cloud.role" = "kubernetes_container_name" - # "ai.cloud.roleInstance" = "kubernetes_pod_name" - namepass = ["agent_telemetry"] - #tagdrop = ["nodeName"] - ############################################################################### # PROCESSOR PLUGINS # ############################################################################### diff --git a/build/linux/installer/conf/telegraf.conf b/build/linux/installer/conf/telegraf.conf index 8b6e2ad4b..0e4824e70 100644 --- a/build/linux/installer/conf/telegraf.conf +++ b/build/linux/installer/conf/telegraf.conf @@ -158,26 +158,6 @@ namepass = ["container.azm.ms/disk"] #fieldpass = ["used_percent"] -[[outputs.application_insights]] - ## Instrumentation key of the Application Insights resource. - instrumentation_key = "$TELEMETRY_APPLICATIONINSIGHTS_KEY" - - ## Timeout for closing (default: 5s). - # timeout = "5s" - - ## Enable additional diagnostic logging. - # enable_diagnostic_logging = false - - ## Context Tag Sources add Application Insights context tags to a tag value. - ## - ## For list of allowed context tag keys see: - ## https://github.com/Microsoft/ApplicationInsights-Go/blob/master/appinsights/contracts/contexttagkeys.go - # [outputs.application_insights.context_tag_sources] - # "ai.cloud.role" = "kubernetes_container_name" - # "ai.cloud.roleInstance" = "kubernetes_pod_name" - namepass = ["agent_telemetry"] - #tagdrop = ["nodeName"] - ############################################################################### # PROCESSOR PLUGINS # ############################################################################### From a76905a10afb0273f4ad9263e09fa3e71645d5fb Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Fri, 23 Jul 2021 09:55:40 -0700 Subject: [PATCH 132/194] fix telegraf telemetry and improve fluentd liveness (#611) * fix telegraf telemetry and improve fluentd liveness * address identified vuln with libsystemd0 * fix exported image file extension --- ...al.all_tag.all_phase.all_config.ci_prod.yml | 2 +- .pipelines/pipeline.user.windows.yml | 2 +- build/linux/installer/scripts/livenessprobe.sh | 18 +++++++++++++++++- source/plugins/go/src/oms.go | 2 ++ 4 files changed, 21 insertions(+), 3 deletions(-) diff --git a/.pipelines/pipeline.user.windows.official.all_tag.all_phase.all_config.ci_prod.yml b/.pipelines/pipeline.user.windows.official.all_tag.all_phase.all_config.ci_prod.yml index 8462f8e40..8ae069e90 100644 --- a/.pipelines/pipeline.user.windows.official.all_tag.all_phase.all_config.ci_prod.yml +++ b/.pipelines/pipeline.user.windows.official.all_tag.all_phase.all_config.ci_prod.yml @@ -53,4 +53,4 @@ package: repository_name: 'cdpxwin1809' # only supported ones are cdpx acr repos tag: 'win-ciprod' # OPTIONAL: Defaults to latest. The tag for the built image. Final tag will be 1.0.0alpha, 1.0.0-timestamp-commitID. latest: false # OPTIONAL: Defaults to false. If tag is not set to latest and this flag is set, then tag as latest as well and push latest as well. 
- export_to_artifact_path: 'agentimage.tar.gz' # path for exported image and use this instead of fixed tag + export_to_artifact_path: 'agentimage.tar.zip' # path for exported image and use this instead of fixed tag diff --git a/.pipelines/pipeline.user.windows.yml b/.pipelines/pipeline.user.windows.yml index 1690ad700..82dd30cd0 100644 --- a/.pipelines/pipeline.user.windows.yml +++ b/.pipelines/pipeline.user.windows.yml @@ -53,4 +53,4 @@ package: repository_name: 'cdpxwin1809' # only supported ones are cdpx acr repos tag: 'win-cidev' # OPTIONAL: Defaults to latest. The tag for the built image. Final tag will be 1.0.0alpha, 1.0.0-timestamp-commitID. latest: false # OPTIONAL: Defaults to false. If tag is not set to latest and this flag is set, then tag as latest as well and push latest as well. - export_to_artifact_path: 'agentimage.tar.gz' # path for exported image and use this instead of fixed tag + export_to_artifact_path: 'agentimage.tar.zip' # path for exported image and use this instead of fixed tag diff --git a/build/linux/installer/scripts/livenessprobe.sh b/build/linux/installer/scripts/livenessprobe.sh index 252f471e9..8ecb7fe44 100644 --- a/build/linux/installer/scripts/livenessprobe.sh +++ b/build/linux/installer/scripts/livenessprobe.sh @@ -11,13 +11,29 @@ fi #optionally test to exit non zero value if fluentd is not running #fluentd not used in sidecar container -if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then +if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then (ps -ef | grep "fluentd" | grep -v "grep") if [ $? -ne 0 ] then echo "fluentd is not running" > /dev/termination-log exit 1 fi + # fluentd launches by default supervisor and worker process + # so adding the liveness checks individually to handle scenario if any of the process dies + # supervisor process + (ps -ef | grep "fluentd" | grep "supervisor" | grep -v "grep") + if [ $? -ne 0 ] + then + echo "fluentd supervisor is not running" > /dev/termination-log + exit 1 + fi + # worker process + (ps -ef | grep "fluentd" | grep -v "supervisor" | grep -v "grep" ) + if [ $? -ne 0 ] + then + echo "fluentd worker is not running" > /dev/termination-log + exit 1 + fi fi #test to exit non zero value if fluentbit is not running diff --git a/source/plugins/go/src/oms.go b/source/plugins/go/src/oms.go index 0761ef664..026d36d6c 100644 --- a/source/plugins/go/src/oms.go +++ b/source/plugins/go/src/oms.go @@ -959,6 +959,7 @@ func PostTelegrafMetricsToLA(telegrafRecords []map[interface{}]interface{}) int if er != nil { Log("Error::mdsd::Failed to write to mdsd %d records after %s. Will retry ... 
error : %s", len(msgPackEntries), elapsed, er.Error()) + UpdateNumTelegrafMetricsSentTelemetry(0, 1, 0) if MdsdInsightsMetricsMsgpUnixSocketClient != nil { MdsdInsightsMetricsMsgpUnixSocketClient.Close() MdsdInsightsMetricsMsgpUnixSocketClient = nil @@ -970,6 +971,7 @@ func PostTelegrafMetricsToLA(telegrafRecords []map[interface{}]interface{}) int return output.FLB_RETRY } else { numTelegrafMetricsRecords := len(msgPackEntries) + UpdateNumTelegrafMetricsSentTelemetry(numTelegrafMetricsRecords, 0, 0) Log("Success::mdsd::Successfully flushed %d telegraf metrics records that was %d bytes to mdsd in %s ", numTelegrafMetricsRecords, bts, elapsed) } } From 52612b59d70629b94eb25f28149bf896d6b1e913 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Fri, 23 Jul 2021 11:54:11 -0700 Subject: [PATCH 133/194] Gangams/july 2021 release tasks 2 (#612) * tail rs mdsd err logs * configure mdsd log rotation * log rotation for mdsd log files --- .../linux/installer/conf/td-agent-bit-rs.conf | 13 +++++++ kubernetes/linux/Dockerfile | 2 +- kubernetes/linux/logrotate.conf | 39 +++++++++++++++++++ kubernetes/linux/main.sh | 6 +++ kubernetes/linux/setup.sh | 3 ++ 5 files changed, 62 insertions(+), 1 deletion(-) create mode 100644 kubernetes/linux/logrotate.conf diff --git a/build/linux/installer/conf/td-agent-bit-rs.conf b/build/linux/installer/conf/td-agent-bit-rs.conf index 9613c270d..da3738da7 100644 --- a/build/linux/installer/conf/td-agent-bit-rs.conf +++ b/build/linux/installer/conf/td-agent-bit-rs.conf @@ -23,6 +23,19 @@ Skip_Long_Lines On Ignore_Older 2m +[INPUT] + Name tail + Tag oms.container.log.flbplugin.mdsd.* + Path /var/opt/microsoft/linuxmonagent/log/mdsd.err + Read_from_Head true + DB /var/opt/microsoft/docker-cimprov/state/mdsd-ai.db + DB.Sync Off + Parser docker + Mem_Buf_Limit 1m + Path_Key filepath + Skip_Long_Lines On + Ignore_Older 2m + [INPUT] Name tcp Tag oms.container.perf.telegraf.* diff --git a/kubernetes/linux/Dockerfile b/kubernetes/linux/Dockerfile index 1ae7bef61..b47841757 100644 --- a/kubernetes/linux/Dockerfile +++ b/kubernetes/linux/Dockerfile @@ -17,7 +17,7 @@ ENV KUBE_CLIENT_BACKOFF_BASE 1 ENV KUBE_CLIENT_BACKOFF_DURATION 0 ENV RUBY_GC_HEAP_OLDOBJECT_LIMIT_FACTOR 0.9 RUN /usr/bin/apt-get update && /usr/bin/apt-get install -y libc-bin wget openssl curl sudo python-ctypes init-system-helpers net-tools rsyslog cron vim dmidecode apt-transport-https gnupg && rm -rf /var/lib/apt/lists/* -COPY setup.sh main.sh defaultpromenvvariables defaultpromenvvariables-rs defaultpromenvvariables-sidecar mdsd.xml envmdsd $tmpdir/ +COPY setup.sh main.sh defaultpromenvvariables defaultpromenvvariables-rs defaultpromenvvariables-sidecar mdsd.xml envmdsd logrotate.conf $tmpdir/ WORKDIR ${tmpdir} # copy docker provider shell bundle to use the agent image diff --git a/kubernetes/linux/logrotate.conf b/kubernetes/linux/logrotate.conf new file mode 100644 index 000000000..921371fd0 --- /dev/null +++ b/kubernetes/linux/logrotate.conf @@ -0,0 +1,39 @@ +/var/opt/microsoft/linuxmonagent/log/mdsd.err { + copytruncate + rotate 7 + missingok + notifempty + delaycompress + compress + size 10M +} + +/var/opt/microsoft/linuxmonagent/log/mdsd.warn { + copytruncate + rotate 7 + missingok + notifempty + delaycompress + compress + size 10M +} + +/var/opt/microsoft/linuxmonagent/log/mdsd.info { + copytruncate + rotate 7 + missingok + notifempty + delaycompress + compress + size 10M +} + +/var/opt/microsoft/linuxmonagent/log/mdsd.qos { + copytruncate + rotate 7 + missingok + notifempty + delaycompress + compress 
+ size 10M +} diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index 428e6f35a..4579787b3 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -521,6 +521,12 @@ else mdsd ${MDSD_AAD_MSI_AUTH_ARGS} -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos & fi +# Set up a cron job for logrotation +if [ ! -f /etc/cron.d/ci-agent ]; then + echo "setting up cronjob for ci agent log rotation" + echo "*/5 * * * * root /usr/sbin/logrotate -s /var/lib/logrotate/ci-agent-status /etc/logrotate.d/ci-agent >/dev/null 2>&1" > /etc/cron.d/ci-agent +fi + # no dependency on fluentd for prometheus side car container if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then if [ ! -e "/etc/config/kube.conf" ]; then diff --git a/kubernetes/linux/setup.sh b/kubernetes/linux/setup.sh index 623f33cea..51e5f9efb 100644 --- a/kubernetes/linux/setup.sh +++ b/kubernetes/linux/setup.sh @@ -16,6 +16,9 @@ wget https://github.com/microsoft/Docker-Provider/releases/download/06242021-one cp -f $TMPDIR/mdsd.xml /etc/mdsd.d cp -f $TMPDIR/envmdsd /etc/mdsd.d +#log rotate conf for mdsd and can be extended for other log files as well +cp -f $TMPDIR/logrotate.conf /etc/logrotate.d/ci-agent + #download inotify tools for watching configmap changes sudo apt-get update sudo apt-get install inotify-tools -y From 5b5d048fdb662ed8427b7825c159feafba3cbcac Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Fri, 13 Aug 2021 08:57:29 -0700 Subject: [PATCH 134/194] Fix out_oms.go dependency vulnerabilities (#623) --- .github/workflows/pr-checker.yml | 2 +- source/plugins/go/src/go.mod | 28 +- source/plugins/go/src/go.sum | 490 +++++++++++++++++++++++++++---- source/plugins/go/src/oms.go | 4 +- 4 files changed, 437 insertions(+), 87 deletions(-) diff --git a/.github/workflows/pr-checker.yml b/.github/workflows/pr-checker.yml index f3bdb27e8..bae117dbe 100644 --- a/.github/workflows/pr-checker.yml +++ b/.github/workflows/pr-checker.yml @@ -56,7 +56,7 @@ jobs: format: 'table' severity: 'CRITICAL,HIGH' vuln-type: 'os,library' - skip-dirs: 'opt/telegraf' + skip-dirs: 'opt/telegraf,usr/sbin/telegraf' exit-code: '1' timeout: '5m0s' WINDOWS-build: diff --git a/source/plugins/go/src/go.mod b/source/plugins/go/src/go.mod index db29a0553..4ead145ac 100644 --- a/source/plugins/go/src/go.mod +++ b/source/plugins/go/src/go.mod @@ -3,33 +3,17 @@ module Docker-Provider/source/plugins/go/src go 1.14 require ( - code.cloudfoundry.org/clock v1.0.1-0.20200131002207-86534f4ca3a5 // indirect github.com/Azure/azure-kusto-go v0.3.2 github.com/Azure/go-autorest/autorest/azure/auth v0.4.2 + github.com/Azure/go-autorest/autorest/to v0.4.0 // indirect + github.com/dnaeon/go-vcr v1.2.0 // indirect github.com/fluent/fluent-bit-go v0.0.0-20171103221316-c4a158a6e3a7 - github.com/ghodss/yaml v0.0.0-20150909031657-73d445a93680 // indirect - github.com/gogo/protobuf v0.0.0-20170330071051-c0656edd0d9e // indirect - github.com/golang/glog v0.0.0-20141105023935-44145f04b68c // indirect - github.com/google/btree v0.0.0-20160524151835-7d79101e329e // indirect - github.com/google/gofuzz v0.0.0-20161122191042-44d81051d367 // indirect - github.com/google/uuid v1.1.1 - github.com/googleapis/gnostic v0.0.0-20170729233727-0c5108395e2d // indirect - github.com/gregjones/httpcache v0.0.0-20170728041850-787624de3eb7 // indirect - github.com/json-iterator/go v0.0.0-20180612202835-f2b4162afba3 // indirect + github.com/google/uuid v1.1.2 github.com/microsoft/ApplicationInsights-Go v0.4.3 - 
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect - github.com/modern-go/reflect2 v0.0.0-20180320133207-05fbef0ca5da // indirect - github.com/peterbourgon/diskv v2.0.1+incompatible // indirect - github.com/philhofer/fwd v1.0.0 // indirect - github.com/satori/go.uuid v1.2.1-0.20181028125025-b2ce2384e17b // indirect + github.com/philhofer/fwd v1.1.1 // indirect github.com/tinylib/msgp v1.1.2 github.com/ugorji/go v1.1.2-0.20180813092308-00b869d2f4a5 - golang.org/x/net v0.0.0-20200421231249-e086a090c8fd // indirect - golang.org/x/time v0.0.0-20161028155119-f51c12702a4d // indirect - gopkg.in/inf.v0 v0.9.0 // indirect gopkg.in/natefinch/lumberjack.v2 v2.0.0-20170531160350-a96e63847dc3 - k8s.io/api v0.0.0-20180628040859-072894a440bd // indirect - k8s.io/apimachinery v0.0.0-20180621070125-103fd098999d - k8s.io/client-go v8.0.0+incompatible - golang.org/x/crypto v0.0.0-20201216223049-8b5274cf687f + k8s.io/apimachinery v0.21.0 + k8s.io/client-go v0.21.0 ) diff --git a/source/plugins/go/src/go.sum b/source/plugins/go/src/go.sum index 7e8b3d765..7f93bb260 100644 --- a/source/plugins/go/src/go.sum +++ b/source/plugins/go/src/go.sum @@ -1,10 +1,28 @@ +cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= +cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= +cloud.google.com/go v0.38.0/go.mod h1:990N+gfupTy94rShfmMCWGDn0LpTmnzTp2qbd1dvSRU= +cloud.google.com/go v0.44.1/go.mod h1:iSa0KzasP4Uvy3f1mN/7PiObzGgflwredwwASm/v6AU= +cloud.google.com/go v0.44.2/go.mod h1:60680Gw3Yr4ikxnPRS/oxxkBccT6SA1yMk63TGekxKY= +cloud.google.com/go v0.45.1/go.mod h1:RpBamKRgapWJb87xiFSdk4g1CME7QZg3uwTez+TSTjc= +cloud.google.com/go v0.46.3/go.mod h1:a6bKKbmY7er1mI7TEI4lsAkts/mkhTSZK8w33B4RAg0= +cloud.google.com/go v0.50.0/go.mod h1:r9sluTvynVuxRIOHXQEHMFffphuXHOMZMycpNR5e6To= +cloud.google.com/go v0.52.0/go.mod h1:pXajvRH/6o3+F9jDHZWQ5PbGhn+o8w9qiu/CffaVdO4= +cloud.google.com/go v0.53.0/go.mod h1:fp/UouUEsRkN6ryDKNW/Upv/JBKnv6WDthjR6+vze6M= +cloud.google.com/go v0.54.0/go.mod h1:1rq2OEkV3YMf6n/9ZvGWI3GWw0VoqH/1x2nd8Is/bPc= +cloud.google.com/go/bigquery v1.0.1/go.mod h1:i/xbL2UlR5RvWAURpBYZTtm/cXjCha9lbfbpx4poX+o= +cloud.google.com/go/bigquery v1.3.0/go.mod h1:PjpwJnslEMmckchkHFfq+HTD2DmtT67aNFKH1/VBDHE= +cloud.google.com/go/bigquery v1.4.0/go.mod h1:S8dzgnTigyfTmLBfrtrhyYhwRxG72rYxvftPBK2Dvzc= +cloud.google.com/go/datastore v1.0.0/go.mod h1:LXYbyblFSglQ5pkeyhO+Qmw7ukd3C+pD7TKLgZqpHYE= +cloud.google.com/go/datastore v1.1.0/go.mod h1:umbIZjpQpHh4hmRpGhH4tLFup+FVzqBi1b3c64qFpCk= +cloud.google.com/go/pubsub v1.0.1/go.mod h1:R0Gpsv3s54REJCy4fxDixWD93lHJMoZTyQ2kNxGRt3I= +cloud.google.com/go/pubsub v1.1.0/go.mod h1:EwwdRX2sKPjnvnqCa270oGRyludottCI76h+R3AArQw= +cloud.google.com/go/pubsub v1.2.0/go.mod h1:jhfEVHT8odbXTkndysNHCcx0awwzvfOlguIAii9o8iA= +cloud.google.com/go/storage v1.0.0/go.mod h1:IhtSnM/ZTZV8YYJWCY8RULGVqBDmpoyjwiyrjsg+URw= +cloud.google.com/go/storage v1.5.0/go.mod h1:tpKbwo567HUNpVclU5sGELwQWBDZ8gh0ZeosJ0Rtdos= +cloud.google.com/go/storage v1.6.0/go.mod h1:N7U0C8pVQ/+NIKOBQyamJIeKQKkZ+mxpohlUTyfDhBk= +code.cloudfoundry.org/clock v0.0.0-20180518195852-02e53af36e6c h1:5eeuG0BHx1+DHeT3AP+ISKZ2ht1UjGhm581ljqYpVeQ= code.cloudfoundry.org/clock v0.0.0-20180518195852-02e53af36e6c/go.mod h1:QD9Lzhd/ux6eNQVUDVRJX/RKTigpewimNYBi7ivZKY8= -code.cloudfoundry.org/clock v1.0.1-0.20200131002207-86534f4ca3a5 h1:LTlZ2AD8IV/d1JRzB+HHfZfF1M+K8lyOlN28zDEpw7U= -code.cloudfoundry.org/clock v1.0.1-0.20200131002207-86534f4ca3a5/go.mod 
h1:QD9Lzhd/ux6eNQVUDVRJX/RKTigpewimNYBi7ivZKY8= -github.com/Azure/azure-kusto-go v0.1.3 h1:0u+YqfIvwj5PHd+moXwtlxVePt8xTLU1ixM8Q6PjJ3o= -github.com/Azure/azure-kusto-go v0.1.3/go.mod h1:55hwXJ3PaahmWZFP7VC4+PlgsSUuetSA30rFtYFabfc= -github.com/Azure/azure-kusto-go v0.1.4-0.20200427191510-041d4ed55f86 h1:vyhCediIKg1gZ9H/kMcutU8F8BFNhxLk76Gti8UAOzo= -github.com/Azure/azure-kusto-go v0.1.4-0.20200427191510-041d4ed55f86/go.mod h1:55hwXJ3PaahmWZFP7VC4+PlgsSUuetSA30rFtYFabfc= +dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU= github.com/Azure/azure-kusto-go v0.3.2 h1:XpS9co6GvEDl2oICF9HsjEsQVwEpRK6wbNWb9Z+uqsY= github.com/Azure/azure-kusto-go v0.3.2/go.mod h1:wd50n4qlsSxh+G4f80t+Fnl2ShK9AcXD+lMOstiKuYo= github.com/Azure/azure-pipeline-go v0.1.8/go.mod h1:XA1kFWRVhSK+KNFiOhfv83Fv8L9achrP7OxIzeTn1Yg= @@ -16,18 +34,21 @@ github.com/Azure/azure-storage-blob-go v0.8.0 h1:53qhf0Oxa0nOjgbDeeYPUeyiNmafAFE github.com/Azure/azure-storage-blob-go v0.8.0/go.mod h1:lPI3aLPpuLTeUwh1sViKXFxwl2B6teiRqI0deQUvsw0= github.com/Azure/azure-storage-queue-go v0.0.0-20191125232315-636801874cdd h1:b3wyxBl3vvr15tUAziPBPK354y+LSdfPCpex5oBttHo= github.com/Azure/azure-storage-queue-go v0.0.0-20191125232315-636801874cdd/go.mod h1:K6am8mT+5iFXgingS9LUc7TmbsW6XBw3nxaRyaMyWc8= -github.com/Azure/go-autorest v1.1.1 h1:4G9tVCqooRY3vDTB2bA1Z01PlSALtnUbji0AfzthUSs= -github.com/Azure/go-autorest v14.1.1+incompatible h1:m2F62e1Zk5DV3HENGdH/wEuzvJZIynHG4fHF7oiQwgE= +github.com/Azure/go-autorest v14.2.0+incompatible h1:V5VMDjClD3GiElqLWO7mz2MxNAK/vTfRHdAubSIPRgs= +github.com/Azure/go-autorest v14.2.0+incompatible/go.mod h1:r+4oMnoxhatjLLJ6zxSWATqVooLgysK6ZNox3g/xq24= github.com/Azure/go-autorest/autorest v0.9.0/go.mod h1:xyHB1BMZT0cuDHU7I0+g046+BFDTQ8rEZB0s4Yfa6bI= github.com/Azure/go-autorest/autorest v0.9.3/go.mod h1:GsRuLYvwzLjjjRoWEIyMUaYq8GNUx2nRB378IPt/1p0= github.com/Azure/go-autorest/autorest v0.10.0 h1:mvdtztBqcL8se7MdrUweNieTNi4kfNG6GOJuurQJpuY= github.com/Azure/go-autorest/autorest v0.10.0/go.mod h1:/FALq9T/kS7b5J5qsQ+RSTUdAmGFqi0vUdVNNx8q630= -github.com/Azure/go-autorest/autorest v0.10.2 h1:NuSF3gXetiHyUbVdneJMEVyPUYAe5wh+aN08JYAf1tI= +github.com/Azure/go-autorest/autorest v0.11.12 h1:gI8ytXbxMfI+IVbI9mP2JGCTXIuhHLgRlvQ9X4PsnHE= +github.com/Azure/go-autorest/autorest v0.11.12/go.mod h1:eipySxLmqSyC5s5k1CLupqet0PSENBEDP93LQ9a8QYw= github.com/Azure/go-autorest/autorest/adal v0.5.0/go.mod h1:8Z9fGy2MpX0PvDjB1pEgQTmVqjGhiHBW7RJJEciWzS0= github.com/Azure/go-autorest/autorest/adal v0.8.0/go.mod h1:Z6vX6WXXuyieHAXwMj0S6HY6e6wcHn37qQMBQlvY3lc= github.com/Azure/go-autorest/autorest/adal v0.8.1/go.mod h1:ZjhuQClTqx435SRJ2iMlOxPYt3d2C/T/7TiQCVZSn3Q= github.com/Azure/go-autorest/autorest/adal v0.8.2 h1:O1X4oexUxnZCaEUGsvMnr8ZGj8HI37tNezwY4npRqA0= github.com/Azure/go-autorest/autorest/adal v0.8.2/go.mod h1:ZjhuQClTqx435SRJ2iMlOxPYt3d2C/T/7TiQCVZSn3Q= +github.com/Azure/go-autorest/autorest/adal v0.9.5 h1:Y3bBUV4rTuxenJJs41HU3qmqsb+auo+a3Lz+PlJPpL0= +github.com/Azure/go-autorest/autorest/adal v0.9.5/go.mod h1:B7KF7jKIeC9Mct5spmyCB/A8CG/sEz1vwIRGv/bbw7A= github.com/Azure/go-autorest/autorest/azure/auth v0.4.2 h1:iM6UAvjR97ZIeR93qTcwpKNMpV+/FTWjwEbuPD495Tk= github.com/Azure/go-autorest/autorest/azure/auth v0.4.2/go.mod h1:90gmfKdlmKgfjUpnCEpOJzsUEjrWDSLwHIG73tSXddM= github.com/Azure/go-autorest/autorest/azure/cli v0.3.1 h1:LXl088ZQlP0SBppGFsRZonW6hSvwgL5gRByMbvUbx8U= @@ -35,128 +56,471 @@ github.com/Azure/go-autorest/autorest/azure/cli v0.3.1/go.mod h1:ZG5p860J94/0kI9 
github.com/Azure/go-autorest/autorest/date v0.1.0/go.mod h1:plvfp3oPSKwf2DNjlBjWF/7vwR+cUD/ELuzDCXwHUVA= github.com/Azure/go-autorest/autorest/date v0.2.0 h1:yW+Zlqf26583pE43KhfnhFcdmSWlm5Ew6bxipnr/tbM= github.com/Azure/go-autorest/autorest/date v0.2.0/go.mod h1:vcORJHLJEh643/Ioh9+vPmf1Ij9AEBM5FuBIXLmIy0g= +github.com/Azure/go-autorest/autorest/date v0.3.0 h1:7gUk1U5M/CQbp9WoqinNzJar+8KY+LPI6wiWrP/myHw= +github.com/Azure/go-autorest/autorest/date v0.3.0/go.mod h1:BI0uouVdmngYNUzGWeSYnokU+TrmwEsOqdt8Y6sso74= github.com/Azure/go-autorest/autorest/mocks v0.1.0/go.mod h1:OTyCOPRA2IgIlWxVYxBee2F5Gr4kF2zd2J5cFRaIDN0= github.com/Azure/go-autorest/autorest/mocks v0.2.0/go.mod h1:OTyCOPRA2IgIlWxVYxBee2F5Gr4kF2zd2J5cFRaIDN0= github.com/Azure/go-autorest/autorest/mocks v0.3.0/go.mod h1:a8FDP3DYzQ4RYfVAxAN3SVSiiO77gL2j2ronKKP0syM= +github.com/Azure/go-autorest/autorest/mocks v0.4.1 h1:K0laFcLE6VLTOwNgSxaGbUcLPuGXlNkbVvq4cW4nIHk= +github.com/Azure/go-autorest/autorest/mocks v0.4.1/go.mod h1:LTp+uSrOhSkaKrUy935gNZuuIPPVsHlr9DSOxSayd+k= +github.com/Azure/go-autorest/autorest/to v0.4.0 h1:oXVqrxakqqV1UZdSazDOPOLvOIz+XA683u8EctwboHk= +github.com/Azure/go-autorest/autorest/to v0.4.0/go.mod h1:fE8iZBn7LQR7zH/9XU2NcPR4o9jEImooCeWJcYV/zLE= github.com/Azure/go-autorest/logger v0.1.0 h1:ruG4BSDXONFRrZZJ2GUXDiUyVpayPmb1GnWeHDdaNKY= github.com/Azure/go-autorest/logger v0.1.0/go.mod h1:oExouG+K6PryycPJfVSxi/koC6LSNgds39diKLz7Vrc= +github.com/Azure/go-autorest/logger v0.2.0 h1:e4RVHVZKC5p6UANLJHkM4OfR1UKZPj8Wt8Pcx+3oqrE= +github.com/Azure/go-autorest/logger v0.2.0/go.mod h1:T9E3cAhj2VqvPOtCYAvby9aBXkZmbF5NWuPV8+WeEW8= github.com/Azure/go-autorest/tracing v0.5.0 h1:TRn4WjSnkcSy5AEG3pnbtFSwNtwzjr4VYyQflFE619k= github.com/Azure/go-autorest/tracing v0.5.0/go.mod h1:r/s2XiOKccPW3HrqB+W0TQzfbtp2fGCgRFtBroKn4Dk= -github.com/Microsoft/ApplicationInsights-Go v0.4.2 h1:HIZoGXMiKNwAtMAgCSSX35j9mP+DjGF9ezfBvxMDLLg= -github.com/Microsoft/ApplicationInsights-Go v0.4.2/go.mod h1:CukZ/G66zxXtI+h/VcVn3eVVDGDHfXM2zVILF7bMmsg= +github.com/Azure/go-autorest/tracing v0.6.0 h1:TYi4+3m5t6K48TGI9AUdb+IzbnSxvnvUMfuitfgcfuo= +github.com/Azure/go-autorest/tracing v0.6.0/go.mod h1:+vhtPC754Xsa23ID7GlGsrdKBpUA79WCAKPPZVC2DeU= +github.com/BurntSushi/toml v0.3.1 h1:WXkYYl6Yr3qBf1K79EBnL4mak0OimBfB0XUf9Vl28OQ= +github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= +github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= +github.com/NYTimes/gziphandler v0.0.0-20170623195520-56545f4a5d46/go.mod h1:3wb06e3pkSAbeQ52E9H9iFoQsEEwGN64994WTCIhntQ= +github.com/PuerkitoBio/purell v1.1.1/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0= +github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578/go.mod h1:uGdkoq3SwY9Y+13GIhn11/XLaGBb4BfwItxLd5jeuXE= +github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a/go.mod h1:lB+ZfQJz7igIIfQNfa7Ml4HSf2uFQQRzpGGRXenZAgY= +github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= +github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI= +github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI= +github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU= +github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= +github.com/creack/pty v1.1.9/go.mod 
h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/dgrijalva/jwt-go v3.2.0+incompatible h1:7qlOGliEKZXTDg6OTjfoBKDXWrumCAMpl/TFQ4/5kLM= github.com/dgrijalva/jwt-go v3.2.0+incompatible/go.mod h1:E3ru+11k8xSBh+hMPgOLZmtrrCbhqsmaPHjLKYnJCaQ= github.com/dimchansky/utfbom v1.1.0 h1:FcM3g+nofKgUteL8dm/UpdRXNC9KmADgTpLKsu0TRo4= github.com/dimchansky/utfbom v1.1.0/go.mod h1:rO41eb7gLfo8SF1jd9F8HplJm1Fewwi4mQvIirEdv+8= +github.com/dnaeon/go-vcr v1.2.0 h1:zHCHvJYTMh1N7xnV7zf1m1GPBF9Ad0Jk/whtQ1663qI= +github.com/dnaeon/go-vcr v1.2.0/go.mod h1:R4UdLID7HZT3taECzJs4YgbbH6PIGXB6W/sc5OLb6RQ= +github.com/docopt/docopt-go v0.0.0-20180111231733-ee0de3bc6815/go.mod h1:WwZ+bS3ebgob9U8Nd0kOddGdZWjyMGR8Wziv+TBNwSE= +github.com/elazarl/goproxy v0.0.0-20180725130230-947c36da3153/go.mod h1:/Zj4wYkgs4iZTTu3o/KG3Itv/qCCa8VVMlb3i9OVuzc= +github.com/emicklei/go-restful v0.0.0-20170410110728-ff4f55a20633/go.mod h1:otzb+WCGbkyDHkqmQmT5YD2WR4BBwUdeQoFo8l/7tVs= +github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= +github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= +github.com/evanphx/json-patch v4.9.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= github.com/fluent/fluent-bit-go v0.0.0-20171103221316-c4a158a6e3a7 h1:mck6KdLX2FTh2/ZD27dK69ehWDZR4hCk+nLf+HvAbDk= github.com/fluent/fluent-bit-go v0.0.0-20171103221316-c4a158a6e3a7/go.mod h1:JVF1Nl3QOPpKTR8xDjhkm0xINYUX0z4XdJvOpIUF+Eo= +github.com/form3tech-oss/jwt-go v3.2.2+incompatible h1:TcekIExNqud5crz4xD2pavyTgWiPvpYe4Xau31I0PRk= +github.com/form3tech-oss/jwt-go v3.2.2+incompatible/go.mod h1:pbq4aXjuKjdthFRnoDwaVPLA+WlJuPGy+QneDUgJi2k= +github.com/fsnotify/fsnotify v1.4.7 h1:IXs+QLmnXW2CcXuY+8Mzv/fWEsPGWxqefPtCP5CnV9I= github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= -github.com/ghodss/yaml v0.0.0-20150909031657-73d445a93680 h1:ZktWZesgun21uEDrwW7iEV1zPCGQldM2atlJZ3TdvVM= -github.com/ghodss/yaml v0.0.0-20150909031657-73d445a93680/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= -github.com/gogo/protobuf v0.0.0-20170330071051-c0656edd0d9e h1:ago6fNuQ6IhszPsXkeU7qRCyfsIX7L67WDybsAPkLl8= -github.com/gogo/protobuf v0.0.0-20170330071051-c0656edd0d9e/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ= -github.com/golang/glog v0.0.0-20141105023935-44145f04b68c h1:CbdkBQ1/PiAo0FYJhQGwASD8wrgNvTdf01g6+O9tNuA= -github.com/golang/glog v0.0.0-20141105023935-44145f04b68c/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= -github.com/golang/protobuf v1.1.0 h1:0iH4Ffd/meGoXqF2lSAhZHt8X+cPgkfn/cb6Cce5Vpc= -github.com/golang/protobuf v1.1.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/go-gl/glfw v0.0.0-20190409004039-e6da0acd62b1/go.mod h1:vR7hzQXu2zJy9AVAgeJqvqgH9Q5CA+iKCZ2gyEVpxRU= +github.com/go-gl/glfw/v3.3/glfw v0.0.0-20191125211704-12ad95a8df72/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= +github.com/go-gl/glfw/v3.3/glfw v0.0.0-20200222043503-6f7a984d4dc4/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= +github.com/go-logr/logr v0.1.0/go.mod h1:ixOQHD9gLJUVQQ2ZOR7zLEifBX6tGkNJF4QyIY7sIas= +github.com/go-logr/logr v0.4.0 h1:K7/B1jt6fIBQVd4Owv2MqGQClcgf0R266+7C/QjRcLc= 
+github.com/go-logr/logr v0.4.0/go.mod h1:z6/tIYblkpsD+a4lm/fGIIU9mZ+XfAiaFtq7xTgseGU= +github.com/go-openapi/jsonpointer v0.19.2/go.mod h1:3akKfEdA7DF1sugOqz1dVQHBcuDBPKZGEoHC/NkiQRg= +github.com/go-openapi/jsonpointer v0.19.3/go.mod h1:Pl9vOtqEWErmShwVjC8pYs9cog34VGT37dQOVbmoatg= +github.com/go-openapi/jsonreference v0.19.2/go.mod h1:jMjeRr2HHw6nAVajTXJ4eiUwohSTlpa0o73RUL1owJc= +github.com/go-openapi/jsonreference v0.19.3/go.mod h1:rjx6GuL8TTa9VaixXglHmQmIL98+wF9xc8zWvFonSJ8= +github.com/go-openapi/spec v0.19.3/go.mod h1:FpwSN1ksY1eteniUU7X0N/BgJ7a4WvBFVA8Lj9mJglo= +github.com/go-openapi/swag v0.19.2/go.mod h1:POnQmlKehdgb5mhVOsnJFsivZCEZ/vjK9gh66Z9tfKk= +github.com/go-openapi/swag v0.19.5/go.mod h1:POnQmlKehdgb5mhVOsnJFsivZCEZ/vjK9gh66Z9tfKk= +github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= +github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= +github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= +github.com/golang/groupcache v0.0.0-20190702054246-869f871628b6/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/groupcache v0.0.0-20191227052852-215e87163ea7/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= +github.com/golang/mock v1.2.0/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= +github.com/golang/mock v1.3.1/go.mod h1:sBzyDLLjw3U8JLTeZvSv8jJB+tU5PVekmnlKIyFUx0Y= +github.com/golang/mock v1.4.0/go.mod h1:UOMv5ysSaYNkG+OFQykRIcU/QvvxJf3p21QfJ2Bt3cw= +github.com/golang/mock v1.4.1/go.mod h1:UOMv5ysSaYNkG+OFQykRIcU/QvvxJf3p21QfJ2Bt3cw= github.com/golang/protobuf v1.2.0 h1:P3YflyNX/ehuJFLhxviNdFxQPkGK5cDcApsge1SqnvM= github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= -github.com/google/btree v0.0.0-20160524151835-7d79101e329e h1:JHB7F/4TJCrYBW8+GZO8VkWDj1jxcWuCl6uxKODiyi4= -github.com/google/btree v0.0.0-20160524151835-7d79101e329e/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= -github.com/google/gofuzz v0.0.0-20161122191042-44d81051d367 h1:ScAXWS+TR6MZKex+7Z8rneuSJH+FSDqd6ocQyl+ZHo4= -github.com/google/gofuzz v0.0.0-20161122191042-44d81051d367/go.mod h1:HP5RmnzzSNb993RKQDq4+1A4ia9nllfqcQFTQJedwGI= +github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.3.3/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw= +github.com/golang/protobuf v1.3.4/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw= +github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8= +github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod h1:xKAWHe0F5eneWXFV3EuXVDTCmh+JuBKY0li0aMyXATA= +github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs= +github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w= +github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0= +github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QDs8UjoX8= +github.com/golang/protobuf v1.4.3 h1:JjCZWpVbqXDqFVmTfYWEVTMIYrL/NPdPSCHPJ0T/raM= +github.com/golang/protobuf v1.4.3/go.mod 
h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= +github.com/google/btree v0.0.0-20180813153112-4030bb1f1f0c/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= +github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= +github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= +github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= +github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= +github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.2 h1:X2ev0eStA3AbceY54o37/0PQ/UWqKEiiO2dKL5OPaFM= +github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/gofuzz v1.1.0 h1:Hsa8mG0dQ46ij8Sl2AYJDUv1oA9/d6Vk+3LG99Oe02g= +github.com/google/gofuzz v1.1.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/martian v2.1.0+incompatible/go.mod h1:9I4somxYTbIHy5NJKHRl3wXiIaQGbYVAs8BPL6v8lEs= +github.com/google/pprof v0.0.0-20181206194817-3ea8567a2e57/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc= +github.com/google/pprof v0.0.0-20190515194954-54271f7e092f/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc= +github.com/google/pprof v0.0.0-20191218002539-d4f498aebedc/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= +github.com/google/pprof v0.0.0-20200212024743-f11f1df84d12/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= +github.com/google/pprof v0.0.0-20200229191704-1ebb73c60ed3/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= +github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI= github.com/google/uuid v1.1.1 h1:Gkbcsh/GbpXz7lPftLA3P6TYMwjCLYm83jiFQZF/3gY= github.com/google/uuid v1.1.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/googleapis/gnostic v0.0.0-20170729233727-0c5108395e2d h1:7XGaL1e6bYS1yIonGp9761ExpPPV1ui0SAC59Yube9k= -github.com/googleapis/gnostic v0.0.0-20170729233727-0c5108395e2d/go.mod h1:sJBsCZ4ayReDTBIg8b9dl28c5xFWyhBTVRp3pOg5EKY= -github.com/gregjones/httpcache v0.0.0-20170728041850-787624de3eb7 h1:6TSoaYExHper8PYsJu23GWVNOyYRCSnIFyxKgLSZ54w= -github.com/gregjones/httpcache v0.0.0-20170728041850-787624de3eb7/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA= +github.com/google/uuid v1.1.2 h1:EVhdT+1Kseyi1/pUmXKaFxYsDNy9RQYkMWRH68J/W7Y= +github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/googleapis/gax-go/v2 v2.0.4/go.mod h1:0Wqv26UfaUD9n4G6kQubkQ+KchISgw+vpHVxEJEs9eg= +github.com/googleapis/gax-go/v2 v2.0.5/go.mod h1:DWXyrwAJ9X0FpwwEdw+IPEYBICEFu5mhpdKc/us6bOk= +github.com/googleapis/gnostic v0.4.1 h1:DLJCy1n/vrD4HPjOvYcT8aYQXpPIzoRZONaYwyycI+I= +github.com/googleapis/gnostic v0.4.1/go.mod h1:LRhVm6pbyptWbWbuZ38d1eyptfvIytN3ir6b65WBswg= +github.com/gorilla/websocket v1.4.2/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= +github.com/gregjones/httpcache v0.0.0-20180305231024-9cad4c3443a7/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA= +github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= +github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= +github.com/hpcloud/tail v1.0.0 h1:nfCOvKYfkgYP8hkirhJocXT2+zOD8yUNjXaWfTlyFKI= 
github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= -github.com/json-iterator/go v0.0.0-20180612202835-f2b4162afba3 h1:/UewZcckqhvnnS0C6r3Sher2hSEbVmM6Ogpcjen08+Y= -github.com/json-iterator/go v0.0.0-20180612202835-f2b4162afba3/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU= +github.com/ianlancetaylor/demangle v0.0.0-20181102032728-5e5cf60278f6/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= +github.com/imdario/mergo v0.3.5/go.mod h1:2EnlNZ0deacrJVfApfmtdGgDfMuh/nq6Ok1EcJh5FfA= +github.com/json-iterator/go v1.1.6/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU= +github.com/json-iterator/go v1.1.10 h1:Kz6Cvnvv2wGdaG/V8yMvfkmNiXq9Ya2KUv4rouJJr68= +github.com/json-iterator/go v1.1.10/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= +github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod h1:6v2b51hI/fHJwM22ozAgKL4VKDeJcHhJFhtBdhmNjmU= +github.com/jstemmer/go-junit-report v0.9.1/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/XSXhF0NWZEnDohbsk= +github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= +github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= +github.com/kr/pretty v0.2.0/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/pty v1.1.5/go.mod h1:9r2w37qlBe7rQ6e1fg1S/9xpWHSnaqNdHD3WcMdbPDA= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= +github.com/mailru/easyjson v0.0.0-20190614124828-94de47d64c63/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= +github.com/mailru/easyjson v0.0.0-20190626092158-b2ccc519800e/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= github.com/mattn/go-ieproxy v0.0.0-20190610004146-91bb50d98149 h1:HfxbT6/JcvIljmERptWhwa8XzP7H3T+Z2N26gTsaDaA= github.com/mattn/go-ieproxy v0.0.0-20190610004146-91bb50d98149/go.mod h1:31jz6HNzdxOmlERGGEc4v/dMssOfmp2p5bT/okiKFFc= -github.com/microsoft/ApplicationInsights-Go v0.4.2 h1:LCv4NtCpXpsUF6ZUzZdpVG2x4RwebY7tiJUb25uYXiM= -github.com/microsoft/ApplicationInsights-Go v0.4.2/go.mod h1:DupRHRNoeuH4j8Yv3nux9/IXo3HZ0kO5A1ykNK4vR2E= github.com/microsoft/ApplicationInsights-Go v0.4.3 h1:gBuy5rM3o6Zo69QTkq1Ens8wx6sVf+mpgMjjfayiRcw= github.com/microsoft/ApplicationInsights-Go v0.4.3/go.mod h1:ih0t3h84PdzV1qGeUs89o9wL8eCuwf24M7TZp/nyqXk= github.com/mitchellh/go-homedir v1.1.0 h1:lukF9ziXFxDFPkA1vsr5zpc1XuPDn/wFntq5mG+4E0Y= github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0= +github.com/mitchellh/mapstructure v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y= +github.com/moby/spdystream v0.2.0/go.mod h1:f7i0iNDQJ059oMTcWxx8MA/zKFIuD/lY+0GqbN2Wy8c= +github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= 
-github.com/modern-go/reflect2 v0.0.0-20180320133207-05fbef0ca5da h1:ZQGIPjr1iTtUPXZFk8WShqb5G+Qg65VHFLtSvmHh+Mw= -github.com/modern-go/reflect2 v0.0.0-20180320133207-05fbef0ca5da/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= +github.com/modern-go/reflect2 v0.0.0-20180701023420-4b7aa43c6742/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= +github.com/modern-go/reflect2 v1.0.1 h1:9f412s+6RmYXLWZSEzVVgPGK7C2PphHj5RJrvfx9AWI= +github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= +github.com/modocache/gover v0.0.0-20171022184752-b58185e213c5/go.mod h1:caMODM3PzxT8aQXRPkAt8xlV/e7d7w8GM5g0fa5F0D8= +github.com/munnerz/goautoneg v0.0.0-20120707110453-a547fc61f48d/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= +github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw= +github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e h1:fD57ERR4JtEqsWbfPhv4DMiApHyliiK5xCTNVSPiaAs= +github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= +github.com/onsi/ginkgo v0.0.0-20170829012221-11459a886d9c/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= github.com/onsi/ginkgo v1.8.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= +github.com/onsi/ginkgo v1.11.0 h1:JAKSXpt1YjtLA7YpPiqO9ss6sNXEsPfSGdwN0UHqzrw= +github.com/onsi/ginkgo v1.11.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= +github.com/onsi/gomega v0.0.0-20170829124025-dcabb60a477c/go.mod h1:C1qb7wdrVGGVU+Z6iS04AVkA3Q65CEZX59MT0QO5uiA= github.com/onsi/gomega v1.5.0/go.mod h1:ex+gbHU/CVuBBDIJjb2X0qEXbFg53c61hWP/1CpauHY= +github.com/onsi/gomega v1.7.0 h1:XPnZz8VVBHjVsy1vzJmRwIcSwiUO+JFfrv/xGiigmME= +github.com/onsi/gomega v1.7.0/go.mod h1:ex+gbHU/CVuBBDIJjb2X0qEXbFg53c61hWP/1CpauHY= github.com/peterbourgon/diskv v2.0.1+incompatible h1:UBdAOUP5p4RWqPBg048CAvpKN+vxiaj6gdUUzhl4XmI= github.com/peterbourgon/diskv v2.0.1+incompatible/go.mod h1:uqqh8zWWbv1HBMNONnaR/tNboyR3/BZd58JJSHlUSCU= -github.com/philhofer/fwd v1.0.0 h1:UbZqGr5Y38ApvM/V/jEljVxwocdweyH+vmYvRPBnbqQ= -github.com/philhofer/fwd v1.0.0/go.mod h1:gk3iGcWd9+svBvR0sR+KPcfE+RNWozjowpeBVG3ZVNU= +github.com/philhofer/fwd v1.1.1 h1:GdGcTjf5RNAxwS4QLsiMzJYj5KEvPJD3Abr261yRQXQ= +github.com/philhofer/fwd v1.1.1/go.mod h1:gk3iGcWd9+svBvR0sR+KPcfE+RNWozjowpeBVG3ZVNU= +github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= +github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= +github.com/satori/go.uuid v1.2.0 h1:0uYX9dsZ2yD7q2RtLRtPSdGDWzjeM3TbMJP9utgA0ww= github.com/satori/go.uuid v1.2.0/go.mod h1:dA0hQrYB0VpLJoorglMZABFdXlWrHn1NEOzdhQKdks0= -github.com/satori/go.uuid v1.2.1-0.20181028125025-b2ce2384e17b h1:gQZ0qzfKHQIybLANtM3mBXNUtOfsCFXeTsnBqCsx1KM= -github.com/satori/go.uuid v1.2.1-0.20181028125025-b2ce2384e17b/go.mod h1:dA0hQrYB0VpLJoorglMZABFdXlWrHn1NEOzdhQKdks0= +github.com/spf13/afero v1.2.2/go.mod h1:9ZxEEn6pIJ8Rxe320qSDBk6AsU0r9pR7Q4OcevTdifk= +github.com/spf13/pflag 
v0.0.0-20170130214245-9ff6c6923cff/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnInEg4= +github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= +github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.2.0/go.mod h1:qt09Ya8vawLte6SNmTgCsAVtYtaKzEcn8ATUoHMkEqE= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= +github.com/stretchr/testify v1.6.1 h1:hDPOHmpOpP40lSULcqw7IrRb/u7w6RpDC9399XyoNd0= +github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/tedsuo/ifrit v0.0.0-20180802180643-bea94bb476cc h1:LUUe4cdABGrIJAhl1P1ZpWY76AwukVszFdwkVFVLwIk= github.com/tedsuo/ifrit v0.0.0-20180802180643-bea94bb476cc/go.mod h1:eyZnKCc955uh98WQvzOm0dgAeLnf2O0Rz0LPoC5ze+0= github.com/tinylib/msgp v1.1.2 h1:gWmO7n0Ys2RBEb7GPYB9Ujq8Mk5p2U08lRnmMcGy6BQ= github.com/tinylib/msgp v1.1.2/go.mod h1:+d+yLhGm8mzTaHzB+wgMYrodPfmZrzkirds8fDWklFE= github.com/ugorji/go v1.1.2-0.20180813092308-00b869d2f4a5 h1:JRe7Bc0YQq+x7Bm3p/LIBIb4aopsdr3H0KRKRI8g6oY= github.com/ugorji/go v1.1.2-0.20180813092308-00b869d2f4a5/go.mod h1:hnLbHMwcvSihnDhEfx2/BzKp2xb0Y+ErdfYcrs9tkJQ= -golang.org/x/crypto v0.0.0-20180222182404-49796115aa4b h1:/GxqO8gbyb+sNnviFY2IIMrtm8vGg6NEJDft68wJY/g= -golang.org/x/crypto v0.0.0-20180222182404-49796115aa4b/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= +github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU= +go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8= +go.opencensus.io v0.22.2/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= +go.opencensus.io v0.22.3/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2 h1:VklqNMn3ovrHsnt90PveolxSbWFaJdECFbxSq0Mqo2M= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20190611184440-5c40567a22f8/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20191206172530-e9b2fee46413 h1:ULYEB3JvPRE/IfO+9uO7vKV/xzVTO7XPAwm8xbf4w2g= golang.org/x/crypto v0.0.0-20191206172530-e9b2fee46413/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/crypto v0.0.0-20200220183623-bac4c82f6975 h1:/Tl7pH94bvbAAHBdZJT947M/+gp0+CqQXDtMRC0fseo= -golang.org/x/crypto v0.0.0-20200220183623-bac4c82f6975/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/crypto v0.0.0-20201216223049-8b5274cf687f h1:aZp0e2vLN4MToVqnjNEYEtrEA8RH8U8FN1CU7JgqsPU= -golang.org/x/crypto v0.0.0-20201216223049-8b5274cf687f/go.mod h1:jdWPYTVW3xRLrWPugEBEK3UY2ZEsg3UU495nc5E+M+I= -golang.org/x/net v0.0.0-20170809000501-1c05540f6879 h1:0rFa7EaCGdQPmZVbo9F7MNF65b8dyzS6EUnXjs9Cllk= -golang.org/x/net 
v0.0.0-20170809000501-1c05540f6879/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/crypto v0.0.0-20201002170205-7f63de1d35b0/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/crypto v0.0.0-20210220033148-5ea612d1eb83 h1:/ZScEX8SfEmUGRHs0gxpqteO5nfNW6axyZbBdw9A12g= +golang.org/x/crypto v0.0.0-20210220033148-5ea612d1eb83/go.mod h1:jdWPYTVW3xRLrWPugEBEK3UY2ZEsg3UU495nc5E+M+I= +golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8= +golang.org/x/exp v0.0.0-20190829153037-c13cbed26979/go.mod h1:86+5VVa7VpoJ4kLfm080zCjGlMRFzhUhsZKEZO7MGek= +golang.org/x/exp v0.0.0-20191030013958-a1ab85dbe136/go.mod h1:JXzH8nQsPlswgeRAPE3MuO9GYsAcnJvJ4vnMwN/5qkY= +golang.org/x/exp v0.0.0-20191129062945-2f5052295587/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= +golang.org/x/exp v0.0.0-20191227195350-da58074b4299/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= +golang.org/x/exp v0.0.0-20200119233911-0405dc783f0a/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= +golang.org/x/exp v0.0.0-20200207192155-f17229e696bd/go.mod h1:J/WKrq2StrnmMY6+EHIKF9dgMWnmCNThgcyBT1FY9mM= +golang.org/x/exp v0.0.0-20200224162631-6cc2880d07d6/go.mod h1:3jZMyOhIsHpP37uCMkUooju7aAi5cS1Q23tOzKc+0MU= +golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js= +golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= +golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= +golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= +golang.org/x/lint v0.0.0-20190301231843-5614ed5bae6f/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= +golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= +golang.org/x/lint v0.0.0-20190409202823-959b441ac422/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= +golang.org/x/lint v0.0.0-20190909230951-414d861bb4ac/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= +golang.org/x/lint v0.0.0-20190930215403-16217165b5de/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= +golang.org/x/lint v0.0.0-20191125180803-fdd1cda4f05f/go.mod h1:5qLYkcX4OjUUV8bRuDixDT3tpyyb+LUpUlRWLxfhWrs= +golang.org/x/lint v0.0.0-20200130185559-910be7a94367/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY= +golang.org/x/lint v0.0.0-20200302205851-738671d3881b/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY= +golang.org/x/mobile v0.0.0-20190312151609-d3739f865fa6/go.mod h1:z+o9i4GpDbdi3rU15maQ/Ox0txvL9dWGYEHz965HBQE= +golang.org/x/mobile v0.0.0-20190719004257-d2bd2a29d028/go.mod h1:E/iHnbuqvinMTCcRqshq8CkpyQDoeVncDDYHnLhea+o= +golang.org/x/mod v0.0.0-20190513183733-4bf6d317e70e/go.mod h1:mXi4GBBbnImb6dmsKGUJ2LatrhH/nqhxcFungHvyanc= +golang.org/x/mod v0.1.0/go.mod h1:0QHyrYULN0/3qlju5TqG8bIK38QM8yzMo5ekMj3DlcY= +golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= +golang.org/x/mod v0.1.1-0.20191107180719-034126e5016b/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= 
+golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190501004415-9ce7a6920f09/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190503192946-f4e77d36d62c/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= +golang.org/x/net v0.0.0-20190613194153-d28f0bde5980/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20190724013045-ca1201d0de80/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20190827160401-ba9fcec4b297/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20191209160850-c0dbc17a3553/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200114155413-6afb5195e5aa/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200222125558-5a598a2470a0/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20200421231249-e086a090c8fd h1:QPwSajcTUrFriMF1nJ3XzgoqakqQEsnZf9LdXdi2nkI= -golang.org/x/net v0.0.0-20200421231249-e086a090c8fd/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= +golang.org/x/net v0.0.0-20200301022130-244492dfa37a/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= +golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= +golang.org/x/net v0.0.0-20210224082022-3d97a244fca7 h1:OgUuv8lsRpBibGNbSizVwKWlysjaNzmC9gYMhPVfqFM= +golang.org/x/net v0.0.0-20210224082022-3d97a244fca7/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= +golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= +golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= +golang.org/x/oauth2 v0.0.0-20191202225959-858c2ad4c8b6/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= +golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d h1:TzXSXBo42m9gQenoE3b9BGiEpg5IG2JkU5FkPIawgtw= +golang.org/x/oauth2 
v0.0.0-20200107190931-bf48bf16ab8d/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sys v0.0.0-20171031081856-95c657629925 h1:nCH33NboKIsT4HoXBsXTWX8ul303HxWgkc5s2Ezwacg= -golang.org/x/sys v0.0.0-20171031081856-95c657629925/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190227155943-e225da77a7e6/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190502145724-3ef323f4f1fd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190507160741-ecd444e8653b/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190606165138-5da285871e9c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190616124812-15dcb6c0061f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190624142023-c5567b49c5d0/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190726091711-fc99dfbffb4e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191001151750-bb3f8db39f24/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191204072324-ce4227a45e2e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191228213918-04cbcbbfeed8/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200113162924-86b910548bc1/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200122134326-e047566fdf82/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200212091648-12a6c2dcc1e4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200223170610-d5e6a3e2c0ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200302150141-5c8b2ff67527/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd h1:xhmwyvizuTgC2qz7ZlMluP20uW+C3Rm0FD/WLDX8884= golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys 
v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210225134936-a50acf3fe073 h1:8qxJSnu+7dRq6upnbntrmriWByIakBuct5OM/MdQC1M= +golang.org/x/sys v0.0.0-20210225134936-a50acf3fe073/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/term v0.0.0-20201117132131-f5c789dd3221 h1:/ZHdbVpdR/jk3g30/d4yUL0JU9kksj8+F/bnQUVLGDM= golang.org/x/term v0.0.0-20201117132131-f5c789dd3221/go.mod h1:Nr5EML6q2oocZ2LXRh80K7BxOlk5/8JxuGnuhpl+muw= -golang.org/x/text v0.0.0-20170810154203-b19bf474d317 h1:WKW+OPdYPlvOTVGHuMfjnIC6yY2SI93yFB0pZ7giBmQ= -golang.org/x/text v0.0.0-20170810154203-b19bf474d317/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210220032956-6a3ed077a48d h1:SZxvLBoTP5yHO3Frd4z4vrF+DBX9vMVanchswa69toE= +golang.org/x/term v0.0.0-20210220032956-6a3ed077a48d/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.0 h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= -golang.org/x/time v0.0.0-20161028155119-f51c12702a4d h1:TnM+PKb3ylGmZvyPXmo9m/wktg7Jn/a/fNmr33HSj8g= -golang.org/x/time v0.0.0-20161028155119-f51c12702a4d/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.4 h1:0YWbFKbhXG/wIiuHDSKpS0Iy7FSA+u45VtBMfQcFTTc= +golang.org/x/text v0.3.4/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/time v0.0.0-20210220033141-f8bda1e9f3ba h1:O8mE0/t419eoIwhTFpKVkHiTs/Igowgfkj25AcZrtiE= +golang.org/x/time v0.0.0-20210220033141-f8bda1e9f3ba/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= +golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= +golang.org/x/tools v0.0.0-20190312151545-0bb0c0a6e846/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= +golang.org/x/tools v0.0.0-20190312170243-e65039ee4138/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= +golang.org/x/tools v0.0.0-20190425150028-36563e24a262/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= +golang.org/x/tools v0.0.0-20190506145303-2d16b83fe98c/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= +golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod 
h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= +golang.org/x/tools v0.0.0-20190606124116-d0a3d012864b/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= +golang.org/x/tools v0.0.0-20190614205625-5aca471b1d59/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= +golang.org/x/tools v0.0.0-20190621195816-6e04913cbbac/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= +golang.org/x/tools v0.0.0-20190628153133-6cdbf07be9d0/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= +golang.org/x/tools v0.0.0-20190816200558-6889da9d5479/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20190911174233-4f2ddba30aff/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191012152004-8de300cfc20a/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191113191852-77e3bb0ad9e7/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191115202509-3a792d9c32b2/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191125144606-a911d9008d1f/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191130070609-6e064ea0cf2d/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191216173652-a0e659d51361/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20191227053925-7b8e75db28f4/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200117161641-43d50277825c/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200122220014-bf1340f18c4a/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200130002326-2f3ba24bd6e7/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200204074204-1cc6d1ef6c74/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200207183749-b753a1ba74fa/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200212150539-ea181f53ac56/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200224181240-023911ca70b2/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200304193943-95d2e580d8eb/go.mod h1:o4KQGtdN14AW+yjsvvwRTJJuXz8XRtIHtEnmAXLyFUw= +golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +google.golang.org/api v0.4.0/go.mod h1:8k5glujaEP+g9n7WNsDg8QP6cUVNI86fCNMcbazEtwE= +google.golang.org/api v0.7.0/go.mod h1:WtwebWUNSVBH/HAw79HIFXZNqEvBhG+Ra+ax0hx3E3M= +google.golang.org/api v0.8.0/go.mod h1:o4eAsZoiT+ibD93RtjEohWalFOjRDx6CVaqeizhEnKg= +google.golang.org/api v0.9.0/go.mod 
h1:o4eAsZoiT+ibD93RtjEohWalFOjRDx6CVaqeizhEnKg= +google.golang.org/api v0.13.0/go.mod h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsbkAI= +google.golang.org/api v0.14.0/go.mod h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsbkAI= +google.golang.org/api v0.15.0/go.mod h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsbkAI= +google.golang.org/api v0.17.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= +google.golang.org/api v0.18.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= +google.golang.org/api v0.20.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= +google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= +google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= +google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= +google.golang.org/appengine v1.6.1/go.mod h1:i06prIuMbXzDqacNJfV5OdTW448YApPu5ww/cMBSeb0= +google.golang.org/appengine v1.6.5 h1:tycE03LOZYQNhDpS27tcQdAzLCVMaj7QT2SXxebnpCM= +google.golang.org/appengine v1.6.5/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= +google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= +google.golang.org/genproto v0.0.0-20190307195333-5fe7a883aa19/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= +google.golang.org/genproto v0.0.0-20190418145605-e7d98fc518a7/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= +google.golang.org/genproto v0.0.0-20190425155659-357c62f0e4bb/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= +google.golang.org/genproto v0.0.0-20190502173448-54afdca5d873/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= +google.golang.org/genproto v0.0.0-20190801165951-fa694d86fc64/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= +google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= +google.golang.org/genproto v0.0.0-20190911173649-1774047e7e51/go.mod h1:IbNlFCBrqXvoKpeg0TB2l7cyZUmoaFKYIwrEpbDKLA8= +google.golang.org/genproto v0.0.0-20191108220845-16a3f7862a1a/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= +google.golang.org/genproto v0.0.0-20191115194625-c23dd37a84c9/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= +google.golang.org/genproto v0.0.0-20191216164720-4f79533eabd1/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= +google.golang.org/genproto v0.0.0-20191230161307-f3c370f40bfb/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= +google.golang.org/genproto v0.0.0-20200115191322-ca5a22157cba/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= +google.golang.org/genproto v0.0.0-20200122232147-0452cf42e150/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= +google.golang.org/genproto v0.0.0-20200204135345-fa8e72b47b90/go.mod h1:GmwEX6Z4W5gMy59cAlVYjN9JhxgbQH6Gn+gFDQe2lzA= +google.golang.org/genproto v0.0.0-20200212174721-66ed5ce911ce/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= +google.golang.org/genproto v0.0.0-20200224152610-e50cd9704f63/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= +google.golang.org/genproto v0.0.0-20200305110556-506484158171/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= +google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo= +google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= +google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= +google.golang.org/grpc 
v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= +google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg= +google.golang.org/grpc v1.26.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= +google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= +google.golang.org/grpc v1.27.1/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= +google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= +google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= +google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM= +google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE= +google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo= +google.golang.org/protobuf v1.22.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= +google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= +google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= +google.golang.org/protobuf v1.25.0 h1:Ejskq+SyPohKW+1uil0JJMtmHCgJPJ/qWTxr8qp+R4c= +google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f h1:BLraFXnmrev5lT+xlilqcH8XK9/i0At2xKjWk4p6zsU= +gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI= +gopkg.in/fsnotify.v1 v1.4.7 h1:xOHLXZwVvI9hhs+cLKq5+I5onOuwQLhQwiu63xxlHs4= gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys= -gopkg.in/inf.v0 v0.9.0 h1:3zYtXIO92bvsdS3ggAdA8Gb4Azj0YU+TVY1uGYNFA8o= -gopkg.in/inf.v0 v0.9.0/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= +gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= +gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= gopkg.in/natefinch/lumberjack.v2 v2.0.0-20170531160350-a96e63847dc3 h1:AFxeG48hTWHhDTQDk/m2gorfVHUEa9vo3tp3D7TzwjI= gopkg.in/natefinch/lumberjack.v2 v2.0.0-20170531160350-a96e63847dc3/go.mod h1:l0ndWWf7gzL7RNwBG7wST/UCcT4T24xpD6X8LsfU/+k= +gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ= gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw= -gopkg.in/yaml.v2 v2.0.0-20170721113624-670d4cfef054 h1:ROF+R/wHHruzF40n5DfPv2jwm7rCJwvs8fz+RTZWjLE= -gopkg.in/yaml.v2 v2.0.0-20170721113624-670d4cfef054/go.mod h1:JAlM8MvJe8wmxCU4Bli9HhUf9+ttbYbLASfIpnQbh74= gopkg.in/yaml.v2 v2.2.1 h1:mUhvW9EsL+naU5Q3cakzfE91YhliOondGd6ZrsDBHQE= gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= -k8s.io/api v0.0.0-20180628040859-072894a440bd h1:HzgYeLDS1jLxw8DGr68KJh9cdQ5iZJizG0HZWstIhfQ= 
-k8s.io/api v0.0.0-20180628040859-072894a440bd/go.mod h1:iuAfoD4hCxJ8Onx9kaTIt30j7jUFS00AXQi6QMi99vA= -k8s.io/apimachinery v0.0.0-20180621070125-103fd098999d h1:MZjlsu9igBoVPZkXpIGoxI6EonqNsXXZU7hhvfQLkd4= -k8s.io/apimachinery v0.0.0-20180621070125-103fd098999d/go.mod h1:ccL7Eh7zubPUSh9A3USN90/OzHNSVN6zxzde07TDCL0= -k8s.io/client-go v8.0.0+incompatible h1:tTI4hRmb1DRMl4fG6Vclfdi6nTM82oIrTT7HfitmxC4= -k8s.io/client-go v8.0.0+incompatible/go.mod h1:7vJpHMYJwNQCWgzmNV+VYUl1zCObLyodBc8nIyt8L5s= +gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= +gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= +honnef.co/go/tools v0.0.0-20190106161140-3f1c8253044a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= +honnef.co/go/tools v0.0.0-20190418001031-e561f6794a2a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= +honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= +honnef.co/go/tools v0.0.1-2019.2.3/go.mod h1:a3bituU0lyd329TUQxRnasdCoJDkEUEAqEt0JzvZhAg= +honnef.co/go/tools v0.0.1-2020.1.3/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k= +k8s.io/api v0.21.0 h1:gu5iGF4V6tfVCQ/R+8Hc0h7H1JuEhzyEi9S4R5LM8+Y= +k8s.io/api v0.21.0/go.mod h1:+YbrhBBGgsxbF6o6Kj4KJPJnBmAKuXDeS3E18bgHNVU= +k8s.io/apimachinery v0.21.0 h1:3Fx+41if+IRavNcKOz09FwEXDBG6ORh6iMsTSelhkMA= +k8s.io/apimachinery v0.21.0/go.mod h1:jbreFvJo3ov9rj7eWT7+sYiRx+qZuCYXwWT1bcDswPY= +k8s.io/client-go v0.21.0 h1:n0zzzJsAQmJngpC0IhgFcApZyoGXPrDIAD601HD09ag= +k8s.io/client-go v0.21.0/go.mod h1:nNBytTF9qPFDEhoqgEPaarobC8QPae13bElIVHzIglA= +k8s.io/gengo v0.0.0-20200413195148-3a45101e95ac/go.mod h1:ezvh/TsK7cY6rbqRK0oQQ8IAqLxYwwyPxAX1Pzy0ii0= +k8s.io/klog/v2 v2.0.0/go.mod h1:PBfzABfn139FHAV07az/IF9Wp1bkk3vpT2XSJ76fSDE= +k8s.io/klog/v2 v2.8.0 h1:Q3gmuM9hKEjefWFFYF0Mat+YyFJvsUyYuwyNNJ5C9Ts= +k8s.io/klog/v2 v2.8.0/go.mod h1:hy9LJ/NvuK+iVyP4Ehqva4HxZG/oXyIS3n3Jmire4Ec= +k8s.io/kube-openapi v0.0.0-20210305001622-591a79e4bda7/go.mod h1:wXW5VT87nVfh/iLV8FpR2uDvrFyomxbtb1KivDbvPTE= +k8s.io/utils v0.0.0-20201110183641-67b214c5f920 h1:CbnUZsM497iRC5QMVkHwyl8s2tB3g7yaSHkYPkpgelw= +k8s.io/utils v0.0.0-20201110183641-67b214c5f920/go.mod h1:jPW/WVKK9YHAvNhRxK0md/EJ228hCsBRufyofKtW8HA= +rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8= +rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0= +rsc.io/sampler v1.3.0/go.mod h1:T1hPZKmBbMNahiBKFy5HrXp6adAjACjK9JXDnKaTXpA= +sigs.k8s.io/structured-merge-diff/v4 v4.0.2/go.mod h1:bJZC9H9iH24zzfZ/41RGcq60oK1F7G282QMXDPYydCw= +sigs.k8s.io/structured-merge-diff/v4 v4.1.0 h1:C4r9BgJ98vrKnnVCjwCSXcWjWe0NKcUQkmzDXZXGwH8= +sigs.k8s.io/structured-merge-diff/v4 v4.1.0/go.mod h1:bJZC9H9iH24zzfZ/41RGcq60oK1F7G282QMXDPYydCw= +sigs.k8s.io/yaml v1.2.0 h1:kr/MCeFWJWTwyaHoR9c8EjH9OumOmoF9YGiZd7lFm/Q= +sigs.k8s.io/yaml v1.2.0/go.mod h1:yfXDCHCao9+ENCvLSE62v9VSji2MKu5jeNfTrofGhJc= diff --git a/source/plugins/go/src/oms.go b/source/plugins/go/src/oms.go index 026d36d6c..a2937073b 100644 --- 
a/source/plugins/go/src/oms.go
+++ b/source/plugins/go/src/oms.go
@@ -417,7 +417,9 @@ func updateContainerImageNameMaps() {
 
 	listOptions := metav1.ListOptions{}
 	listOptions.FieldSelector = fmt.Sprintf("spec.nodeName=%s", Computer)
-	pods, err := ClientSet.CoreV1().Pods("").List(listOptions)
+
+	// Context was added as a parameter, but we want the same behavior as before: see https://pkg.go.dev/context#TODO
+	pods, err := ClientSet.CoreV1().Pods("").List(context.TODO(), listOptions)
 
 	if err != nil {
 		message := fmt.Sprintf("Error getting pods %s\nIt is ok to log here and continue, because the logs will be missing image and Name, but the logs will still have the containerID", err.Error())
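
The hunk above tracks the move to client-go v0.21.0, where List and the other request verbs take a context.Context as their first argument. A minimal sketch of the new call shape (illustrative only; the package, function, and variable names here are not from the plugin):

    // Sketch of the client-go v0.21-style Pods().List call; context.TODO()
    // keeps the old no-deadline behavior until a real context is threaded through.
    package podlist

    import (
        "context"
        "fmt"

        metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
        "k8s.io/client-go/kubernetes"
    )

    // listPodsOnNode returns the number of pods scheduled on the given node.
    func listPodsOnNode(clientset *kubernetes.Clientset, nodeName string) (int, error) {
        listOptions := metav1.ListOptions{
            FieldSelector: fmt.Sprintf("spec.nodeName=%s", nodeName),
        }
        pods, err := clientset.CoreV1().Pods("").List(context.TODO(), listOptions)
        if err != nil {
            return 0, err
        }
        return len(pods.Items), nil
    }
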

` image for 2. Make PR to ci_dev branch and once the PR is approved, merge the changes to ci_dev 3. Latest bits of ci_dev automatically deployed to CIDEV cluster in build subscription so just validate E2E to make sure everything works 4. If everything validated in DEV, make merge PR from ci_dev and ci_prod and merge once this is reviewed by dev team -6. Update following pipeline variables under ReleaseCandiate with version of chart and image tag - - CIHELMCHARTVERSION # For example, 2.7.4 - - CIImageTagSuffix # ciprod08072020 or ciprod08072020-1 etc. -7. Merge ci_dev and ci_prod branch which will trigger automatic deployment of latest bits to CIPROD cluster with CIPROD`
` image to test and scale cluters, AKS, AKS-Engine - > Note: production image automatically pushed to CIPROD Public cloud ACR which will inturn replicated to Public cloud MCR. +5. Once the PR to ci_prod is approved, go ahead and merge, then wait for the ci_prod build to complete successfully +6. Once the merged PR build has completed successfully, update the value of the AGENT_IMAGE_TAG_SUFFIX pipeline variable by editing the Release [ci-prod-release](https://github-private.visualstudio.com/microsoft/_release?_a=releases&view=mine&definitionId=38) + > Note - the value of the AGENT_IMAGE_TAG_SUFFIX pipeline variable should be in `
` for our releases +7. Create a release by selecting the targeted build version of the _docker-provider_Official-ci_prod release 8. Validate all the scenarios against clusters in build subscription and scale clusters - # 2. Perf and scale testing Deploy the latest omsagent yaml with the release candidate agent image into supported k8s versions and validate all the critical scenarios. In particular, thoroughly validate the updates going into this release and make sure there are no regressions. If this passes, deploy onto the scale cluster and validate perf and scale aspects. The scale cluster is in the AME cloud; coordinate with the agent team, who has access to this cluster, to deploy the release candidate onto it. From 10b2ea63d84acf3415fae8d290bd43b1f002f764 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Fri, 13 Aug 2021 14:05:08 -0700 Subject: [PATCH 137/194] cherry pick changes from ci_prod (#622) --- ReleaseNotes.md | 13 +++ kubernetes/linux/Dockerfile | 2 +- kubernetes/linux/setup.sh | 11 ++- kubernetes/omsagent.yaml | 20 ++--- source/plugins/go/src/oms.go | 168 +++++++++++++++++------------------ 5 files changed, 115 insertions(+), 99 deletions(-) diff --git a/ReleaseNotes.md b/ReleaseNotes.md index 0c51b737c..dc42e7d51 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -11,6 +11,19 @@ additional questions or comments. Note : The agent version(s) below has dates (ciprod), which indicate the agent build dates (not release dates) +### 08/05/2021 - +##### Version microsoft/oms:ciprod08052021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod08052021 (linux) +##### Code change log +- Linux Agent + - Fix for CPU spike which occurs at around 6:30am UTC every day because of unattended package upgrades + - Update MDSD build which has fixes for the following issues + - Nondeterministic core dump issue caused by non-200 status codes and runtime exception stack unwindings + - Reduce the verbosity of the error logs for OMS & ODS code paths. 
+ - Increase Timeout for OMS Homing service API calls from 30s to 60s + - Fix for https://github.com/Azure/AKS/issues/2457 + - In replicaset, tailing of the mdsd.err log file to agent telemetry + + ### 07/13/2021 - ##### Version microsoft/oms:win-ciprod06112021-2 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod06112021-2 (windows) ##### Code change log diff --git a/kubernetes/linux/Dockerfile b/kubernetes/linux/Dockerfile index b47841757..07af7f4a7 100644 --- a/kubernetes/linux/Dockerfile +++ b/kubernetes/linux/Dockerfile @@ -2,7 +2,7 @@ FROM ubuntu:18.04 MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="Azure Monitor for containers" -ARG IMAGE_TAG=ciprod06112021 +ARG IMAGE_TAG=ciprod08052021 ENV AGENT_VERSION ${IMAGE_TAG} ENV tmpdir /opt ENV APPLICATIONINSIGHTS_AUTH NzAwZGM5OGYtYTdhZC00NThkLWI5NWMtMjA3ZjM3NmM3YmRi diff --git a/kubernetes/linux/setup.sh b/kubernetes/linux/setup.sh index b7cddffbc..df32afc7e 100644 --- a/kubernetes/linux/setup.sh +++ b/kubernetes/linux/setup.sh @@ -9,8 +9,8 @@ sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen && \ dpkg-reconfigure --frontend=noninteractive locales && \ update-locale LANG=en_US.UTF-8 -#install oneagent - Official bits (06/24/2021) -wget https://github.com/microsoft/Docker-Provider/releases/download/06242021-oneagent/azure-mdsd_1.10.3-build.master.241_x86_64.deb +#install oneagent - Official bits (08/04/2021) +wget https://github.com/microsoft/Docker-Provider/releases/download/08042021-oneagent/azure-mdsd_1.10.1-build.master.251_x86_64.deb /usr/bin/dpkg -i $TMPDIR/azure-mdsd*.deb cp -f $TMPDIR/mdsd.xml /etc/mdsd.d @@ -47,8 +47,8 @@ sudo apt-get update sudo apt-get install td-agent-bit=1.6.8 -y # install ruby2.6 -sudo apt-get install software-properties-common -y -sudo apt-add-repository ppa:brightbox/ruby-ng -y +sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys F5DA5F09C3173AA6 +sudo echo "deb http://ppa.launchpad.net/brightbox/ruby-ng/ubuntu bionic main" >> /etc/apt/sources.list sudo apt-get update sudo apt-get install ruby2.6 ruby2.6-dev gcc make -y # fluentd v1 gem @@ -62,6 +62,9 @@ rm -f $TMPDIR/azure-mdsd*.deb rm -f $TMPDIR/mdsd.xml rm -f $TMPDIR/envmdsd +# remove build dependencies +sudo apt-get remove ruby2.6-dev gcc make -y + # Remove settings for cron.daily that conflict with the node's cron.daily. Since both are trying to rotate the same files # in /var/log at the same time, the rotation doesn't happen correctly and then the *.1 file is forever logged to. rm /etc/logrotate.d/alternatives /etc/logrotate.d/apt /etc/logrotate.d/azure-mdsd /etc/logrotate.d/rsyslog diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 855f3a8e1..49d4586c1 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -362,13 +362,13 @@ spec: schema-versions: "v1" spec: serviceAccountName: omsagent - dnsConfig: + dnsConfig: options: - name: ndots - value: "3" + value: "3" containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod06112021" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod08052021" imagePullPolicy: IfNotPresent resources: limits: @@ -384,7 +384,7 @@ spec: - name: AKS_REGION value: "VALUE_AKS_RESOURCE_REGION_VALUE" # this used for e2e test and setting this just emits some additional log statements which used for the e2e tests - - name: ISTEST + - name: ISTEST value: "true" #Uncomment below two lines for ACS clusters and set the cluster names manually. 
Also comment out the above two lines for ACS clusters #- name: ACS_RESOURCE_NAME @@ -446,7 +446,7 @@ spec: timeoutSeconds: 15 #Only in sidecar scraping mode - name: omsagent-prometheus - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod06112021" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod08052021" imagePullPolicy: IfNotPresent resources: limits: @@ -589,7 +589,7 @@ spec: serviceAccountName: omsagent containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod06112021" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod08052021" imagePullPolicy: IfNotPresent resources: limits: @@ -604,8 +604,8 @@ spec: - name: AKS_REGION value: "VALUE_AKS_RESOURCE_REGION_VALUE" # this used for e2e test and setting this just emits some additional log statements which used for the e2e tests - - name: ISTEST - value: "true" + - name: ISTEST + value: "true" # Uncomment below two lines for ACS clusters and set the cluster names manually. Also comment out the above two lines for ACS clusters #- name: ACS_RESOURCE_NAME # value: "my_acs_cluster_name" @@ -754,10 +754,10 @@ spec: schema-versions: "v1" spec: serviceAccountName: omsagent - dnsConfig: + dnsConfig: options: - name: ndots - value: "3" + value: "3" containers: - name: omsagent-win image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod06112021-2" diff --git a/source/plugins/go/src/oms.go b/source/plugins/go/src/oms.go index a2937073b..91a5b4b40 100644 --- a/source/plugins/go/src/oms.go +++ b/source/plugins/go/src/oms.go @@ -165,17 +165,17 @@ var ( // ADX tenantID AdxTenantID string //ADX client secret - AdxClientSecret string + AdxClientSecret string // container log or container log v2 tag name for oneagent route - MdsdContainerLogTagName string + MdsdContainerLogTagName string // kubemonagent events tag name for oneagent route MdsdKubeMonAgentEventsTagName string // InsightsMetrics tag name for oneagent route - MdsdInsightsMetricsTagName string + MdsdInsightsMetricsTagName string // flag to check if its Windows OS IsWindows bool - // container type - ContainerType string + // container type + ContainerType string // flag to check whether LA AAD MSI Auth Enabled or not IsAADMSIAuthMode bool ) @@ -206,7 +206,7 @@ var ( // IngestionAuthTokenUpdateMutex read and write mutex access for ODSIngestionAuthToken IngestionAuthTokenUpdateMutex = &sync.Mutex{} // ODSIngestionAuthToken for windows agent AAD MSI Auth - ODSIngestionAuthToken string + ODSIngestionAuthToken string ) var ( @@ -355,12 +355,12 @@ const ( ) // DataType to be used as enum per data type socket client creation -type DataType int +type DataType int const ( // DataType to be used as enum per data type socket client creation ContainerLogV2 DataType = iota - KubeMonAgentEvents - InsightsMetrics + KubeMonAgentEvents + InsightsMetrics ) func createLogger() *log.Logger { @@ -610,7 +610,7 @@ func flushKubeMonAgentEventRecords() { Message: k, Tags: fmt.Sprintf("%s", tagJson), } - laKubeMonAgentEventsRecords = append(laKubeMonAgentEventsRecords, laKubeMonAgentEventsRecord) + laKubeMonAgentEventsRecords = append(laKubeMonAgentEventsRecords, laKubeMonAgentEventsRecord) var stringMap map[string]string jsonBytes, err := json.Marshal(&laKubeMonAgentEventsRecord) if err != nil { @@ -623,10 +623,10 @@ func flushKubeMonAgentEventRecords() { Log(message) SendException(message) } else { - msgPackEntry := MsgPackEntry{ + msgPackEntry := MsgPackEntry{ Record: stringMap, } - msgPackEntries 
= append(msgPackEntries, msgPackEntry) + msgPackEntries = append(msgPackEntries, msgPackEntry) } } } @@ -649,23 +649,23 @@ func flushKubeMonAgentEventRecords() { Message: k, Tags: fmt.Sprintf("%s", tagJson), } - laKubeMonAgentEventsRecords = append(laKubeMonAgentEventsRecords, laKubeMonAgentEventsRecord) + laKubeMonAgentEventsRecords = append(laKubeMonAgentEventsRecords, laKubeMonAgentEventsRecord) var stringMap map[string]string jsonBytes, err := json.Marshal(&laKubeMonAgentEventsRecord) if err != nil { message := fmt.Sprintf("Error while Marshalling laKubeMonAgentEventsRecord to json bytes: %s", err.Error()) Log(message) SendException(message) - } else { - if err := json.Unmarshal(jsonBytes, &stringMap); err != nil { + } else { + if err := json.Unmarshal(jsonBytes, &stringMap); err != nil { message := fmt.Sprintf("Error while UnMarhalling json bytes to stringmap: %s", err.Error()) Log(message) SendException(message) } else { - msgPackEntry := MsgPackEntry{ + msgPackEntry := MsgPackEntry{ Record: stringMap, - } - msgPackEntries = append(msgPackEntries, msgPackEntry) + } + msgPackEntries = append(msgPackEntries, msgPackEntry) } } } @@ -698,66 +698,66 @@ func flushKubeMonAgentEventRecords() { Message: "No errors", Tags: fmt.Sprintf("%s", tagJson), } - laKubeMonAgentEventsRecords = append(laKubeMonAgentEventsRecords, laKubeMonAgentEventsRecord) + laKubeMonAgentEventsRecords = append(laKubeMonAgentEventsRecords, laKubeMonAgentEventsRecord) var stringMap map[string]string jsonBytes, err := json.Marshal(&laKubeMonAgentEventsRecord) - if err != nil { + if err != nil { message := fmt.Sprintf("Error while Marshalling laKubeMonAgentEventsRecord to json bytes: %s", err.Error()) Log(message) SendException(message) } else { - if err := json.Unmarshal(jsonBytes, &stringMap); err != nil { + if err := json.Unmarshal(jsonBytes, &stringMap); err != nil { message := fmt.Sprintf("Error while UnMarshalling json bytes to stringmap: %s", err.Error()) Log(message) SendException(message) - } else { - msgPackEntry := MsgPackEntry{ + } else { + msgPackEntry := MsgPackEntry{ Record: stringMap, } - msgPackEntries = append(msgPackEntries, msgPackEntry) + msgPackEntries = append(msgPackEntries, msgPackEntry) } } } } - if (IsWindows == false && len(msgPackEntries) > 0) { //for linux, mdsd route + if (IsWindows == false && len(msgPackEntries) > 0) { //for linux, mdsd route if IsAADMSIAuthMode == true && strings.HasPrefix(MdsdKubeMonAgentEventsTagName, MdsdOutputStreamIdTagPrefix) == false { Log("Info::mdsd::obtaining output stream id for data type: %s", KubeMonAgentEventDataType) MdsdKubeMonAgentEventsTagName = extension.GetInstance(FLBLogger, ContainerType).GetOutputStreamId(KubeMonAgentEventDataType) - } + } Log("Info::mdsd:: using mdsdsource name for KubeMonAgentEvents: %s", MdsdKubeMonAgentEventsTagName) - msgpBytes := convertMsgPackEntriesToMsgpBytes(MdsdKubeMonAgentEventsTagName, msgPackEntries) + msgpBytes := convertMsgPackEntriesToMsgpBytes(MdsdKubeMonAgentEventsTagName, msgPackEntries) if MdsdKubeMonMsgpUnixSocketClient == nil { Log("Error::mdsd::mdsd connection for KubeMonAgentEvents does not exist. re-connecting ...") CreateMDSDClient(KubeMonAgentEvents, ContainerType) if MdsdKubeMonMsgpUnixSocketClient == nil { - Log("Error::mdsd::Unable to create mdsd client for KubeMonAgentEvents. Please check error log.") + Log("Error::mdsd::Unable to create mdsd client for KubeMonAgentEvents. 
Please check error log.") ContainerLogTelemetryMutex.Lock() defer ContainerLogTelemetryMutex.Unlock() - KubeMonEventsMDSDClientCreateErrors += 1 - } + KubeMonEventsMDSDClientCreateErrors += 1 + } } - if MdsdKubeMonMsgpUnixSocketClient != nil { + if MdsdKubeMonMsgpUnixSocketClient != nil { deadline := 10 * time.Second - MdsdKubeMonMsgpUnixSocketClient.SetWriteDeadline(time.Now().Add(deadline)) //this is based of clock time, so cannot reuse + MdsdKubeMonMsgpUnixSocketClient.SetWriteDeadline(time.Now().Add(deadline)) //this is based of clock time, so cannot reuse bts, er := MdsdKubeMonMsgpUnixSocketClient.Write(msgpBytes) - elapsed = time.Since(start) + elapsed = time.Since(start) if er != nil { message := fmt.Sprintf("Error::mdsd::Failed to write to kubemonagent mdsd %d records after %s. Will retry ... error : %s", len(msgPackEntries), elapsed, er.Error()) Log(message) if MdsdKubeMonMsgpUnixSocketClient != nil { MdsdKubeMonMsgpUnixSocketClient.Close() MdsdKubeMonMsgpUnixSocketClient = nil - } + } SendException(message) } else { numRecords := len(msgPackEntries) Log("FlushKubeMonAgentEventRecords::Info::Successfully flushed %d records that was %d bytes in %s", numRecords, bts, elapsed) // Send telemetry to AppInsights resource SendEvent(KubeMonAgentEventsFlushedEvent, telemetryDimensions) - } + } } else { - Log("Error::mdsd::Unable to create mdsd client for KubeMonAgentEvents. Please check error log.") - } + Log("Error::mdsd::Unable to create mdsd client for KubeMonAgentEvents. Please check error log.") + } } else if len(laKubeMonAgentEventsRecords) > 0 { //for windows, ODS direct kubeMonAgentEventEntry := KubeMonAgentEventBlob{ DataType: KubeMonAgentEventDataType, @@ -784,10 +784,10 @@ func flushKubeMonAgentEventRecords() { if IsAADMSIAuthMode == true { IngestionAuthTokenUpdateMutex.Lock() ingestionAuthToken := ODSIngestionAuthToken - IngestionAuthTokenUpdateMutex.Unlock() - if ingestionAuthToken == "" { - Log("Error::ODS Ingestion Auth Token is empty. Please check error log.") - } + IngestionAuthTokenUpdateMutex.Unlock() + if ingestionAuthToken == "" { + Log("Error::ODS Ingestion Auth Token is empty. 
Please check error log.") + } req.Header.Set("Authorization", "Bearer "+ingestionAuthToken) } @@ -900,15 +900,15 @@ func PostTelegrafMetricsToLA(telegrafRecords []map[interface{}]interface{}) int message := fmt.Sprintf("PostTelegrafMetricsToLA::Info:derived %v metrics from %v timeseries", len(laMetrics), len(telegrafRecords)) Log(message) } - + if IsWindows == false { //for linux, mdsd route - var msgPackEntries []MsgPackEntry + var msgPackEntries []MsgPackEntry var i int start := time.Now() var elapsed time.Duration - for i = 0; i < len(laMetrics); i++ { - var interfaceMap map[string]interface{} + for i = 0; i < len(laMetrics); i++ { + var interfaceMap map[string]interface{} stringMap := make(map[string]string) jsonBytes, err := json.Marshal(*laMetrics[i]) if err != nil { @@ -917,35 +917,35 @@ func PostTelegrafMetricsToLA(telegrafRecords []map[interface{}]interface{}) int SendException(message) return output.FLB_OK } else { - if err := json.Unmarshal(jsonBytes, &interfaceMap); err != nil { + if err := json.Unmarshal(jsonBytes, &interfaceMap); err != nil { message := fmt.Sprintf("Error while UnMarshalling json bytes to interfaceMap: %s", err.Error()) Log(message) SendException(message) return output.FLB_OK - } else { + } else { for key, value := range interfaceMap { strKey := fmt.Sprintf("%v", key) strValue := fmt.Sprintf("%v", value) stringMap[strKey] = strValue - } - msgPackEntry := MsgPackEntry{ + } + msgPackEntry := MsgPackEntry{ Record: stringMap, } - msgPackEntries = append(msgPackEntries, msgPackEntry) - } + msgPackEntries = append(msgPackEntries, msgPackEntry) + } } } - if (len(msgPackEntries) > 0) { + if (len(msgPackEntries) > 0) { if IsAADMSIAuthMode == true && (strings.HasPrefix(MdsdInsightsMetricsTagName, MdsdOutputStreamIdTagPrefix) == false) { Log("Info::mdsd::obtaining output stream id for InsightsMetricsDataType since Log Analytics AAD MSI Auth Enabled") MdsdInsightsMetricsTagName = extension.GetInstance(FLBLogger, ContainerType).GetOutputStreamId(InsightsMetricsDataType) - } - msgpBytes := convertMsgPackEntriesToMsgpBytes(MdsdInsightsMetricsTagName, msgPackEntries) + } + msgpBytes := convertMsgPackEntriesToMsgpBytes(MdsdInsightsMetricsTagName, msgPackEntries) if MdsdInsightsMetricsMsgpUnixSocketClient == nil { Log("Error::mdsd::mdsd connection does not exist. re-connecting ...") CreateMDSDClient(InsightsMetrics, ContainerType) if MdsdInsightsMetricsMsgpUnixSocketClient == nil { - Log("Error::mdsd::Unable to create mdsd client for insights metrics. Please check error log.") + Log("Error::mdsd::Unable to create mdsd client for insights metrics. 
Please check error log.") ContainerLogTelemetryMutex.Lock() defer ContainerLogTelemetryMutex.Unlock() InsightsMetricsMDSDClientCreateErrors += 1 @@ -954,7 +954,7 @@ func PostTelegrafMetricsToLA(telegrafRecords []map[interface{}]interface{}) int } deadline := 10 * time.Second - MdsdInsightsMetricsMsgpUnixSocketClient.SetWriteDeadline(time.Now().Add(deadline)) //this is based of clock time, so cannot reuse + MdsdInsightsMetricsMsgpUnixSocketClient.SetWriteDeadline(time.Now().Add(deadline)) //this is based of clock time, so cannot reuse bts, er := MdsdInsightsMetricsMsgpUnixSocketClient.Write(msgpBytes) elapsed = time.Since(start) @@ -969,7 +969,7 @@ func PostTelegrafMetricsToLA(telegrafRecords []map[interface{}]interface{}) int ContainerLogTelemetryMutex.Lock() defer ContainerLogTelemetryMutex.Unlock() - InsightsMetricsMDSDClientCreateErrors += 1 + InsightsMetricsMDSDClientCreateErrors += 1 return output.FLB_RETRY } else { numTelegrafMetricsRecords := len(msgPackEntries) @@ -977,7 +977,7 @@ func PostTelegrafMetricsToLA(telegrafRecords []map[interface{}]interface{}) int Log("Success::mdsd::Successfully flushed %d telegraf metrics records that was %d bytes to mdsd in %s ", numTelegrafMetricsRecords, bts, elapsed) } } - + } else { // for windows, ODS direct var metrics []laTelegrafMetric @@ -1019,9 +1019,9 @@ func PostTelegrafMetricsToLA(telegrafRecords []map[interface{}]interface{}) int if IsAADMSIAuthMode == true { IngestionAuthTokenUpdateMutex.Lock() ingestionAuthToken := ODSIngestionAuthToken - IngestionAuthTokenUpdateMutex.Unlock() - if ingestionAuthToken == "" { - message := "Error::ODS Ingestion Auth Token is empty. Please check error log." + IngestionAuthTokenUpdateMutex.Unlock() + if ingestionAuthToken == "" { + message := "Error::ODS Ingestion Auth Token is empty. Please check error log." Log(message) return output.FLB_RETRY } @@ -1232,7 +1232,7 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { numContainerLogRecords := 0 if len(msgPackEntries) > 0 && ContainerLogsRouteV2 == true { - //flush to mdsd + //flush to mdsd if IsAADMSIAuthMode == true && strings.HasPrefix(MdsdContainerLogTagName, MdsdOutputStreamIdTagPrefix) == false { Log("Info::mdsd::obtaining output stream id") if ContainerLogSchemaV2 == true { @@ -1242,7 +1242,7 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { } Log("Info::mdsd:: using mdsdsource name: %s", MdsdContainerLogTagName) } - + fluentForward := MsgPackForward{ Tag: MdsdContainerLogTagName, Entries: msgPackEntries, @@ -1359,7 +1359,7 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { numContainerLogRecords = len(dataItemsADX) Log("Success::ADX::Successfully wrote %d container log records to ADX in %s", numContainerLogRecords, elapsed) - } else { //ODS + } else if ((ContainerLogSchemaV2 == true && len(dataItemsLAv2) > 0) || len(dataItemsLAv1) > 0) { //ODS var logEntry interface{} recordType := "" loglinesCount := 0 @@ -1401,19 +1401,19 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { if ResourceCentric == true { req.Header.Set("x-ms-AzureResourceId", ResourceID) } - + if IsAADMSIAuthMode == true { IngestionAuthTokenUpdateMutex.Lock() ingestionAuthToken := ODSIngestionAuthToken IngestionAuthTokenUpdateMutex.Unlock() - if ingestionAuthToken == "" { - Log("Error::ODS Ingestion Auth Token is empty. Please check error log.") + if ingestionAuthToken == "" { + Log("Error::ODS Ingestion Auth Token is empty. 
Please check error log.") return output.FLB_RETRY } // add authorization header to the req req.Header.Set("Authorization", "Bearer "+ingestionAuthToken) - } - + } + resp, err := HTTPClient.Do(req) elapsed = time.Since(start) @@ -1422,7 +1422,7 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { Log(message) // Commenting this out for now. TODO - Add better telemetry for ods errors using aggregation //SendException(message) - + Log("Failed to flush %d records after %s", loglinesCount, elapsed) return output.FLB_RETRY @@ -1510,7 +1510,7 @@ func GetContainerIDK8sNamespacePodNameFromFileName(filename string) (string, str } // InitializePlugin reads and populates plugin configuration -func InitializePlugin(pluginConfPath string, agentVersion string) { +func InitializePlugin(pluginConfPath string, agentVersion string) { go func() { isTest := os.Getenv("ISTEST") if strings.Compare(strings.ToLower(strings.TrimSpace(isTest)), "true") == 0 { @@ -1550,10 +1550,10 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { } ContainerType = os.Getenv(ContainerTypeEnv) - Log("Container Type %s", ContainerType) + Log("Container Type %s", ContainerType) osType := os.Getenv("OS_TYPE") - IsWindows = false + IsWindows = false // Linux if strings.Compare(strings.ToLower(osType), "windows") != 0 { Log("Reading configuration for Linux from %s", pluginConfPath) @@ -1572,7 +1572,7 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { SendException(message) time.Sleep(30 * time.Second) log.Fatalln(message) - } + } OMSEndpoint = "https://" + WorkspaceID + ".ods." + LogAnalyticsWorkspaceDomain + "/OperationalData.svc/PostJsonDataItems" // Populate Computer field containerHostName, err1 := ioutil.ReadFile(pluginConfig["container_host_file_path"]) @@ -1602,7 +1602,7 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { } } else { // windows - IsWindows = true + IsWindows = true Computer = os.Getenv("HOSTNAME") WorkspaceID = os.Getenv("WSID") logAnalyticsDomain := os.Getenv("DOMAIN") @@ -1614,7 +1614,7 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { IsAADMSIAuthMode = false if strings.Compare(strings.ToLower(os.Getenv(AADMSIAuthMode)), "true") == 0 { IsAADMSIAuthMode = true - Log("AAD MSI Auth Mode Configured") + Log("AAD MSI Auth Mode Configured") } ResourceID = os.Getenv(envAKSResourceID) @@ -1689,13 +1689,13 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { Log(message) } - PluginConfiguration = pluginConfig + PluginConfiguration = pluginConfig ContainerLogsRoute := strings.TrimSpace(strings.ToLower(os.Getenv("AZMON_CONTAINER_LOGS_ROUTE"))) Log("AZMON_CONTAINER_LOGS_ROUTE:%s", ContainerLogsRoute) - ContainerLogsRouteV2 = false - ContainerLogsRouteADX = false + ContainerLogsRouteV2 = false + ContainerLogsRouteADX = false if strings.Compare(ContainerLogsRoute, ContainerLogsADXRoute) == 0 { //check if adx clusteruri, clientid & secret are set @@ -1728,14 +1728,14 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { Log("Routing container logs thru %s route...", ContainerLogsADXRoute) fmt.Fprintf(os.Stdout, "Routing container logs thru %s route...\n", ContainerLogsADXRoute) } - } else if strings.Compare(strings.ToLower(osType), "windows") != 0 { //for linux, oneagent will be default route + } else if strings.Compare(strings.ToLower(osType), "windows") != 0 { //for linux, oneagent will be default route ContainerLogsRouteV2 = true //default is mdsd route - if 
strings.Compare(ContainerLogsRoute, ContainerLogsV1Route) == 0 { + if strings.Compare(ContainerLogsRoute, ContainerLogsV1Route) == 0 { ContainerLogsRouteV2 = false //fallback option when hiddensetting set } Log("Routing container logs thru %s route...", ContainerLogsRoute) fmt.Fprintf(os.Stdout, "Routing container logs thru %s route... \n", ContainerLogsRoute) - } + } if ContainerLogsRouteV2 == true { CreateMDSDClient(ContainerLogV2, ContainerType) @@ -1748,7 +1748,7 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { if IsWindows == false { // mdsd linux specific Log("Creating MDSD clients for KubeMonAgentEvents & InsightsMetrics") - CreateMDSDClient(KubeMonAgentEvents, ContainerType) + CreateMDSDClient(KubeMonAgentEvents, ContainerType) CreateMDSDClient(InsightsMetrics, ContainerType) } @@ -1787,7 +1787,7 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { } MdsdInsightsMetricsTagName = MdsdInsightsMetricsSourceName - MdsdKubeMonAgentEventsTagName = MdsdKubeMonAgentEventsSourceName + MdsdKubeMonAgentEventsTagName = MdsdKubeMonAgentEventsSourceName Log("ContainerLogsRouteADX: %v, IsWindows: %v, IsAADMSIAuthMode = %v \n", ContainerLogsRouteADX, IsWindows, IsAADMSIAuthMode) if !ContainerLogsRouteADX && IsWindows && IsAADMSIAuthMode { Log("defaultIngestionAuthTokenRefreshIntervalSeconds = %d \n", defaultIngestionAuthTokenRefreshIntervalSeconds) From ad31c55dbc49bc49bfe3cb18bb1d44fbda974947 Mon Sep 17 00:00:00 2001 From: Vladimir Date: Sat, 14 Aug 2021 01:18:23 +0100 Subject: [PATCH 138/194] Support az login for passwords starting with dash ('-') (#626) Co-authored-by: Vladimir Babichev --- scripts/onboarding/managed/disable-monitoring.sh | 2 +- scripts/onboarding/managed/enable-monitoring.sh | 2 +- scripts/onboarding/managed/upgrade-monitoring.sh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/onboarding/managed/disable-monitoring.sh b/scripts/onboarding/managed/disable-monitoring.sh index 29b755331..40b0793bc 100644 --- a/scripts/onboarding/managed/disable-monitoring.sh +++ b/scripts/onboarding/managed/disable-monitoring.sh @@ -116,7 +116,7 @@ remove_monitoring_tags() if [ "$isUsingServicePrincipal" = true ] ; then echo "login to the azure using provided service principal creds" - az login --service-principal --username $servicePrincipalClientId --password $servicePrincipalClientSecret --tenant $servicePrincipalTenantId + az login --service-principal --username="$servicePrincipalClientId" --password="$servicePrincipalClientSecret" --tenant="$servicePrincipalTenantId" else echo "login to the azure interactively" az login --use-device-code diff --git a/scripts/onboarding/managed/enable-monitoring.sh b/scripts/onboarding/managed/enable-monitoring.sh index 588d193a3..5fc241517 100644 --- a/scripts/onboarding/managed/enable-monitoring.sh +++ b/scripts/onboarding/managed/enable-monitoring.sh @@ -547,7 +547,7 @@ install_helm_chart() { login_to_azure() { if [ "$isUsingServicePrincipal" = true ]; then echo "login to the azure using provided service principal creds" - az login --service-principal --username $servicePrincipalClientId --password $servicePrincipalClientSecret --tenant $servicePrincipalTenantId + az login --service-principal --username="$servicePrincipalClientId" --password="$servicePrincipalClientSecret" --tenant="$servicePrincipalTenantId" else echo "login to the azure interactively" az login --use-device-code diff --git a/scripts/onboarding/managed/upgrade-monitoring.sh 
b/scripts/onboarding/managed/upgrade-monitoring.sh index 83643f3fa..edd48c938 100644 --- a/scripts/onboarding/managed/upgrade-monitoring.sh +++ b/scripts/onboarding/managed/upgrade-monitoring.sh @@ -266,7 +266,7 @@ upgrade_helm_chart_release() { login_to_azure() { if [ "$isUsingServicePrincipal" = true ]; then echo "login to the azure using provided service principal creds" - az login --service-principal --username $servicePrincipalClientId --password $servicePrincipalClientSecret --tenant $servicePrincipalTenantId + az login --service-principal --username="$servicePrincipalClientId" --password="$servicePrincipalClientSecret" --tenant="$servicePrincipalTenantId" else echo "login to the azure interactively" az login --use-device-code From 57beb59f38de4626b6f635a430b1e1bfa5d656ff Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 17 Aug 2021 15:15:03 -0700 Subject: [PATCH 139/194] Gangams/add telemetry fbit settings (#628) * add telemetry to track fbit settings * add telemetry to track fbit settings --- source/plugins/go/src/telemetry.go | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/source/plugins/go/src/telemetry.go b/source/plugins/go/src/telemetry.go index debe003e4..31818dbb3 100644 --- a/source/plugins/go/src/telemetry.go +++ b/source/plugins/go/src/telemetry.go @@ -145,8 +145,8 @@ func SendContainerLogPluginMetrics(telemetryPushIntervalProperty string) { ContainerLogTelemetryMutex.Unlock() if strings.Compare(strings.ToLower(os.Getenv("CONTROLLER_TYPE")), "daemonset") == 0 { + telemetryDimensions := make(map[string]string) if strings.Compare(strings.ToLower(os.Getenv("CONTAINER_TYPE")), "prometheussidecar") == 0 { - telemetryDimensions := make(map[string]string) telemetryDimensions["CustomPromMonitorPods"] = promMonitorPods if promMonitorPodsNamespaceLength > 0 { telemetryDimensions["CustomPromMonitorPodsNamespaceLength"] = strconv.Itoa(promMonitorPodsNamespaceLength) @@ -168,7 +168,23 @@ func SendContainerLogPluginMetrics(telemetryPushIntervalProperty string) { SendEvent(eventNameCustomPrometheusSidecarHeartbeat, telemetryDimensions) } else { - SendEvent(eventNameDaemonSetHeartbeat, make(map[string]string)) + fbitFlushIntervalSecs := os.Getenv("FBIT_SERVICE_FLUSH_INTERVAL") + if fbitFlushIntervalSecs != "" { + telemetryDimensions["FbitServiceFlushIntervalSecs"] = fbitFlushIntervalSecs + } + fbitTailBufferChunkSizeMBs := os.Getenv("FBIT_TAIL_BUFFER_CHUNK_SIZE") + if fbitTailBufferChunkSizeMBs != "" { + telemetryDimensions["FbitBufferChunkSizeMBs"] = fbitTailBufferChunkSizeMBs + } + fbitTailBufferMaxSizeMBs := os.Getenv("FBIT_TAIL_BUFFER_MAX_SIZE") + if fbitTailBufferMaxSizeMBs != "" { + telemetryDimensions["FbitBufferMaxSizeMBs"] = fbitTailBufferMaxSizeMBs + } + fbitTailMemBufLimitMBs := os.Getenv("FBIT_TAIL_MEM_BUF_LIMIT") + if fbitTailMemBufLimitMBs != "" { + telemetryDimensions["FbitMemBufLimitSizeMBs"] = fbitTailMemBufLimitMBs + } + SendEvent(eventNameDaemonSetHeartbeat, telemetryDimensions) flushRateMetric := appinsights.NewMetricTelemetry(metricNameAvgFlushRate, flushRate) TelemetryClient.Track(flushRateMetric) logRateMetric := appinsights.NewMetricTelemetry(metricNameAvgLogGenerationRate, logRate) From cf4775a802ca8c7c3aac451274459236e2b79c47 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Thu, 19 Aug 2021 15:05:53 -0700 Subject: [PATCH 140/194] check onboarding status (#629) --- kubernetes/linux/main.sh | 47 +++++++++++++++++++++++++++++++++++++++ kubernetes/linux/setup.sh | 2 +- 2 files changed, 48 insertions(+), 1 
deletion(-) diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index 4579787b3..4986e3113 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -38,6 +38,51 @@ waitforlisteneronTCPport() { fi } +checkAgentOnboardingStatus() { + local sleepdurationsecs=1 + local totalsleptsecs=0 + local isaadmsiauthmode=$1 + local waittimesecs=$2 + local numeric='^[0-9]+$' + + if [ -z "$1" ] || [ -z "$2" ]; then + echo "${FUNCNAME[0]} called with incorrect arguments<$1 , $2>. Required arguments <#isaadmsiauthmode, #wait-time-in-seconds>" + return -1 + else + + if [[ $waittimesecs =~ $numeric ]]; then + successMessage="Onboarding success" + failureMessage="Failed to register certificate with OMS Homing service, giving up" + if [ "${isaadmsiauthmode}" == "true" ]; then + successMessage="Loaded data sources" + failureMessage="Failed to load data sources into config" + fi + while true + do + if [ $totalsleptsecs -gt $waittimesecs ]; then + echo "${FUNCNAME[0]} giving up checking agent onboarding status after $totalsleptsecs secs" + return 1 + fi + + if grep "$successMessage" "${MDSD_LOG}/mdsd.info"; then + echo "Onboarding success" + return 0 + elif grep "$failureMessage" "${MDSD_LOG}/mdsd.err"; then + echo "Onboarding Failure: Reason: Failed to onboard the agent" + echo "Onboarding Failure: Please verify log analytics workspace configuration such as existence of the workspace, workspace key and workspace enabled for public ingestion" + return 1 + fi + sleep $sleepdurationsecs + totalsleptsecs=$(($totalsleptsecs+1)) + done + else + echo "${FUNCNAME[0]} called with non-numeric arguments<$2>. Required arguments <#wait-time-in-seconds>" + return -1 + fi + fi +} + + #using /var/opt/microsoft/docker-cimprov/state instead of /var/opt/microsoft/omsagent/state since the latter gets deleted during onboarding mkdir -p /var/opt/microsoft/docker-cimprov/state @@ -672,6 +717,8 @@ service rsyslog stop echo "getting rsyslog status..." 
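# --- Editor's aside (hypothetical usage sketch, not part of the patch above): the
# checkAgentOnboardingStatus helper defined in main.sh greps the mdsd logs for a
# success or failure marker, returning 0 on success and non-zero on failure,
# timeout, or bad arguments. A caller could therefore gate startup on it like this:
if checkAgentOnboardingStatus "$AAD_MSI_AUTH_MODE" 30; then
    echo "agent onboarding confirmed; continuing startup"
else
    echo "agent onboarding not confirmed; inspect ${MDSD_LOG}/mdsd.err"
fi
# --- end aside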
service rsyslog status +checkAgentOnboardingStatus $AAD_MSI_AUTH_MODE 30 + shutdown() { pkill -f mdsd } diff --git a/kubernetes/linux/setup.sh b/kubernetes/linux/setup.sh index df32afc7e..c14007d35 100644 --- a/kubernetes/linux/setup.sh +++ b/kubernetes/linux/setup.sh @@ -10,7 +10,7 @@ sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen && \ update-locale LANG=en_US.UTF-8 #install oneagent - Official bits (08/04/2021) -wget https://github.com/microsoft/Docker-Provider/releases/download/08042021-oneagent/azure-mdsd_1.10.1-build.master.251_x86_64.deb +wget https://github.com/microsoft/Docker-Provider/releases/download/06242021-oneagent/azure-mdsd_1.10.3-build.master.257_x86_64.deb /usr/bin/dpkg -i $TMPDIR/azure-mdsd*.deb cp -f $TMPDIR/mdsd.xml /etc/mdsd.d From da55fe53612aa9900331b6a0c798ea7f46d1fbf1 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Thu, 19 Aug 2021 15:06:32 -0700 Subject: [PATCH 141/194] Gangams/arc k8s conformance test updates (#617) * conf test updates * clean up * wip * update with mcr cidev image * handle log path * cleanup * clean up * wip * working * update for mcr image * minor * image update * handle latency of connected cluster resource creation * update conftest image --- README.md | 2 +- test/e2e/conformance.yaml | 15 ++ test/e2e/e2e-tests.yaml | 21 +- test/e2e/src/common/constants.py | 6 +- test/e2e/src/core/Dockerfile | 17 +- test/e2e/src/core/conftest.py | 38 +-- test/e2e/src/core/e2e_tests.sh | 200 ++++++++++++++- test/e2e/src/core/setup_failure_handler.py | 18 ++ test/e2e/src/tests/test_ds_workflows.py | 28 ++- test/e2e/src/tests/test_e2e_workflows.py | 231 +++++++++--------- .../tests/test_node_metrics_e2e_workflow.py | 66 ++--- .../tests/test_pod_metrics_e2e_workflow.py | 15 +- test/e2e/src/tests/test_resource_status.py | 13 +- test/e2e/src/tests/test_rs_workflows.py | 18 +- 14 files changed, 500 insertions(+), 188 deletions(-) create mode 100644 test/e2e/conformance.yaml create mode 100644 test/e2e/src/core/setup_failure_handler.py diff --git a/README.md b/README.md index 73bf858cd..e3ceedc8e 100644 --- a/README.md +++ b/README.md @@ -326,7 +326,7 @@ For DEV and PROD branches, automatically deployed latest yaml with latest agent docker build -f ./core/Dockerfile -t /: . docker push /: ``` -3. update existing agentest image tag in e2e-tests.yaml with newly built image tag with MCR repo +3. update existing agentest image tag in e2e-tests.yaml & conformance.yaml with newly built image tag with MCR repo # Scenario Tests Clusters are used in release pipeline already has the yamls under test\scenario deployed. Make sure to validate these scenarios. 
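(Editor's aside: step 3 of the README hunk above asks for the test image tag to be updated in both e2e-tests.yaml and conformance.yaml after building and pushing the new test image. A minimal sketch of that flow, run from test/e2e/src; the registry name and tag below are hypothetical placeholders, not values from the repo, and the manifests themselves ultimately reference the MCR cidev path:

    NEW_TAG=ciconftest09012021              # hypothetical tag following the ciconftest<date> convention
    REGISTRY=myacr.azurecr.io/cidev         # hypothetical registry/repo used for the initial push
    docker build -f ./core/Dockerfile -t $REGISTRY:$NEW_TAG .
    docker push $REGISTRY:$NEW_TAG
    # point both test manifests at the new tag (pattern assumes the ciconftest naming above)
    sed -i "s|cidev:ciconftest[0-9]*|cidev:$NEW_TAG|g" ../e2e-tests.yaml ../conformance.yaml

End of aside.)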
diff --git a/test/e2e/conformance.yaml b/test/e2e/conformance.yaml new file mode 100644 index 000000000..ff790e690 --- /dev/null +++ b/test/e2e/conformance.yaml @@ -0,0 +1,15 @@ +sonobuoy-config: + driver: Job + plugin-name: azure-arc-ci-conformance + result-format: junit +spec: + image: mcr.microsoft.com/azuremonitor/containerinsights/cidev:ciconftest08142021 + imagePullPolicy: Always + name: plugin + resources: {} + volumes: + - name: results + emptyDir: {} + volumeMounts: + - mountPath: /tmp/results + name: results diff --git a/test/e2e/e2e-tests.yaml b/test/e2e/e2e-tests.yaml index 06dfa1fb0..25817be12 100644 --- a/test/e2e/e2e-tests.yaml +++ b/test/e2e/e2e-tests.yaml @@ -68,7 +68,7 @@ data: containers: [] restartPolicy: Never serviceAccountName: sonobuoy-serviceaccount - nodeSelector: + nodeSelector: kubernetes.io/os: linux tolerations: - effect: NoSchedule @@ -84,8 +84,11 @@ data: result-format: junit spec: env: + # this should be false if the test environment is non ARC K8s for example AKS + - name: IS_NON_ARC_K8S_TEST_ENVIRONMENT + value: "true" # Update values of CLIENT_ID, CLIENT_SECRET of the service principal which has permission to query LA ad Metrics API - # Update value of TENANT_ID corresponding your Azure Service principal + # Update value of TENANT_ID corresponding your Azure Service principal - name: CLIENT_ID value: "SP_CLIENT_ID_VALUE" - name: CLIENT_SECRET @@ -93,15 +96,15 @@ data: - name: TENANT_ID value: "SP_TENANT_ID_VALUE" - name: DEFAULT_QUERY_TIME_INTERVAL_IN_MINUTES - value: "10" + value: "10" - name: DEFAULT_METRICS_QUERY_TIME_INTERVAL_IN_MINUTES - value: "10" + value: "10" - name: AGENT_POD_EXPECTED_RESTART_COUNT - value: "0" + value: "0" - name: AZURE_CLOUD - value: "AZURE_PUBLIC_CLOUD" - # image tag should be updated if new tests being added after this image - image: mcr.microsoft.com/azuremonitor/containerinsights/cidev:ciagenttest02152021 + value: "AZURE_PUBLIC_CLOUD" + # image tag should be updated if new tests being added after this image + image: mcr.microsoft.com/azuremonitor/containerinsights/cidev:ciconftest08142021 imagePullPolicy: IfNotPresent name: plugin resources: {} @@ -144,7 +147,7 @@ spec: name: output-volume restartPolicy: Never serviceAccountName: sonobuoy-serviceaccount - nodeSelector: + nodeSelector: kubernetes.io/os: linux tolerations: - key: "kubernetes.io/e2e-evict-taint-key" diff --git a/test/e2e/src/common/constants.py b/test/e2e/src/common/constants.py index 770964cb5..392b10554 100644 --- a/test/e2e/src/common/constants.py +++ b/test/e2e/src/common/constants.py @@ -40,6 +40,8 @@ TIMEOUT = 300 +# WAIT TIME BEFORE READING THE AGENT LOGS +AGENT_WAIT_TIME_SECS = "180" # Azure Monitor for Container Extension related AGENT_RESOURCES_NAMESPACE = 'kube-system' AGENT_DEPLOYMENT_NAME = 'omsagent-rs' @@ -47,7 +49,9 @@ AGENT_WIN_DAEMONSET_NAME = 'omsagent-win' AGENT_DEPLOYMENT_PODS_LABEL_SELECTOR = 'rsName=omsagent-rs' -AGENT_DAEMON_SET_PODS_LABEL_SELECTOR = 'component=oms-agent' +AGENT_DAEMON_SET_PODS_LABEL_SELECTOR = 'dsName=omsagent-ds' +AGENT_DAEMON_SET_PODS_LABEL_SELECTOR_NON_ARC = 'component=oms-agent' +AGENT_FLUENTD_LOG_PATH = '/var/opt/microsoft/docker-cimprov/log/fluentd.log' AGENT_OMSAGENT_LOG_PATH = '/var/opt/microsoft/omsagent/log/omsagent.log' AGENT_REPLICASET_WORKFLOWS = ["kubePodInventoryEmitStreamSuccess", "kubeNodeInventoryEmitStreamSuccess"] diff --git a/test/e2e/src/core/Dockerfile b/test/e2e/src/core/Dockerfile index 9f85bdf4c..cd85aee40 100644 --- a/test/e2e/src/core/Dockerfile +++ b/test/e2e/src/core/Dockerfile @@ -1,11 
+1,26 @@ FROM python:3.6 -RUN pip install --trusted-host pypi.org --trusted-host files.pythonhosted.org pytest pytest-xdist filelock requests kubernetes adal msrestazure +RUN pip install --trusted-host pypi.org --trusted-host files.pythonhosted.org pytest pytest-xdist filelock requests kubernetes adal msrestazure RUN curl https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 | bash \ && helm version +RUN apt-get update && apt-get -y upgrade && \ + apt-get -f -y install curl apt-transport-https lsb-release gnupg python3-pip python-pip && \ + curl -sL https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor > /etc/apt/trusted.gpg.d/microsoft.asc.gpg && \ + CLI_REPO=$(lsb_release -cs) && \ + echo "deb [arch=amd64] https://packages.microsoft.com/repos/azure-cli/ ${CLI_REPO} main" \ + > /etc/apt/sources.list.d/azure-cli.list && \ + apt-get update && \ + apt-get install -y azure-cli && \ + rm -rf /var/lib/apt/lists/* + +RUN python3 -m pip install junit_xml + +COPY --from=lachlanevenson/k8s-kubectl:v1.20.5 /usr/local/bin/kubectl /usr/local/bin/kubectl + COPY ./core/e2e_tests.sh / +COPY ./core/setup_failure_handler.py / COPY ./core/pytest.ini /e2etests/ COPY ./core/conftest.py /e2etests/ COPY ./core/helper.py /e2etests/ diff --git a/test/e2e/src/core/conftest.py b/test/e2e/src/core/conftest.py index e659d5189..02f644a18 100644 --- a/test/e2e/src/core/conftest.py +++ b/test/e2e/src/core/conftest.py @@ -22,42 +22,48 @@ def env_dict(): create_results_dir('/tmp/results') # Setting some environment variables - env_dict['SETUP_LOG_FILE'] = '/tmp/results/setup' + env_dict['SETUP_LOG_FILE'] = '/tmp/results/setup' env_dict['TEST_AGENT_LOG_FILE'] = '/tmp/results/containerinsights' env_dict['NUM_TESTS_COMPLETED'] = 0 - + print("Starting setup...") append_result_output("Starting setup...\n", env_dict['SETUP_LOG_FILE']) - + # Collecting environment variables env_dict['TENANT_ID'] = os.getenv('TENANT_ID') env_dict['CLIENT_ID'] = os.getenv('CLIENT_ID') env_dict['CLIENT_SECRET'] = os.getenv('CLIENT_SECRET') - + env_dict['IS_NON_ARC_K8S_TEST_ENVIRONMENT'] = os.getenv('IS_NON_ARC_K8S_TEST_ENVIRONMENT') + # released agent for Arc K8s still uses omsagent and when we rollout the agent with mdsd + # this shouldnt set after agent rollout with mdsd + env_dict['USING_OMSAGENT_BASE_AGENT'] = os.getenv('USING_OMSAGENT_BASE_AGENT') + + waitTimeInterval = int(os.getenv('AGENT_WAIT_TIME_SECS')) if os.getenv('AGENT_WAIT_TIME_SECS') else constants.AGENT_WAIT_TIME_SECS + env_dict['AGENT_WAIT_TIME_SECS'] = waitTimeInterval # get default query time interval for log analytics queries queryTimeInterval = int(os.getenv('DEFAULT_QUERY_TIME_INTERVAL_IN_MINUTES')) if os.getenv('DEFAULT_QUERY_TIME_INTERVAL_IN_MINUTES') else constants.DEFAULT_QUERY_TIME_INTERVAL_IN_MINUTES # add minute suffix since this format required for LA queries env_dict['DEFAULT_QUERY_TIME_INTERVAL_IN_MINUTES'] = str(queryTimeInterval) + "m" - + # get default query time interval for metrics queries env_dict['DEFAULT_METRICS_QUERY_TIME_INTERVAL_IN_MINUTES'] = int(os.getenv('DEFAULT_METRICS_QUERY_TIME_INTERVAL_IN_MINUTES')) if os.getenv('DEFAULT_METRICS_QUERY_TIME_INTERVAL_IN_MINUTES') else constants.DEFAULT_METRICS_QUERY_TIME_INTERVAL_IN_MINUTES - - - # expected agent pod restart count + + + # expected agent pod restart count env_dict['AGENT_POD_EXPECTED_RESTART_COUNT'] = int(os.getenv('AGENT_POD_EXPECTED_RESTART_COUNT')) if os.getenv('AGENT_POD_EXPECTED_RESTART_COUNT') else constants.AGENT_POD_EXPECTED_RESTART_COUNT # default to azure public 
cloud if AZURE_CLOUD not specified env_dict['AZURE_ENDPOINTS'] = constants.AZURE_CLOUD_DICT.get(os.getenv('AZURE_CLOUD')) if os.getenv('AZURE_CLOUD') else constants.AZURE_PUBLIC_CLOUD_ENDPOINTS - + if not env_dict.get('TENANT_ID'): pytest.fail('ERROR: variable TENANT_ID is required.') - + if not env_dict.get('CLIENT_ID'): pytest.fail('ERROR: variable CLIENT_ID is required.') - + if not env_dict.get('CLIENT_SECRET'): pytest.fail('ERROR: variable CLIENT_SECRET is required.') - + print("Setup Complete.") append_result_output("Setup Complete.\n", env_dict['SETUP_LOG_FILE']) @@ -66,22 +72,22 @@ def env_dict(): else: with Path.open(my_file, "rb") as f: env_dict = pickle.load(f) - + yield env_dict - + my_file = Path("env.pkl") with FileLock(str(my_file) + ".lock"): with Path.open(my_file, "rb") as f: env_dict = pickle.load(f) env_dict['NUM_TESTS_COMPLETED'] = 1 + env_dict.get('NUM_TESTS_COMPLETED') - if env_dict['NUM_TESTS_COMPLETED'] == int(os.getenv('NUM_TESTS')): + if env_dict['NUM_TESTS_COMPLETED'] == int(os.getenv('NUM_TESTS')): # Checking if cleanup is required. if os.getenv('SKIP_CLEANUP'): return print('Starting cleanup...') append_result_output("Starting Cleanup...\n", env_dict['SETUP_LOG_FILE']) - + print("Cleanup Complete.") append_result_output("Cleanup Complete.\n", env_dict['SETUP_LOG_FILE']) return diff --git a/test/e2e/src/core/e2e_tests.sh b/test/e2e/src/core/e2e_tests.sh index 3bfafdce9..dd9d93073 100644 --- a/test/e2e/src/core/e2e_tests.sh +++ b/test/e2e/src/core/e2e_tests.sh @@ -1,7 +1,158 @@ -#!/bin/sh +#!/bin/bash +set -x results_dir="${RESULTS_DIR:-/tmp/results}" +waitForResourcesReady() { + ready=false + max_retries=60 + sleep_seconds=10 + NAMESPACE=$1 + RESOURCETYPE=$2 + RESOURCE=$3 + # if resource not specified, set to --all + if [ -z $RESOURCE ]; then + RESOURCE="--all" + fi + for i in $(seq 1 $max_retries) + do + if [[ ! $(kubectl wait --for=condition=Ready ${RESOURCETYPE} ${RESOURCE} --namespace ${NAMESPACE}) ]]; then + echo "waiting for the resource:${RESOURCE} of the type:${RESOURCETYPE} in namespace:${NAMESPACE} to be ready state, iteration:${i}" + sleep ${sleep_seconds} + else + echo "resource:${RESOURCE} of the type:${RESOURCETYPE} in namespace:${NAMESPACE} in ready state" + ready=true + break + fi + done + + echo "waitForResourcesReady state: $ready" +} + + +waitForArcK8sClusterCreated() { + connectivityState=false + max_retries=60 + sleep_seconds=10 + for i in $(seq 1 $max_retries) + do + echo "iteration: ${i}, clustername: ${CLUSTER_NAME}, resourcegroup: ${RESOURCE_GROUP}" + clusterState=$(az connectedk8s show --name $CLUSTER_NAME --resource-group $RESOURCE_GROUP --query connectivityStatus -o json) + clusterState=$(echo $clusterState | tr -d '"' | tr -d '"\r\n') + echo "cluster current state: ${clusterState}" + if [ ! 
-z "$clusterState" ]; then + if [[ ("${clusterState}" == "Connected") || ("${clusterState}" == "Connecting") ]]; then + connectivityState=true + break + fi + fi + sleep ${sleep_seconds} + done + echo "Arc K8s cluster connectivityState: $connectivityState" +} + +waitForCIExtensionInstalled() { + installedState=false + max_retries=60 + sleep_seconds=10 + for i in $(seq 1 $max_retries) + do + echo "iteration: ${i}, clustername: ${CLUSTER_NAME}, resourcegroup: ${RESOURCE_GROUP}" + installState=$(az k8s-extension show --cluster-name $CLUSTER_NAME --resource-group $RESOURCE_GROUP --cluster-type connectedClusters --name azuremonitor-containers --query installState -o json) + installState=$(echo $installState | tr -d '"' | tr -d '"\r\n') + echo "extension install state: ${installState}" + if [ ! -z "$installState" ]; then + if [ "${installState}" == "Installed" ]; then + installedState=true + break + fi + fi + sleep ${sleep_seconds} + done + echo "container insights extension installedState: $installedState" +} + +validateCommonParameters() { + if [ -z $TENANT_ID ]; then + echo "ERROR: parameter TENANT_ID is required." > ${results_dir}/error + python3 setup_failure_handler.py + fi + if [ -z $CLIENT_ID ]; then + echo "ERROR: parameter CLIENT_ID is required." > ${results_dir}/error + python3 setup_failure_handler.py + fi + + if [ -z $CLIENT_SECRET ]; then + echo "ERROR: parameter CLIENT_SECRET is required." > ${results_dir}/error + python3 setup_failure_handler.py + fi +} + +validateArcConfTestParameters() { + if [ -z $SUBSCRIPTION_ID ]; then + echo "ERROR: parameter SUBSCRIPTION_ID is required." > ${results_dir}/error + python3 setup_failure_handler.py + fi + + if [ -z $RESOURCE_GROUP ]]; then + echo "ERROR: parameter RESOURCE_GROUP is required." > ${results_dir}/error + python3 setup_failure_handler.py + fi + + if [ -z $CLUSTER_NAME ]; then + echo "ERROR: parameter CLUSTER_NAME is required." > ${results_dir}/error + python3 setup_failure_handler.py + fi +} + +addArcConnectedK8sExtension() { + echo "adding Arc K8s connectedk8s extension" + az extension add --name connectedk8s 2> ${results_dir}/error || python3 setup_failure_handler.py +} + +addArcK8sCLIExtension() { + echo "adding Arc K8s k8s-extension extension" + az extension add --name k8s-extension +} + +createArcCIExtension() { + echo "creating extension type: Microsoft.AzureMonitor.Containers" + basicparameters="--cluster-name $CLUSTER_NAME --resource-group $RESOURCE_GROUP --cluster-type connectedClusters --extension-type Microsoft.AzureMonitor.Containers --scope cluster --name azuremonitor-containers" + if [ ! -z "$CI_ARC_RELEASE_TRAIN" ]; then + basicparameters="$basicparameters --release-train $CI_ARC_RELEASE_TRAIN" + fi + if [ ! 
-z "$CI_ARC_VERSION" ]; then + basicparameters="$basicparameters --version $CI_ARC_VERSION" + fi + + az k8s-extension create $basicparameters --configuration-settings omsagent.ISTEST=true +} + +showArcCIExtension() { + echo "arc ci extension status" + az k8s-extension show --cluster-name $CLUSTER_NAME --resource-group $RESOURCE_GROUP --cluster-type connectedClusters --name azuremonitor-containers +} + +deleteArcCIExtension() { + az k8s-extension delete --name azuremonitor-containers \ + --cluster-type connectedClusters \ + --cluster-name $CLUSTER_NAME \ + --resource-group $RESOURCE_GROUP --yes +} + +login_to_azure() { + # Login with service principal + echo "login to azure using the SP creds" + az login --service-principal \ + -u ${CLIENT_ID} \ + -p ${CLIENT_SECRET} \ + --tenant ${TENANT_ID} 2> ${results_dir}/error || python3 setup_failure_handler.py + + echo "setting subscription: ${SUBSCRIPTION_ID} as default subscription" + az account set -s $SUBSCRIPTION_ID +} + + # saveResults prepares the results for handoff to the Sonobuoy worker. # See: https://github.com/vmware-tanzu/sonobuoy/blob/master/docs/plugins.md saveResults() { @@ -17,6 +168,50 @@ saveResults() { # Ensure that we tell the Sonobuoy worker we are done regardless of results. trap saveResults EXIT +# validate common params +validateCommonParameters + +IS_ARC_K8S_ENV="true" +if [ -z $IS_NON_ARC_K8S_TEST_ENVIRONMENT ]; then + echo "arc k8s environment" +else + if [ "$IS_NON_ARC_K8S_TEST_ENVIRONMENT" = "true" ]; then + IS_ARC_K8S_ENV="false" + echo "non arc k8s environment" + fi +fi + +if [ "$IS_ARC_K8S_ENV" = "false" ]; then + echo "skipping installing of ARC K8s container insights extension since the test environment is non-arc K8s" +else + # validate params + validateArcConfTestParameters + + # login to azure + login_to_azure + + # add arc k8s connectedk8s extension + addArcConnectedK8sExtension + + # wait for arc k8s pods to be ready state + waitForResourcesReady azure-arc pods + + # wait for Arc K8s cluster to be created + waitForArcK8sClusterCreated + + # add CLI extension + addArcK8sCLIExtension + + # add ARC K8s container insights extension + createArcCIExtension + + # show the ci extension status + showArcCIExtension + + #wait for extension state to be installed + waitForCIExtensionInstalled +fi + # The variable 'TEST_LIST' should be provided if we want to run specific tests. 
If not provided, all tests are run NUM_PROCESS=$(pytest /e2etests/ --collect-only -k "$TEST_NAME_LIST" -m "$TEST_MARKER_LIST" | grep " Date: Fri, 20 Aug 2021 09:43:42 -0700 Subject: [PATCH 142/194] upgrade golang version for windows in pipeline build and locally (#630) --- ....windows.official.all_tag.all_phase.all_config.ci_prod.yml | 2 +- .pipelines/pipeline.user.windows.yml | 2 +- scripts/build/windows/install-build-pre-requisites.ps1 | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.pipelines/pipeline.user.windows.official.all_tag.all_phase.all_config.ci_prod.yml b/.pipelines/pipeline.user.windows.official.all_tag.all_phase.all_config.ci_prod.yml index 8ae069e90..0dc0a47c5 100644 --- a/.pipelines/pipeline.user.windows.official.all_tag.all_phase.all_config.ci_prod.yml +++ b/.pipelines/pipeline.user.windows.official.all_tag.all_phase.all_config.ci_prod.yml @@ -5,7 +5,7 @@ environment: version: '2019' runtime: provider: 'appcontainer' - image: 'cdpxwin1809.azurecr.io/user/azure-monitor/container-insights:6.0' + image: 'cdpxwin1809.azurecr.io/user/azure-monitor/container-insights:latest' source_mode: 'map' version: diff --git a/.pipelines/pipeline.user.windows.yml b/.pipelines/pipeline.user.windows.yml index 82dd30cd0..e9d0105ab 100644 --- a/.pipelines/pipeline.user.windows.yml +++ b/.pipelines/pipeline.user.windows.yml @@ -5,7 +5,7 @@ environment: version: '2019' runtime: provider: 'appcontainer' - image: 'cdpxwin1809.azurecr.io/user/azure-monitor/container-insights:6.0' + image: 'cdpxwin1809.azurecr.io/user/azure-monitor/container-insights:latest' source_mode: 'map' version: diff --git a/scripts/build/windows/install-build-pre-requisites.ps1 b/scripts/build/windows/install-build-pre-requisites.ps1 index 3bb56ac2a..7f1c9b54f 100755 --- a/scripts/build/windows/install-build-pre-requisites.ps1 +++ b/scripts/build/windows/install-build-pre-requisites.ps1 @@ -13,8 +13,8 @@ function Install-Go { exit } - $url = "https://dl.google.com/go/go1.14.1.windows-amd64.msi" - $output = Join-Path -Path $tempGo -ChildPath "go1.14.1.windows-amd64.msi" + $url = "https://dl.google.com/go/go1.15.14.windows-amd64.msi" + $output = Join-Path -Path $tempGo -ChildPath "go1.15.14.windows-amd64.msi" Write-Host("downloading go msi into directory path : " + $output + " ...") Invoke-WebRequest -Uri $url -OutFile $output -ErrorAction Stop Write-Host("downloading of go msi into directory path : " + $output + " completed") From 3a02a4f89fd8ff8bd47a1de0ab7bbd6f86a65f71 Mon Sep 17 00:00:00 2001 From: David Michelman Date: Tue, 24 Aug 2021 17:37:38 -0700 Subject: [PATCH 143/194] Updating a link in Readme.md (#632) The link to the build pipelines now goes directly to our build pipelines (instead of to all github-private pipelines) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e3ceedc8e..3cd466bb9 100644 --- a/README.md +++ b/README.md @@ -259,7 +259,7 @@ docker push /: # Azure DevOps Build Pipeline -Navigate to https://github-private.visualstudio.com/microsoft/_build?view=pipelines to see Linux and Windows Agent build pipelines. These pipelines are configured with CI triggers for ci_dev and ci_prod. +Navigate to https://github-private.visualstudio.com/microsoft/_build?definitionScope=%5CCDPX%5Cdocker-provider to see Linux and Windows Agent build pipelines. These pipelines are configured with CI triggers for ci_dev and ci_prod. 
Docker Images will be pushed to CDPX ACR repos and these needs to retagged and pushed to corresponding ACR or docker hub. Only onboarded Azure AD AppId has permission to pull the images from CDPx ACRs. From e56c74ba66744f87e0b630d7f6bbb5ba3a56428c Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Wed, 25 Aug 2021 15:30:11 -0700 Subject: [PATCH 144/194] Updating omsagent yaml to have parity with omsagent yaml file in AKS RP (#615) --- kubernetes/omsagent.yaml | 124 +++++++++++++++++++++------------------ 1 file changed, 67 insertions(+), 57 deletions(-) diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 49d4586c1..d84e46701 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -400,6 +400,8 @@ spec: value: "" - name: AZMON_CONTAINERLOGS_ONEAGENT_REGIONS value: "koreacentral,norwayeast,eastus2" + - name: USING_AAD_MSI_AUTH + value: "false" securityContext: privileged: true ports: @@ -445,59 +447,65 @@ spec: periodSeconds: 60 timeoutSeconds: 15 #Only in sidecar scraping mode - - name: omsagent-prometheus - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod08052021" - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: 500m - memory: 1Gi - requests: - cpu: 75m - memory: 225Mi - env: - # azure devops pipeline uses AKS_RESOURCE_ID and AKS_REGION hence ensure to uncomment these - - name: AKS_RESOURCE_ID - value: "VALUE_AKS_RESOURCE_ID_VALUE" - - name: AKS_REGION - value: "VALUE_AKS_RESOURCE_REGION_VALUE" - #Uncomment below two lines for ACS clusters and set the cluster names manually. Also comment out the above two lines for ACS clusters - #- name: ACS_RESOURCE_NAME - # value: "my_acs_cluster_name" - - name: CONTAINER_TYPE - value: "PrometheusSidecar" - - name: CONTROLLER_TYPE - value: "DaemonSet" - - name: NODE_IP - valueFrom: - fieldRef: - fieldPath: status.hostIP - # Update this with the user assigned msi client id for omsagent - - name: USER_ASSIGNED_IDENTITY_CLIENT_ID - value: "" - securityContext: - privileged: true - volumeMounts: - - mountPath: /etc/kubernetes/host - name: azure-json-path - - mountPath: /etc/omsagent-secret - name: omsagent-secret - readOnly: true - - mountPath: /etc/config/settings - name: settings-vol-config - readOnly: true - - mountPath: /etc/config/osm-settings - name: osm-settings-vol-config - readOnly: true - livenessProbe: - exec: - command: - - /bin/bash - - -c - - /opt/livenessprobe.sh - initialDelaySeconds: 60 - periodSeconds: 60 - timeoutSeconds: 15 + # - name: omsagent-prometheus + # image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod06112021" + # imagePullPolicy: IfNotPresent + # resources: + # limits: + # cpu: 500m + # memory: 1Gi + # requests: + # cpu: 75m + # memory: 225Mi + # env: + # # azure devops pipeline uses AKS_RESOURCE_ID and AKS_REGION hence ensure to uncomment these + # - name: AKS_CLUSTER_NAME + # value: "VALUE_AKS_CLUSTER_NAME" + # - name: AKS_RESOURCE_ID + # value: "VALUE_AKS_RESOURCE_ID_VALUE" + # - name: AKS_REGION + # value: "VALUE_AKS_RESOURCE_REGION_VALUE" + # - name: AKS_NODE_RESOURCE_GROUP + # value: "VALUE_AKS_NODE_RESOURCE_GROUP" + # #Uncomment below two lines for ACS clusters and set the cluster names manually. 
Also comment out the above two lines for ACS clusters + # #- name: ACS_RESOURCE_NAME + # # value: "my_acs_cluster_name" + # - name: CONTAINER_TYPE + # value: "PrometheusSidecar" + # - name: CONTROLLER_TYPE + # value: "DaemonSet" + # - name: NODE_IP + # valueFrom: + # fieldRef: + # fieldPath: status.hostIP + # # Update this with the user assigned msi client id for omsagent + # - name: USER_ASSIGNED_IDENTITY_CLIENT_ID + # value: "" + # - name: USING_AAD_MSI_AUTH + # value: "false" + # securityContext: + # privileged: true + # volumeMounts: + # - mountPath: /etc/kubernetes/host + # name: azure-json-path + # - mountPath: /etc/omsagent-secret + # name: omsagent-secret + # readOnly: true + # - mountPath: /etc/config/settings + # name: settings-vol-config + # readOnly: true + # - mountPath: /etc/config/osm-settings + # name: osm-settings-vol-config + # readOnly: true + # livenessProbe: + # exec: + # command: + # - /bin/bash + # - -c + # - /opt/livenessprobe.sh + # initialDelaySeconds: 60 + # periodSeconds: 60 + # timeoutSeconds: 15 affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -620,7 +628,9 @@ spec: value: "" # Add the below environment variable to true only in sidecar enabled regions, else set it to false - name: SIDECAR_SCRAPING_ENABLED - value: "true" + value: "false" + - name: USING_AAD_MSI_AUTH + value: "false" securityContext: privileged: true ports: @@ -789,13 +799,13 @@ spec: fieldRef: fieldPath: status.hostIP - name: SIDECAR_SCRAPING_ENABLED - value: "true" + value: "false" # Update this with the user assigned msi client id for omsagent - name: USER_ASSIGNED_IDENTITY_CLIENT_ID value: "" # Add this only for clouds that require cert bootstrapping - - name: REQUIRES_CERT_BOOTSTRAP - value: "true" + # - name: REQUIRES_CERT_BOOTSTRAP + # value: "true" volumeMounts: - mountPath: C:\ProgramData\docker\containers name: docker-windows-containers From d2817cb644027e0038e58b8dadd7710e9dfedbf4 Mon Sep 17 00:00:00 2001 From: David Michelman Date: Fri, 27 Aug 2021 01:37:36 -0700 Subject: [PATCH 145/194] Unit test tooling (#625) Added tooling and examples for unit tests --- .github/workflows/run_unit_tests.yml | 30 + .gitignore | 4 + Dev Guide.md | 125 ++ .../installer/datafiles/base_container.data | 8 +- build/windows/Makefile.ps1 | 8 +- kubernetes/windows/Dockerfile | 1 - kubernetes/windows/Dockerfile-dev-image | 1 - source/plugins/go/src/extension/extension.go | 54 +- .../go/src/extension/extension_test.go | 74 + .../plugins/go/src/extension/socket_writer.go | 59 +- source/plugins/go/src/go.mod | 1 + source/plugins/go/src/go.sum | 1 + source/plugins/go/src/utils.go | 120 +- source/plugins/go/src/utils_test.go | 79 + .../plugins/ruby/CAdvisorMetricsAPIClient.rb | 4 +- source/plugins/ruby/KubernetesApiClient.rb | 41 +- source/plugins/ruby/constants.rb | 4 +- source/plugins/{utils => ruby}/extension.rb | 0 .../{utils => ruby}/extension_utils.rb | 0 source/plugins/ruby/in_kube_nodes.rb | 195 +- source/plugins/ruby/in_kube_nodes_test.rb | 171 ++ source/plugins/{utils => ruby}/oms_common.rb | 0 source/plugins/{utils => ruby}/omslog.rb | 0 .../kube-nodes-malformed.txt | 1674 +++++++++++++++++ .../canned-api-responses/kube-nodes.txt | 851 +++++++++ test/unit-tests/run_go_tests.sh | 12 + test/unit-tests/run_ruby_tests.sh | 13 + test/unit-tests/test_driver.rb | 13 + 28 files changed, 3331 insertions(+), 212 deletions(-) create mode 100644 .github/workflows/run_unit_tests.yml create mode 100644 Dev Guide.md create mode 100644 source/plugins/go/src/extension/extension_test.go create 
mode 100644 source/plugins/go/src/utils_test.go rename source/plugins/{utils => ruby}/extension.rb (100%) rename source/plugins/{utils => ruby}/extension_utils.rb (100%) create mode 100644 source/plugins/ruby/in_kube_nodes_test.rb rename source/plugins/{utils => ruby}/oms_common.rb (100%) rename source/plugins/{utils => ruby}/omslog.rb (100%) create mode 100644 test/unit-tests/canned-api-responses/kube-nodes-malformed.txt create mode 100644 test/unit-tests/canned-api-responses/kube-nodes.txt create mode 100755 test/unit-tests/run_go_tests.sh create mode 100755 test/unit-tests/run_ruby_tests.sh create mode 100644 test/unit-tests/test_driver.rb diff --git a/.github/workflows/run_unit_tests.yml b/.github/workflows/run_unit_tests.yml new file mode 100644 index 000000000..29f5afc7a --- /dev/null +++ b/.github/workflows/run_unit_tests.yml @@ -0,0 +1,30 @@ +name: Run Unit Tests +on: + pull_request: + types: [opened, synchronize, reopened] + branches: + - ci_dev + - ci_prod +jobs: + Golang-Tests: + runs-on: ubuntu-latest + steps: + - name: Check out repository code + uses: actions/checkout@v2 + - name: Run unit tests + run: | + cd ${{ github.workspace }} + ./test/unit-tests/run_go_tests.sh + Ruby-Tests: + runs-on: ubuntu-latest + steps: + - name: Check out repository code + uses: actions/checkout@v2 + - name: install fluent + run: | + sudo gem install fluentd -v "1.12.2" --no-document + sudo fluentd --setup ./fluent + - name: Run unit tests + run: | + cd ${{ github.workspace }} + ./test/unit-tests/run_ruby_tests.sh diff --git a/.gitignore b/.gitignore index 2e2978e91..b0467519c 100644 --- a/.gitignore +++ b/.gitignore @@ -26,3 +26,7 @@ intermediate kubernetes/linux/Linux_ULINUX_1.0_x64_64_Release # ignore generated .h files for go source/plugins/go/src/*.h +*_mock.go +*_log.txt +*.log +*.byebug_history diff --git a/Dev Guide.md b/Dev Guide.md new file mode 100644 index 000000000..7057a4afe --- /dev/null +++ b/Dev Guide.md @@ -0,0 +1,125 @@ +# Dev Guide + +More advanced information needed to develop or build the docker provider will live here + + + +## Testing +Last updated 8/18/2021 + +To run all unit tests, run the commands `test/unit-tests/run_go_tests.sh` and `test/unit-tests/run_ruby_tests.sh` + +#### Conventions: +1. Unit tests should go in their own file, but in the same folder as the source code they're testing. For example, the tests for `in_kube_nodes.rb` are in `in_kube_nodes_test.rb`. Both files are in the folder `source/plugins/ruby`. + +### Ruby +Sample tests are provided in [in_kube_nodes_test.rb](source/plugins/ruby/in_kube_nodes_test.rb). They are meant to demo the tooling used for unit tests (as opposed to being comprehensive tests). Basic techniques like mocking are demonstrated there. + +#### Conventions: +1. When modifying a fluentd plugin for unit testing, any mocked classes (like KubernetesApiClient, applicationInsightsUtility, env, etc.) should be passed in as optional arguments of initialize. For example: +``` + def initialize + super +``` +would be turned into +``` + def initialize (kubernetesApiClient=nil, applicationInsightsUtility=nil, extensionUtils=nil, env=nil) + super() +``` + +2. Having end-to-end tests of all fluentd plugins is a longshot. We care more about unit testing smaller blocks of functionality (like all the helper functions in KubeNodeInventory.rb). Unit tests for fluentd plugins are not expected. + +### Golang + +Since golang is statically compiled, mocking requires a lot more work than in ruby.
Sample tests are provided in [utils_test.go](source/plugins/go/src/utils_test.go) and [extension_test.go](source/plugins/go/src/extension/extension_test.go). Again, they are meant to demo the tooling used for unit tests (as opposed to being comprehensive tests). Basic techniques like mocking are demonstrated there. + +#### Mocking: +Mocks are generated with gomock (mockgen). +* Mock files should be called *_mock.go (socket_writer.go => socket_writer_mock.go) +* Mocks should not be checked in to git. (they have been added to the .gitignore) +* The command to generate mock files should go in a `//go:generate` comment at the top of the mocked file (see [socket_writer.go](source/plugins/go/src/extension/socket_writer.go) for an example). This way mocks can be generated by the unit test script. +* Mocks also go in the same folder as the mocked files. This is unfortunate, but necessary to avoid circular package dependencies (anyone else feel free to figure out how to move mocks to a separate folder) + +Using mocks is also a little tricky. In order to mock functions in a package with gomock, they must be converted to receiver methods of a struct. This way the struct can be swapped out at runtime to change which implementations of a method are called. See the example below: + +``` +// declare all functions to be mocked in this interface +type registrationPreCheckerInterface interface { + FUT(string) bool +} + +// Create a struct which implements the above interface +type regPreCheck struct{} + +func (r regPreCheck) FUT(email string) bool { + fmt.Println("real FUT() called") + return true +} + +// Create a global variable and assign it to the struct +var regPreCondVar registrationPreCheckerInterface + +func init() { + regPreCondVar = regPreCheck{} +} +``` + +Now any code wishing to call FUT() will call `regPreCondVar.FUT("")`. + +A unit test can substitute its own implementation of FUT() like so: + +``` +// This will hold the mock of FUT we want to substitute +var FUTMock func(email string) bool + +// create a new struct which implements the earlier interface +type regPreCheckMock struct{} + +func (u regPreCheckMock) FUT(email string) bool { + return FUTMock(email) +} +``` + +Everything is set up. Now a unit test can substitute in a mock like so: + +``` +func someUnitTest() { + // This will call the actual implementation of FUT() + regPreCondVar.FUT("") + + // Now the test creates another struct to substitute. After this line, all calls to FUT() will be diverted + regPreCondVar = regPreCheckMock{} + + // substitute another function to run instead of FUT() + FUTMock = func(email string) bool { + fmt.Println("FUT 1 called") + return false + } + // This will call the function defined right above + regPreCondVar.FUT("") + + // We can substitute another implementation + FUTMock = func(email string) bool { + fmt.Println("FUT 2 called") + return false + } + regPreCondVar.FUT("") + + // put the old behavior back + regPreCondVar = regPreCheck{} + // this will call the actual implementation of FUT() + regPreCondVar.FUT("") + +} +``` + +A concrete example of this can be found in [socket_writer.go](source/plugins/go/src/extension/socket_writer.go) and [extension_test.go](source/plugins/go/src/extension/extension_test.go). Again, if anybody has a better way, feel free to update this guide. + + +
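+For reference, this is the `//go:generate` directive this change adds to [socket_writer.go](source/plugins/go/src/extension/socket_writer.go); assuming mockgen is installed and on the PATH, running `go generate ./...` from the module root regenerates the mocks: +``` +//go:generate mockgen -destination=socket_writer_mock.go -package=extension Docker-Provider/source/plugins/go/src/extension IFluentSocketWriter +``` +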
+A simpler way to test a specific function is to write wrapper functions. Test code calls the inner function (ReadFileContentsImpl) and product code calls the wrapper function (ReadFileContents). The wrapper function provides any outside state which a unit test would want to control (like a function to read a file). This option makes product code more verbose, but probably easier to read too. Either way is acceptable. +``` +func ReadFileContents(fullPathToFileName string) (string, error) { + return ReadFileContentsImpl(fullPathToFileName, ioutil.ReadFile) +} +``` diff --git a/build/linux/installer/datafiles/base_container.data b/build/linux/installer/datafiles/base_container.data index b71cafd49..d104a5084 100644 --- a/build/linux/installer/datafiles/base_container.data +++ b/build/linux/installer/datafiles/base_container.data @@ -148,10 +148,10 @@ MAINTAINER: 'Microsoft Corporation' /etc/fluent/plugin/MdmMetricsGenerator.rb; source/plugins/ruby/MdmMetricsGenerator.rb; 644; root; root /etc/fluent/plugin/MdmAlertTemplates.rb; source/plugins/ruby/MdmAlertTemplates.rb; 644; root; root -/etc/fluent/plugin/omslog.rb; source/plugins/utils/omslog.rb; 644; root; root -/etc/fluent/plugin/oms_common.rb; source/plugins/utils/oms_common.rb; 644; root; root -/etc/fluent/plugin/extension.rb; source/plugins/utils/extension.rb; 644; root; root -/etc/fluent/plugin/extension_utils.rb; source/plugins/utils/extension_utils.rb; 644; root; root +/etc/fluent/plugin/omslog.rb; source/plugins/ruby/omslog.rb; 644; root; root +/etc/fluent/plugin/oms_common.rb; source/plugins/ruby/oms_common.rb; 644; root; root +/etc/fluent/plugin/extension.rb; source/plugins/ruby/extension.rb; 644; root; root +/etc/fluent/plugin/extension_utils.rb; source/plugins/ruby/extension_utils.rb; 644; root; root /etc/fluent/kube.conf; build/linux/installer/conf/kube.conf; 644; root; root diff --git a/build/windows/Makefile.ps1 b/build/windows/Makefile.ps1 index 737abc92a..b9bd1f3e4 100644 --- a/build/windows/Makefile.ps1 +++ b/build/windows/Makefile.ps1 @@ -183,11 +183,7 @@ Write-Host("successfully copied installer files conf and scripts from :" + $inst $rubyplugindir = Join-Path -Path $rootdir -ChildPath "source\plugins\ruby" Write-Host("copying ruby source files from :" + $rubyplugindir + " to :" + $publishdir + " ...") Copy-Item -Path $rubyplugindir -Destination $publishdir -Recurse -Force +Get-ChildItem $Path | Where{$_.Name -Match ".*_test\.rb"} | Remove-Item Write-Host("successfully copied ruby source files from :" + $rubyplugindir + " to :" + $publishdir + " ") -ForegroundColor Green -$utilsplugindir = Join-Path -Path $rootdir -ChildPath "source\plugins\utils" -Write-Host("copying ruby util files from :" + $utilsplugindir + " to :" + $publishdir + " ...") -Copy-Item -Path $utilsplugindir -Destination $publishdir -Recurse -Force -Write-Host("successfully copied ruby util files from :" + $utilsplugindir + " to :" + $publishdir + " ") -ForegroundColor Green - -Set-Location $currentdir \ No newline at end of file +Set-Location $currentdir diff --git a/kubernetes/windows/Dockerfile b/kubernetes/windows/Dockerfile index 0ba64cd75..290deef40 100644 --- a/kubernetes/windows/Dockerfile +++ b/kubernetes/windows/Dockerfile @@ -71,7 +71,6 @@ COPY ./omsagentwindows/installer/scripts/rubyKeepCertificateAlive/*.rb /etc/flue #Copy fluentd ruby plugins COPY ./omsagentwindows/ruby/ /etc/fluent/plugin/ -COPY ./omsagentwindows/utils/*.rb /etc/fluent/plugin/ ENV AGENT_VERSION ${IMAGE_TAG} ENV OS_TYPE "windows" diff --git a/kubernetes/windows/Dockerfile-dev-image b/kubernetes/windows/Dockerfile-dev-image index 6764ef8c4..35aa83bd9 100644 --- a/kubernetes/windows/Dockerfile-dev-image +++ b/kubernetes/windows/Dockerfile-dev-image @@ -33,7 +33,6 @@
COPY ./omsagentwindows/installer/scripts/rubyKeepCertificateAlive/*.rb /etc/flue #Copy fluentd ruby plugins COPY ./omsagentwindows/ruby/ /etc/fluent/plugin/ -COPY ./omsagentwindows/utils/*.rb /etc/fluent/plugin/ ENV AGENT_VERSION ${IMAGE_TAG} ENV OS_TYPE "windows" diff --git a/source/plugins/go/src/extension/extension.go b/source/plugins/go/src/extension/extension.go index c68140ded..4d78380bc 100644 --- a/source/plugins/go/src/extension/extension.go +++ b/source/plugins/go/src/extension/extension.go @@ -1,12 +1,13 @@ package extension -import ( +import ( "encoding/json" "fmt" - "log" + "log" + "strings" "sync" - "strings" - uuid "github.com/google/uuid" + + uuid "github.com/google/uuid" "github.com/ugorji/go/codec" ) @@ -14,31 +15,31 @@ type Extension struct { datatypeStreamIdMap map[string]string } -var singleton *Extension +var singleton *Extension var once sync.Once var extensionconfiglock sync.Mutex var logger *log.Logger -var containerType string +var containerType string -func GetInstance(flbLogger *log.Logger, containerType string) *Extension { - once.Do(func() { - singleton = &Extension{make(map[string]string)} +func GetInstance(flbLogger *log.Logger, containertype string) *Extension { + once.Do(func() { + singleton = &Extension{make(map[string]string)} flbLogger.Println("Extension Instance created") - }) + }) logger = flbLogger - containerType = containerType - return singleton + containerType = containertype + return singleton } func (e *Extension) GetOutputStreamId(datatype string) string { extensionconfiglock.Lock() - defer extensionconfiglock.Unlock() + defer extensionconfiglock.Unlock() if len(e.datatypeStreamIdMap) > 0 && e.datatypeStreamIdMap[datatype] != "" { message := fmt.Sprintf("OutputstreamId: %s for the datatype: %s", e.datatypeStreamIdMap[datatype], datatype) logger.Printf(message) return e.datatypeStreamIdMap[datatype] } - var err error + var err error e.datatypeStreamIdMap, err = getDataTypeToStreamIdMapping() if err != nil { message := fmt.Sprintf("Error getting datatype to streamid mapping: %s", err.Error()) @@ -54,29 +55,30 @@ func getDataTypeToStreamIdMapping() (map[string]string, error) { taggedData := map[string]interface{}{"Request": "AgentTaggedData", "RequestId": guid.String(), "Tag": "ContainerInsights", "Version": "1"} jsonBytes, err := json.Marshal(taggedData) + // TODO: this error is unhandled var data []byte - enc := codec.NewEncoderBytes(&data, new(codec.MsgpackHandle)) + enc := codec.NewEncoderBytes(&data, new(codec.MsgpackHandle)) if err := enc.Encode(string(jsonBytes)); err != nil { return datatypeOutputStreamMap, err } - - fs := &FluentSocketWriter{ } + + fs := &FluentSocket{} fs.sockAddress = "/var/run/mdsd/default_fluent.socket" if containerType != "" && strings.Compare(strings.ToLower(containerType), "prometheussidecar") == 0 { fs.sockAddress = fmt.Sprintf("/var/run/mdsd-%s/default_fluent.socket", containerType) - } - responseBytes, err := fs.WriteAndRead(data) - defer fs.disConnect() + } + responseBytes, err := FluentSocketWriter.writeAndRead(fs, data) + defer FluentSocketWriter.disconnect(fs) logger.Printf("Info::mdsd::Making call to FluentSocket: %s to write and read the config data", fs.sockAddress) if err != nil { return datatypeOutputStreamMap, err } - response := string(responseBytes) + response := string(responseBytes) // TODO: why is this converted to a string then back into a []byte? 
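+	// Answering the TODO above: json.Unmarshal accepts a []byte, so the string round trip is just an extra copy; responseBytes could be passed to json.Unmarshal directly.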
var responseObjet AgentTaggedDataResponse err = json.Unmarshal([]byte(response), &responseObjet) - if err != nil { + if err != nil { logger.Printf("Error::mdsd::Failed to unmarshal config data. Error message: %s", string(err.Error())) return datatypeOutputStreamMap, err } @@ -84,16 +86,16 @@ func getDataTypeToStreamIdMapping() (map[string]string, error) { var extensionData TaggedData json.Unmarshal([]byte(responseObjet.TaggedData), &extensionData) - extensionConfigs := extensionData.ExtensionConfigs - logger.Printf("Info::mdsd::build the datatype and streamid map -- start") + extensionConfigs := extensionData.ExtensionConfigs + logger.Printf("Info::mdsd::build the datatype and streamid map -- start") for _, extensionConfig := range extensionConfigs { outputStreams := extensionConfig.OutputStreams for dataType, outputStreamID := range outputStreams { logger.Printf("Info::mdsd::datatype: %s, outputstreamId: %s", dataType, outputStreamID) datatypeOutputStreamMap[dataType] = outputStreamID.(string) - } + } } - logger.Printf("Info::mdsd::build the datatype and streamid map -- end") + logger.Printf("Info::mdsd::build the datatype and streamid map -- end") logger.Printf("extensionconfig::getDataTypeToStreamIdMapping:: getting extension config from fluent socket-end") diff --git a/source/plugins/go/src/extension/extension_test.go b/source/plugins/go/src/extension/extension_test.go new file mode 100644 index 000000000..c3b5ef472 --- /dev/null +++ b/source/plugins/go/src/extension/extension_test.go @@ -0,0 +1,74 @@ +package extension + +import ( + "fmt" + "log" + "os" + reflect "reflect" + "testing" + + "github.com/golang/mock/gomock" +) + +type FluentSocketWriterMock struct{} + +func Test_getDataTypeToStreamIdMapping(t *testing.T) { + + type test_struct struct { + testName string + mdsdResponse string + fluentSocket FluentSocket + output map[string]string + err error + } + + // This is a pretty useless unit test, but it demonstrates the concept (putting together a real test + // would require some large json structs). If getDataTypeToStreamIdMapping() is ever updated, that + // would be a good opportunity to add some real test cases. + tests := []test_struct{ + { + "basic test", + "{}", + FluentSocket{}, + map[string]string{}, + nil, + }, + } + + for _, tt := range tests { + t.Run(tt.testName, func(t *testing.T) { + mockCtrl := gomock.NewController(t) + defer mockCtrl.Finish() + mock := NewMockIFluentSocketWriter(mockCtrl) + sock := &FluentSocket{} + sock.sockAddress = "/var/run/mdsd/default_fluent.socket" + mock.EXPECT().writeAndRead(sock, gomock.Any()).Return([]byte(tt.mdsdResponse), nil).Times(1) + mock.EXPECT().disconnect(sock).Return(nil).Times(1) + + // This is where calls to the normal socket writer are redirected to the mock. + ActualFluentSocketWriter := FluentSocketWriter // save the old struct so that we can put it back later + FluentSocketWriter = mock + + logfile, err := os.Create("logFile.txt") + if err != nil { + fmt.Println(err.Error()) + } + + // use an actual logger here. Using a real logger then cleaning up the log file later is easier than mocking the logger.
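+			// GetInstance stores this logger and container type in the package-level variables that getDataTypeToStreamIdMapping reads.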
+ GetInstance(log.New(logfile, "", 0), "ContainerType") + defer os.Remove("logFile.txt") + + got, reterr := getDataTypeToStreamIdMapping() + if reterr != nil { + t.Errorf("got error") + t.Errorf(err.Error()) + } + if !reflect.DeepEqual(got, tt.output) { + t.Errorf("getDataTypeToStreamIdMapping() = %v, want %v", got, tt.output) + } + + // stop redirecting method calls to the mock + FluentSocketWriter = ActualFluentSocketWriter + }) + } +} diff --git a/source/plugins/go/src/extension/socket_writer.go b/source/plugins/go/src/extension/socket_writer.go index 1b16b319c..bfd35f5e6 100644 --- a/source/plugins/go/src/extension/socket_writer.go +++ b/source/plugins/go/src/extension/socket_writer.go @@ -4,20 +4,45 @@ import ( "net" ) +//go:generate mockgen -destination=socket_writer_mock.go -package=extension Docker-Provider/source/plugins/go/src/extension IFluentSocketWriter + //MaxRetries for trying to write data to the socket const MaxRetries = 5 //ReadBufferSize for reading data from sockets //Current CI extension config size is ~5KB and going with 20KB to handle any future scenarios -const ReadBufferSize = 20480 +const ReadBufferSize = 20480 //FluentSocketWriter writes data to AMA's default fluent socket -type FluentSocketWriter struct { - socket net.Conn - sockAddress string +type FluentSocket struct { + socket net.Conn + sockAddress string +} + +// begin mocking boilerplate +type IFluentSocketWriter interface { + connect(fluentSocket *FluentSocket) error + disconnect(fluentSocket *FluentSocket) error + writeWithRetries(fluentSocket *FluentSocket, data []byte) (int, error) + read(fluentSocket *FluentSocket) ([]byte, error) + write(fluentSocket *FluentSocket, payload []byte) (int, error) + writeAndRead(fluentSocket *FluentSocket, payload []byte) ([]byte, error) +} + +type FluentSocketWriterImpl struct{} + +// Methods in this file can be mocked by replacing FluentSocketWriter with a different struct.
The methods +// in this file are all tied to the FluentSocketWriterImpl struct, but other structs could implement +// IFluentSocketWriter and be used instead +var FluentSocketWriter IFluentSocketWriter + +func init() { + FluentSocketWriter = FluentSocketWriterImpl{} } -func (fs *FluentSocketWriter) connect() error { +// end mocking boilerplate + +func (FluentSocketWriterImpl) connect(fs *FluentSocket) error { c, err := net.Dial("unix", fs.sockAddress) if err != nil { return err @@ -26,15 +51,15 @@ func (fs *FluentSocketWriter) connect() error { return nil } -func (fs *FluentSocketWriter) disConnect() error { - if (fs.socket != nil) { - fs.socket.Close() +func (FluentSocketWriterImpl) disconnect(fs *FluentSocket) error { + if fs.socket != nil { + fs.socket.Close() fs.socket = nil } return nil } -func (fs *FluentSocketWriter) writeWithRetries(data []byte) (int, error) { +func (FluentSocketWriterImpl) writeWithRetries(fs *FluentSocket, data []byte) (int, error) { var ( err error n int @@ -54,7 +79,7 @@ func (fs *FluentSocketWriter) writeWithRetries(data []byte) (int, error) { return 0, err } -func (fs *FluentSocketWriter) read() ([]byte, error) { +func (FluentSocketWriterImpl) read(fs *FluentSocket) ([]byte, error) { buf := make([]byte, ReadBufferSize) n, err := fs.socket.Read(buf) if err != nil { @@ -64,22 +89,22 @@ func (fs *FluentSocketWriter) read() ([]byte, error) { } -func (fs *FluentSocketWriter) Write(payload []byte) (int, error) { +func (FluentSocketWriterImpl) write(fs *FluentSocket, payload []byte) (int, error) { if fs.socket == nil { // previous write failed with permanent error and socket was closed. - if err := fs.connect(); err != nil { + if err := FluentSocketWriter.connect(fs); err != nil { return 0, err } } - return fs.writeWithRetries(payload) + return FluentSocketWriter.writeWithRetries(fs, payload) } -//WriteAndRead writes data to the socket and sends the response back -func (fs *FluentSocketWriter) WriteAndRead(payload []byte) ([]byte, error) { - _, err := fs.Write(payload) +//writeAndRead writes data to the socket and sends the response back +func (FluentSocketWriterImpl) writeAndRead(fs *FluentSocket, payload []byte) ([]byte, error) { + _, err := FluentSocketWriter.write(fs, payload) if err != nil { return nil, err } - return fs.read() + return FluentSocketWriter.read(fs) } diff --git a/source/plugins/go/src/go.mod b/source/plugins/go/src/go.mod index 4ead145ac..58e668597 100644 --- a/source/plugins/go/src/go.mod +++ b/source/plugins/go/src/go.mod @@ -8,6 +8,7 @@ require ( github.com/Azure/go-autorest/autorest/to v0.4.0 // indirect github.com/dnaeon/go-vcr v1.2.0 // indirect github.com/fluent/fluent-bit-go v0.0.0-20171103221316-c4a158a6e3a7 + github.com/golang/mock v1.4.1 github.com/google/uuid v1.1.2 github.com/microsoft/ApplicationInsights-Go v0.4.3 github.com/philhofer/fwd v1.1.1 // indirect diff --git a/source/plugins/go/src/go.sum b/source/plugins/go/src/go.sum index 7f93bb260..ad9e40089 100644 --- a/source/plugins/go/src/go.sum +++ b/source/plugins/go/src/go.sum @@ -130,6 +130,7 @@ github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfb github.com/golang/mock v1.2.0/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= github.com/golang/mock v1.3.1/go.mod h1:sBzyDLLjw3U8JLTeZvSv8jJB+tU5PVekmnlKIyFUx0Y= github.com/golang/mock v1.4.0/go.mod h1:UOMv5ysSaYNkG+OFQykRIcU/QvvxJf3p21QfJ2Bt3cw= +github.com/golang/mock v1.4.1 h1:ocYkMQY5RrXTYgXl7ICpV0IXwlEQGwKIsery4gyXa1U= github.com/golang/mock v1.4.1/go.mod 
h1:UOMv5ysSaYNkG+OFQykRIcU/QvvxJf3p21QfJ2Bt3cw= github.com/golang/protobuf v1.2.0 h1:P3YflyNX/ehuJFLhxviNdFxQPkGK5cDcApsge1SqnvM= github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= diff --git a/source/plugins/go/src/utils.go b/source/plugins/go/src/utils.go index 02d30607e..6b3036f85 100644 --- a/source/plugins/go/src/utils.go +++ b/source/plugins/go/src/utils.go @@ -12,8 +12,8 @@ import ( "net/url" "os" "strings" - "time" - + "time" + "github.com/Azure/azure-kusto-go/kusto" "github.com/Azure/azure-kusto-go/kusto/ingest" "github.com/Azure/go-autorest/autorest/azure/auth" @@ -87,7 +87,7 @@ func CreateHTTPClient() { } tlsConfig.BuildNameToCertificate() - transport = &http.Transport{TLSClientConfig: tlsConfig} + transport = &http.Transport{TLSClientConfig: tlsConfig} } // set the proxy if the proxy configured if ProxyEndpoint != "" { @@ -105,7 +105,7 @@ func CreateHTTPClient() { HTTPClient = http.Client{ Transport: transport, Timeout: 30 * time.Second, - } + } Log("Successfully created HTTP Client") } @@ -123,57 +123,57 @@ func ToString(s interface{}) string { //mdsdSocketClient to write msgp messages func CreateMDSDClient(dataType DataType, containerType string) { - mdsdfluentSocket := "/var/run/mdsd/default_fluent.socket" + mdsdfluentSocket := "/var/run/mdsd/default_fluent.socket" if containerType != "" && strings.Compare(strings.ToLower(containerType), "prometheussidecar") == 0 { - mdsdfluentSocket = fmt.Sprintf("/var/run/mdsd-%s/default_fluent.socket", containerType) - } + mdsdfluentSocket = fmt.Sprintf("/var/run/mdsd-%s/default_fluent.socket", containerType) + } switch dataType { - case ContainerLogV2: - if MdsdMsgpUnixSocketClient != nil { - MdsdMsgpUnixSocketClient.Close() - MdsdMsgpUnixSocketClient = nil - } - /*conn, err := fluent.New(fluent.Config{FluentNetwork:"unix", - FluentSocketPath:"/var/run/mdsd/default_fluent.socket", - WriteTimeout: 5 * time.Second, - RequestAck: true}) */ - conn, err := net.DialTimeout("unix", - mdsdfluentSocket, 10*time.Second) - if err != nil { - Log("Error::mdsd::Unable to open MDSD msgp socket connection for ContainerLogV2 %s", err.Error()) - //log.Fatalf("Unable to open MDSD msgp socket connection %s", err.Error()) - } else { - Log("Successfully created MDSD msgp socket connection for ContainerLogV2: %s", mdsdfluentSocket) - MdsdMsgpUnixSocketClient = conn - } - case KubeMonAgentEvents: - if MdsdKubeMonMsgpUnixSocketClient != nil { - MdsdKubeMonMsgpUnixSocketClient.Close() - MdsdKubeMonMsgpUnixSocketClient = nil - } - conn, err := net.DialTimeout("unix", - mdsdfluentSocket, 10*time.Second) - if err != nil { - Log("Error::mdsd::Unable to open MDSD msgp socket connection for KubeMon events %s", err.Error()) - //log.Fatalf("Unable to open MDSD msgp socket connection %s", err.Error()) - } else { - Log("Successfully created MDSD msgp socket connection for KubeMon events:%s", mdsdfluentSocket) - MdsdKubeMonMsgpUnixSocketClient = conn - } - case InsightsMetrics: - if MdsdInsightsMetricsMsgpUnixSocketClient != nil { - MdsdInsightsMetricsMsgpUnixSocketClient.Close() - MdsdInsightsMetricsMsgpUnixSocketClient = nil - } - conn, err := net.DialTimeout("unix", - mdsdfluentSocket, 10*time.Second) - if err != nil { - Log("Error::mdsd::Unable to open MDSD msgp socket connection for insights metrics %s", err.Error()) - //log.Fatalf("Unable to open MDSD msgp socket connection %s", err.Error()) - } else { - Log("Successfully created MDSD msgp socket connection for Insights metrics %s", mdsdfluentSocket) - 
MdsdInsightsMetricsMsgpUnixSocketClient = conn - } + case ContainerLogV2: + if MdsdMsgpUnixSocketClient != nil { + MdsdMsgpUnixSocketClient.Close() + MdsdMsgpUnixSocketClient = nil + } + /*conn, err := fluent.New(fluent.Config{FluentNetwork:"unix", + FluentSocketPath:"/var/run/mdsd/default_fluent.socket", + WriteTimeout: 5 * time.Second, + RequestAck: true}) */ + conn, err := net.DialTimeout("unix", + mdsdfluentSocket, 10*time.Second) + if err != nil { + Log("Error::mdsd::Unable to open MDSD msgp socket connection for ContainerLogV2 %s", err.Error()) + //log.Fatalf("Unable to open MDSD msgp socket connection %s", err.Error()) + } else { + Log("Successfully created MDSD msgp socket connection for ContainerLogV2: %s", mdsdfluentSocket) + MdsdMsgpUnixSocketClient = conn + } + case KubeMonAgentEvents: + if MdsdKubeMonMsgpUnixSocketClient != nil { + MdsdKubeMonMsgpUnixSocketClient.Close() + MdsdKubeMonMsgpUnixSocketClient = nil + } + conn, err := net.DialTimeout("unix", + mdsdfluentSocket, 10*time.Second) + if err != nil { + Log("Error::mdsd::Unable to open MDSD msgp socket connection for KubeMon events %s", err.Error()) + //log.Fatalf("Unable to open MDSD msgp socket connection %s", err.Error()) + } else { + Log("Successfully created MDSD msgp socket connection for KubeMon events:%s", mdsdfluentSocket) + MdsdKubeMonMsgpUnixSocketClient = conn + } + case InsightsMetrics: + if MdsdInsightsMetricsMsgpUnixSocketClient != nil { + MdsdInsightsMetricsMsgpUnixSocketClient.Close() + MdsdInsightsMetricsMsgpUnixSocketClient = nil + } + conn, err := net.DialTimeout("unix", + mdsdfluentSocket, 10*time.Second) + if err != nil { + Log("Error::mdsd::Unable to open MDSD msgp socket connection for insights metrics %s", err.Error()) + //log.Fatalf("Unable to open MDSD msgp socket connection %s", err.Error()) + } else { + Log("Successfully created MDSD msgp socket connection for Insights metrics %s", mdsdfluentSocket) + MdsdInsightsMetricsMsgpUnixSocketClient = conn + } } } @@ -202,11 +202,15 @@ func CreateADXClient() { } func ReadFileContents(fullPathToFileName string) (string, error) { + return ReadFileContentsImpl(fullPathToFileName, ioutil.ReadFile) +} + +func ReadFileContentsImpl(fullPathToFileName string, readfilefunc func(string) ([]byte, error)) (string, error) { fullPathToFileName = strings.TrimSpace(fullPathToFileName) if len(fullPathToFileName) == 0 { return "", errors.New("ReadFileContents::filename is empty") } - content, err := ioutil.ReadFile(fullPathToFileName) //no need to close + content, err := readfilefunc(fullPathToFileName) //no need to close if err != nil { return "", errors.New("ReadFileContents::Unable to open file " + fullPathToFileName) } else { @@ -228,7 +232,7 @@ func isValidUrl(uri string) bool { func convertMsgPackEntriesToMsgpBytes(fluentForwardTag string, msgPackEntries []MsgPackEntry) []byte { var msgpBytes []byte - + fluentForward := MsgPackForward{ Tag: fluentForwardTag, Entries: msgPackEntries, @@ -239,7 +243,7 @@ func convertMsgPackEntriesToMsgpBytes(fluentForwardTag string, msgPackEntries [] msgpSize += 1 + msgp.Int64Size + msgp.GuessSize(fluentForward.Entries[i].Record) } - //allocate buffer for msgp message + //allocate buffer for msgp message msgpBytes = msgp.Require(nil, msgpSize) //construct the stream @@ -252,6 +256,6 @@ func convertMsgPackEntriesToMsgpBytes(fluentForwardTag string, msgPackEntries [] msgpBytes = msgp.AppendInt64(msgpBytes, batchTime) msgpBytes = msgp.AppendMapStrStr(msgpBytes, fluentForward.Entries[entry].Record) } - - return msgpBytes + + return 
msgpBytes } diff --git a/source/plugins/go/src/utils_test.go b/source/plugins/go/src/utils_test.go new file mode 100644 index 000000000..ab61ce751 --- /dev/null +++ b/source/plugins/go/src/utils_test.go @@ -0,0 +1,79 @@ +package main + +import ( + "errors" + "testing" +) + +func Test_isValidUrl(t *testing.T) { + type test_struct struct { + isValid bool + url string + } + + tests := []test_struct{ + {true, "https://www.microsoft.com"}, + {true, "http://abc.xyz"}, + {true, "https://www.microsoft.com/tests"}, + {false, "()"}, + {false, "https//www.microsoft.com"}, + {false, "https:/www.microsoft.com"}, + {false, "https:/www.microsoft.com*"}, + {false, ""}, + } + + for _, tt := range tests { + t.Run(tt.url, func(t *testing.T) { + got := isValidUrl(tt.url) + if got != tt.isValid { + t.Errorf("isValidUrl(%s) = %t, want %t", tt.url, got, tt.isValid) + } + }) + } +} + +func Test_ReadFileContents(t *testing.T) { + type mock_struct struct { + expectedFilePath string + fileContents []byte + err error + } + type test_struct struct { + testname string + calledFilePath string + subcall_spec mock_struct + output string + err bool + } + + tests := []test_struct{ + {"normal", "foobar.txt", mock_struct{"foobar.txt", []byte("asdf"), nil}, "asdf", false}, + {"extra whitespace", "foobar.txt ", mock_struct{"foobar.txt", []byte("asdf \t"), nil}, "asdf", false}, + {"empty filename", "", mock_struct{"", []byte(""), nil}, "", true}, + {"file doesn't exist", "asdf.txt", mock_struct{"asdf", []byte(""), errors.New("this error doesn't matter much")}, "", true}, + } + + for _, tt := range tests { + t.Run(string(tt.testname), func(t *testing.T) { + + readfileFunc := func(filename string) ([]byte, error) { + if filename == tt.subcall_spec.expectedFilePath { + return tt.subcall_spec.fileContents, nil + } + return []byte(""), errors.New("file not found") + } + + got, err := ReadFileContentsImpl(tt.calledFilePath, readfileFunc) + + if got != tt.output || !(tt.err == (err != nil)) { + t.Errorf("ReadFileContents(%v) = (%v, %v), want (%v, %v)", tt.calledFilePath, got, err, tt.output, tt.err) + if got != tt.output { + t.Errorf("output strings are not equal") + } + if tt.err == (err != nil) { + t.Errorf("errors are not equal") + } + } + }) + } +} diff --git a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb index da6e94f5f..017bfb08d 100644 --- a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb +++ b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb @@ -40,9 +40,9 @@ class CAdvisorMetricsAPIClient @os_type = ENV["OS_TYPE"] if !@os_type.nil? && !@os_type.empty? && @os_type.strip.casecmp("windows") == 0 - @LogPath = "/etc/omsagentwindows/kubernetes_perf_log.txt" + @LogPath = Constants::WINDOWS_LOG_PATH + "kubernetes_perf_log.txt" else - @LogPath = "/var/opt/microsoft/docker-cimprov/log/kubernetes_perf_log.txt" + @LogPath = Constants::LINUX_LOG_PATH + "kubernetes_perf_log.txt" end @Log = Logger.new(@LogPath, 2, 10 * 1048576) #keep last 2 files, max log file size = 10M # @@rxBytesLast = nil diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index 4b50e20d8..8925248d7 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -25,11 +25,12 @@ class KubernetesApiClient #@@IsValidRunningNode = nil #@@IsLinuxCluster = nil @@KubeSystemNamespace = "kube-system" + @os_type = ENV["OS_TYPE"] if !@os_type.nil? && !@os_type.empty? 
&& @os_type.strip.casecmp("windows") == 0 - @LogPath = "/etc/omsagentwindows/kubernetes_client_log.txt" + @LogPath = Constants::WINDOWS_LOG_PATH + "kubernetes_client_log.txt" else - @LogPath = "/var/opt/microsoft/docker-cimprov/log/kubernetes_client_log.txt" + @LogPath = Constants::LINUX_LOG_PATH + "kubernetes_client_log.txt" end @Log = Logger.new(@LogPath, 2, 10 * 1048576) #keep last 2 files, max log file size = 10M @@TokenFileName = "/var/run/secrets/kubernetes.io/serviceaccount/token" @@ -87,42 +88,42 @@ def getTokenStr end end - def getClusterRegion - if ENV["AKS_REGION"] - return ENV["AKS_REGION"] + def getClusterRegion(env=ENV) + if env["AKS_REGION"] + return env["AKS_REGION"] else @Log.warn ("Kubernetes environment variable not set AKS_REGION. Unable to get cluster region.") return nil end end - def getResourceUri(resource, api_group) + def getResourceUri(resource, api_group, env=ENV) begin - if ENV["KUBERNETES_SERVICE_HOST"] && ENV["KUBERNETES_PORT_443_TCP_PORT"] + if env["KUBERNETES_SERVICE_HOST"] && env["KUBERNETES_PORT_443_TCP_PORT"] if api_group.nil? - return "https://#{ENV["KUBERNETES_SERVICE_HOST"]}:#{ENV["KUBERNETES_PORT_443_TCP_PORT"]}/api/" + @@ApiVersion + "/" + resource + return "https://#{env["KUBERNETES_SERVICE_HOST"]}:#{env["KUBERNETES_PORT_443_TCP_PORT"]}/api/" + @@ApiVersion + "/" + resource elsif api_group == @@ApiGroupApps - return "https://#{ENV["KUBERNETES_SERVICE_HOST"]}:#{ENV["KUBERNETES_PORT_443_TCP_PORT"]}/apis/apps/" + @@ApiVersionApps + "/" + resource + return "https://#{env["KUBERNETES_SERVICE_HOST"]}:#{env["KUBERNETES_PORT_443_TCP_PORT"]}/apis/apps/" + @@ApiVersionApps + "/" + resource elsif api_group == @@ApiGroupHPA - return "https://#{ENV["KUBERNETES_SERVICE_HOST"]}:#{ENV["KUBERNETES_PORT_443_TCP_PORT"]}/apis/" + @@ApiGroupHPA + "/" + @@ApiVersionHPA + "/" + resource + return "https://#{env["KUBERNETES_SERVICE_HOST"]}:#{env["KUBERNETES_PORT_443_TCP_PORT"]}/apis/" + @@ApiGroupHPA + "/" + @@ApiVersionHPA + "/" + resource end else - @Log.warn ("Kubernetes environment variable not set KUBERNETES_SERVICE_HOST: #{ENV["KUBERNETES_SERVICE_HOST"]} KUBERNETES_PORT_443_TCP_PORT: #{ENV["KUBERNETES_PORT_443_TCP_PORT"]}. Unable to form resourceUri") + @Log.warn ("Kubernetes environment variable not set KUBERNETES_SERVICE_HOST: #{env["KUBERNETES_SERVICE_HOST"]} KUBERNETES_PORT_443_TCP_PORT: #{env["KUBERNETES_PORT_443_TCP_PORT"]}. Unable to form resourceUri") return nil end end end - def getClusterName + def getClusterName(env=ENV) return @@ClusterName if !@@ClusterName.nil? @@ClusterName = "None" begin #try getting resource ID for aks - cluster = ENV["AKS_RESOURCE_ID"] + cluster = env["AKS_RESOURCE_ID"] if cluster && !cluster.nil? && !cluster.empty? @@ClusterName = cluster.split("/").last else - cluster = ENV["ACS_RESOURCE_NAME"] + cluster = env["ACS_RESOURCE_NAME"] if cluster && !cluster.nil? && !cluster.empty? @@ClusterName = cluster else @@ -147,7 +148,7 @@ def getClusterName return @@ClusterName end - def getClusterId + def getClusterId(env=ENV) return @@ClusterId if !@@ClusterId.nil? #By default initialize ClusterId to ClusterName. # In ACS/On-prem, we need to figure out how we can generate ClusterId @@ -155,7 +156,7 @@ def getClusterId # e.g. md5 digest is 128 bits = 32 character in hex. Get first 16 and get a guid, and the next 16 to get resource id @@ClusterId = getClusterName begin - cluster = ENV["AKS_RESOURCE_ID"] + cluster = env["AKS_RESOURCE_ID"] if cluster && !cluster.nil? && !cluster.empty? 
@@ClusterId = cluster end @@ -777,13 +778,13 @@ def getResourcesAndContinuationToken(uri, api_group: nil) return continuationToken, resourceInventory end #getResourcesAndContinuationToken - def getKubeAPIServerUrl + def getKubeAPIServerUrl(env=ENV) apiServerUrl = nil begin - if ENV["KUBERNETES_SERVICE_HOST"] && ENV["KUBERNETES_PORT_443_TCP_PORT"] - apiServerUrl = "https://#{ENV["KUBERNETES_SERVICE_HOST"]}:#{ENV["KUBERNETES_PORT_443_TCP_PORT"]}" + if env["KUBERNETES_SERVICE_HOST"] && env["KUBERNETES_PORT_443_TCP_PORT"] + apiServerUrl = "https://#{env["KUBERNETES_SERVICE_HOST"]}:#{env["KUBERNETES_PORT_443_TCP_PORT"]}" else - @Log.warn "Kubernetes environment variable not set KUBERNETES_SERVICE_HOST: #{ENV["KUBERNETES_SERVICE_HOST"]} KUBERNETES_PORT_443_TCP_PORT: #{ENV["KUBERNETES_PORT_443_TCP_PORT"]}. Unable to form resourceUri" + @Log.warn "Kubernetes environment variable not set KUBERNETES_SERVICE_HOST: #{env["KUBERNETES_SERVICE_HOST"]} KUBERNETES_PORT_443_TCP_PORT: #{env["KUBERNETES_PORT_443_TCP_PORT"]}. Unable to form resourceUri" end rescue => errorStr @Log.warn "KubernetesApiClient::getKubeAPIServerUrl:Failed #{errorStr}" diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index 40fa80c14..69da56488 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -129,5 +129,7 @@ class Constants ONEAGENT_FLUENT_SOCKET_NAME = "/var/run/mdsd/default_fluent.socket" #Tag prefix for output stream EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX = "dcr-" - + + LINUX_LOG_PATH = $in_unit_test.nil? ? "/var/opt/microsoft/docker-cimprov/log/" : "./" + WINDOWS_LOG_PATH = $in_unit_test.nil? ? "/etc/omsagentwindows/" : "./" end diff --git a/source/plugins/utils/extension.rb b/source/plugins/ruby/extension.rb similarity index 100% rename from source/plugins/utils/extension.rb rename to source/plugins/ruby/extension.rb diff --git a/source/plugins/utils/extension_utils.rb b/source/plugins/ruby/extension_utils.rb similarity index 100% rename from source/plugins/utils/extension_utils.rb rename to source/plugins/ruby/extension_utils.rb diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index bc62756a1..a32a32769 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -7,26 +7,13 @@ module Fluent::Plugin class Kube_nodeInventory_Input < Input Fluent::Plugin.register_input("kube_nodes", self) - @@configMapMountPath = "/etc/config/settings/log-data-collection-settings" - @@promConfigMountPath = "/etc/config/settings/prometheus-data-collection-settings" - @@osmConfigMountPath = "/etc/config/osm-settings/osm-metric-collection-configuration" - @@AzStackCloudFileName = "/etc/kubernetes/host/azurestackcloud.json" - - - @@rsPromInterval = ENV["TELEMETRY_RS_PROM_INTERVAL"] - @@rsPromFieldPassCount = ENV["TELEMETRY_RS_PROM_FIELDPASS_LENGTH"] - @@rsPromFieldDropCount = ENV["TELEMETRY_RS_PROM_FIELDDROP_LENGTH"] - @@rsPromK8sServiceCount = ENV["TELEMETRY_RS_PROM_K8S_SERVICES_LENGTH"] - @@rsPromUrlCount = ENV["TELEMETRY_RS_PROM_URLS_LENGTH"] - @@rsPromMonitorPods = ENV["TELEMETRY_RS_PROM_MONITOR_PODS"] - @@rsPromMonitorPodsNamespaceLength = ENV["TELEMETRY_RS_PROM_MONITOR_PODS_NS_LENGTH"] - @@rsPromMonitorPodsLabelSelectorLength = ENV["TELEMETRY_RS_PROM_LABEL_SELECTOR_LENGTH"] - @@rsPromMonitorPodsFieldSelectorLength = ENV["TELEMETRY_RS_PROM_FIELD_SELECTOR_LENGTH"] - @@collectAllKubeEvents = ENV["AZMON_CLUSTER_COLLECT_ALL_KUBE_EVENTS"] - @@osmNamespaceCount = 
ENV["TELEMETRY_OSM_CONFIGURATION_NAMESPACES_COUNT"] - - def initialize - super + def initialize (kubernetesApiClient=nil, + applicationInsightsUtility=nil, + extensionUtils=nil, + env=nil, + telemetry_flush_interval=nil) + super() + require "yaml" require "yajl/json_gem" require "yajl" @@ -38,6 +25,31 @@ def initialize require_relative "omslog" require_relative "extension_utils" + @kubernetesApiClient = kubernetesApiClient == nil ? KubernetesApiClient : kubernetesApiClient + @applicationInsightsUtility = applicationInsightsUtility == nil ? ApplicationInsightsUtility : applicationInsightsUtility + @extensionUtils = extensionUtils == nil ? ExtensionUtils : extensionUtils + @env = env == nil ? ENV : env + @TELEMETRY_FLUSH_INTERVAL_IN_MINUTES = telemetry_flush_interval == nil ? Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES : telemetry_flush_interval + + # these defines were previously at class scope Moving them into the constructor so that they can be set by unit tests + @@configMapMountPath = "/etc/config/settings/log-data-collection-settings" + @@promConfigMountPath = "/etc/config/settings/prometheus-data-collection-settings" + @@osmConfigMountPath = "/etc/config/osm-settings/osm-metric-collection-configuration" + @@AzStackCloudFileName = "/etc/kubernetes/host/azurestackcloud.json" + + + @@rsPromInterval = @env["TELEMETRY_RS_PROM_INTERVAL"] + @@rsPromFieldPassCount = @env["TELEMETRY_RS_PROM_FIELDPASS_LENGTH"] + @@rsPromFieldDropCount = @env["TELEMETRY_RS_PROM_FIELDDROP_LENGTH"] + @@rsPromK8sServiceCount = @env["TELEMETRY_RS_PROM_K8S_SERVICES_LENGTH"] + @@rsPromUrlCount = @env["TELEMETRY_RS_PROM_URLS_LENGTH"] + @@rsPromMonitorPods = @env["TELEMETRY_RS_PROM_MONITOR_PODS"] + @@rsPromMonitorPodsNamespaceLength = @env["TELEMETRY_RS_PROM_MONITOR_PODS_NS_LENGTH"] + @@rsPromMonitorPodsLabelSelectorLength = @env["TELEMETRY_RS_PROM_LABEL_SELECTOR_LENGTH"] + @@rsPromMonitorPodsFieldSelectorLength = @env["TELEMETRY_RS_PROM_FIELD_SELECTOR_LENGTH"] + @@collectAllKubeEvents = @env["AZMON_CLUSTER_COLLECT_ALL_KUBE_EVENTS"] + @@osmNamespaceCount = @env["TELEMETRY_OSM_CONFIGURATION_NAMESPACES_COUNT"] + @ContainerNodeInventoryTag = "oneagent.containerInsights.CONTAINER_NODE_INVENTORY_BLOB" @insightsMetricsTag = "oneagent.containerInsights.INSIGHTS_METRICS_BLOB" @MDMKubeNodeInventoryTag = "mdm.kubenodeinventory" @@ -64,8 +76,8 @@ def configure(conf) def start if @run_interval super - if !ENV["NODES_CHUNK_SIZE"].nil? && !ENV["NODES_CHUNK_SIZE"].empty? && ENV["NODES_CHUNK_SIZE"].to_i > 0 - @NODES_CHUNK_SIZE = ENV["NODES_CHUNK_SIZE"].to_i + if !@env["NODES_CHUNK_SIZE"].nil? && !@env["NODES_CHUNK_SIZE"].empty? && @env["NODES_CHUNK_SIZE"].to_i > 0 + @NODES_CHUNK_SIZE = @env["NODES_CHUNK_SIZE"].to_i else # this shouldnt happen just setting default here as safe guard $log.warn("in_kube_nodes::start: setting to default value since got NODES_CHUNK_SIZE nil or empty") @@ -73,8 +85,8 @@ def start end $log.info("in_kube_nodes::start : NODES_CHUNK_SIZE @ #{@NODES_CHUNK_SIZE}") - if !ENV["NODES_EMIT_STREAM_BATCH_SIZE"].nil? && !ENV["NODES_EMIT_STREAM_BATCH_SIZE"].empty? && ENV["NODES_EMIT_STREAM_BATCH_SIZE"].to_i > 0 - @NODES_EMIT_STREAM_BATCH_SIZE = ENV["NODES_EMIT_STREAM_BATCH_SIZE"].to_i + if !@env["NODES_EMIT_STREAM_BATCH_SIZE"].nil? && !@env["NODES_EMIT_STREAM_BATCH_SIZE"].empty? 
&& @env["NODES_EMIT_STREAM_BATCH_SIZE"].to_i > 0 + @NODES_EMIT_STREAM_BATCH_SIZE = @env["NODES_EMIT_STREAM_BATCH_SIZE"].to_i else # this shouldnt happen just setting default here as safe guard $log.warn("in_kube_nodes::start: setting to default value since got NODES_EMIT_STREAM_BATCH_SIZE nil or empty") @@ -112,19 +124,19 @@ def enumerate @nodeInventoryE2EProcessingLatencyMs = 0 nodeInventoryStartTime = (Time.now.to_f * 1000).to_i - if ExtensionUtils.isAADMSIAuthMode() + if @extensionUtils.isAADMSIAuthMode() $log.info("in_kube_nodes::enumerate: AAD AUTH MSI MODE") if @kubeperfTag.nil? || !@kubeperfTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) - @kubeperfTag = ExtensionUtils.getOutputStreamId(Constants::PERF_DATA_TYPE) + @kubeperfTag = @extensionUtils.getOutputStreamId(Constants::PERF_DATA_TYPE) end if @insightsMetricsTag.nil? || !@insightsMetricsTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) - @insightsMetricsTag = ExtensionUtils.getOutputStreamId(Constants::INSIGHTS_METRICS_DATA_TYPE) + @insightsMetricsTag = @extensionUtils.getOutputStreamId(Constants::INSIGHTS_METRICS_DATA_TYPE) end if @ContainerNodeInventoryTag.nil? || !@ContainerNodeInventoryTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) - @ContainerNodeInventoryTag = ExtensionUtils.getOutputStreamId(Constants::CONTAINER_NODE_INVENTORY_DATA_TYPE) + @ContainerNodeInventoryTag = @extensionUtils.getOutputStreamId(Constants::CONTAINER_NODE_INVENTORY_DATA_TYPE) end if @tag.nil? || !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) - @tag = ExtensionUtils.getOutputStreamId(Constants::KUBE_NODE_INVENTORY_DATA_TYPE) + @tag = @extensionUtils.getOutputStreamId(Constants::KUBE_NODE_INVENTORY_DATA_TYPE) end $log.info("in_kube_nodes::enumerate: using perf tag -#{@kubeperfTag} @ #{Time.now.utc.iso8601}") $log.info("in_kube_nodes::enumerate: using insightsmetrics tag -#{@insightsMetricsTag} @ #{Time.now.utc.iso8601}") @@ -136,8 +148,9 @@ def enumerate # Initializing continuation token to nil continuationToken = nil $log.info("in_kube_nodes::enumerate : Getting nodes from Kube API @ #{Time.now.utc.iso8601}") + # KubernetesApiClient.getNodesResourceUri is a pure function, so call it from the actual module instead of from the mock resourceUri = KubernetesApiClient.getNodesResourceUri("nodes?limit=#{@NODES_CHUNK_SIZE}") - continuationToken, nodeInventory = KubernetesApiClient.getResourcesAndContinuationToken(resourceUri) + continuationToken, nodeInventory = @kubernetesApiClient.getResourcesAndContinuationToken(resourceUri) $log.info("in_kube_nodes::enumerate : Done getting nodes from Kube API @ #{Time.now.utc.iso8601}") nodesAPIChunkEndTime = (Time.now.to_f * 1000).to_i @nodesAPIE2ELatencyMs = (nodesAPIChunkEndTime - nodesAPIChunkStartTime) @@ -151,7 +164,7 @@ def enumerate #If we receive a continuation token, make calls, process and flush data until we have processed all data while (!continuationToken.nil? && !continuationToken.empty?) nodesAPIChunkStartTime = (Time.now.to_f * 1000).to_i - continuationToken, nodeInventory = KubernetesApiClient.getResourcesAndContinuationToken(resourceUri + "&continue=#{continuationToken}") + continuationToken, nodeInventory = @kubernetesApiClient.getResourcesAndContinuationToken(resourceUri + "&continue=#{continuationToken}") nodesAPIChunkEndTime = (Time.now.to_f * 1000).to_i @nodesAPIE2ELatencyMs = @nodesAPIE2ELatencyMs + (nodesAPIChunkEndTime - nodesAPIChunkStartTime) if (!nodeInventory.nil? && !nodeInventory.empty? 
&& nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) @@ -165,9 +178,9 @@ def enumerate @nodeInventoryE2EProcessingLatencyMs = ((Time.now.to_f * 1000).to_i - nodeInventoryStartTime) timeDifference = (DateTime.now.to_time.to_i - @@nodeInventoryLatencyTelemetryTimeTracker).abs timeDifferenceInMinutes = timeDifference / 60 - if (timeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) - ApplicationInsightsUtility.sendMetricTelemetry("NodeInventoryE2EProcessingLatencyMs", @nodeInventoryE2EProcessingLatencyMs, {}) - ApplicationInsightsUtility.sendMetricTelemetry("NodesAPIE2ELatencyMs", @nodesAPIE2ELatencyMs, {}) + if (timeDifferenceInMinutes >= @TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) + @applicationInsightsUtility.sendMetricTelemetry("NodeInventoryE2EProcessingLatencyMs", @nodeInventoryE2EProcessingLatencyMs, {}) + @applicationInsightsUtility.sendMetricTelemetry("NodesAPIE2ELatencyMs", @nodesAPIE2ELatencyMs, {}) @@nodeInventoryLatencyTelemetryTimeTracker = DateTime.now.to_time.to_i end # Setting this to nil so that we dont hold memory until GC kicks in @@ -175,7 +188,7 @@ def enumerate rescue => errorStr $log.warn "in_kube_nodes::enumerate:Failed in enumerate: #{errorStr}" $log.debug_backtrace(errorStr.backtrace) - ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + @applicationInsightsUtility.sendExceptionTelemetry(errorStr) end end # end enumerate @@ -188,7 +201,7 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) containerNodeInventoryEventStream = Fluent::MultiEventStream.new insightsMetricsEventStream = Fluent::MultiEventStream.new kubePerfEventStream = Fluent::MultiEventStream.new - @@istestvar = ENV["ISTEST"] + @@istestvar = @env["ISTEST"] #get node inventory nodeInventory["items"].each do |item| # node inventory @@ -299,49 +312,79 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) # Adding telemetry to send node telemetry every 10 minutes timeDifference = (DateTime.now.to_time.to_i - @@nodeTelemetryTimeTracker).abs timeDifferenceInMinutes = timeDifference / 60 - if (timeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) - properties = getNodeTelemetryProps(item) - properties["KubernetesProviderID"] = nodeInventoryRecord["KubernetesProviderID"] - capacityInfo = item["status"]["capacity"] - - ApplicationInsightsUtility.sendMetricTelemetry("NodeMemory", capacityInfo["memory"], properties) + if (timeDifferenceInMinutes >= @TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) begin - if (!capacityInfo["nvidia.com/gpu"].nil?) && (!capacityInfo["nvidia.com/gpu"].empty?) - properties["nvigpus"] = capacityInfo["nvidia.com/gpu"] + properties = getNodeTelemetryProps(item) + properties["KubernetesProviderID"] = nodeInventoryRecord["KubernetesProviderID"] + capacityInfo = item["status"]["capacity"] + + ApplicationInsightsUtility.sendMetricTelemetry("NodeMemory", capacityInfo["memory"], properties) + begin + if (!capacityInfo["nvidia.com/gpu"].nil?) && (!capacityInfo["nvidia.com/gpu"].empty?) + properties["nvigpus"] = capacityInfo["nvidia.com/gpu"] + end + + if (!capacityInfo["amd.com/gpu"].nil?) && (!capacityInfo["amd.com/gpu"].empty?) + properties["amdgpus"] = capacityInfo["amd.com/gpu"] + end + rescue => errorStr + $log.warn "Failed in getting GPU telemetry in_kube_nodes : #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) end - if (!capacityInfo["amd.com/gpu"].nil?) && (!capacityInfo["amd.com/gpu"].empty?) 
- properties["amdgpus"] = capacityInfo["amd.com/gpu"] + # Telemetry for data collection config for replicaset + if (File.file?(@@configMapMountPath)) + properties["collectAllKubeEvents"] = @@collectAllKubeEvents end - rescue => errorStr - $log.warn "Failed in getting GPU telemetry in_kube_nodes : #{errorStr}" - $log.debug_backtrace(errorStr.backtrace) - ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) - end - # Telemetry for data collection config for replicaset - if (File.file?(@@configMapMountPath)) - properties["collectAllKubeEvents"] = @@collectAllKubeEvents - end + #telemetry about prometheus metric collections settings for replicaset + if (File.file?(@@promConfigMountPath)) + properties["rsPromInt"] = @@rsPromInterval + properties["rsPromFPC"] = @@rsPromFieldPassCount + properties["rsPromFDC"] = @@rsPromFieldDropCount + properties["rsPromServ"] = @@rsPromK8sServiceCount + properties["rsPromUrl"] = @@rsPromUrlCount + properties["rsPromMonPods"] = @@rsPromMonitorPods + properties["rsPromMonPodsNs"] = @@rsPromMonitorPodsNamespaceLength + properties["rsPromMonPodsLabelSelectorLength"] = @@rsPromMonitorPodsLabelSelectorLength + properties["rsPromMonPodsFieldSelectorLength"] = @@rsPromMonitorPodsFieldSelectorLength + end + # telemetry about osm metric settings for replicaset + if (File.file?(@@osmConfigMountPath)) + properties["osmNamespaceCount"] = @@osmNamespaceCount + end + ApplicationInsightsUtility.sendMetricTelemetry("NodeCoreCapacity", capacityInfo["cpu"], properties) + telemetrySent = true - #telemetry about prometheus metric collections settings for replicaset - if (File.file?(@@promConfigMountPath)) - properties["rsPromInt"] = @@rsPromInterval - properties["rsPromFPC"] = @@rsPromFieldPassCount - properties["rsPromFDC"] = @@rsPromFieldDropCount - properties["rsPromServ"] = @@rsPromK8sServiceCount - properties["rsPromUrl"] = @@rsPromUrlCount - properties["rsPromMonPods"] = @@rsPromMonitorPods - properties["rsPromMonPodsNs"] = @@rsPromMonitorPodsNamespaceLength - properties["rsPromMonPodsLabelSelectorLength"] = @@rsPromMonitorPodsLabelSelectorLength - properties["rsPromMonPodsFieldSelectorLength"] = @@rsPromMonitorPodsFieldSelectorLength - end - # telemetry about osm metric settings for replicaset - if (File.file?(@@osmConfigMountPath)) - properties["osmNamespaceCount"] = @@osmNamespaceCount + # Telemetry for data collection config for replicaset + if (File.file?(@@configMapMountPath)) + properties["collectAllKubeEvents"] = @@collectAllKubeEvents + end + + #telemetry about prometheus metric collections settings for replicaset + if (File.file?(@@promConfigMountPath)) + properties["rsPromInt"] = @@rsPromInterval + properties["rsPromFPC"] = @@rsPromFieldPassCount + properties["rsPromFDC"] = @@rsPromFieldDropCount + properties["rsPromServ"] = @@rsPromK8sServiceCount + properties["rsPromUrl"] = @@rsPromUrlCount + properties["rsPromMonPods"] = @@rsPromMonitorPods + properties["rsPromMonPodsNs"] = @@rsPromMonitorPodsNamespaceLength + properties["rsPromMonPodsLabelSelectorLength"] = @@rsPromMonitorPodsLabelSelectorLength + properties["rsPromMonPodsFieldSelectorLength"] = @@rsPromMonitorPodsFieldSelectorLength + end + # telemetry about osm metric settings for replicaset + if (File.file?(@@osmConfigMountPath)) + properties["osmNamespaceCount"] = @@osmNamespaceCount + end + @applicationInsightsUtility.sendMetricTelemetry("NodeCoreCapacity", capacityInfo["cpu"], properties) + telemetrySent = true + rescue => errorStr + $log.warn "Failed in getting telemetry in_kube_nodes : 
#{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + @applicationInsightsUtility.sendExceptionTelemetry(errorStr) end - ApplicationInsightsUtility.sendMetricTelemetry("NodeCoreCapacity", capacityInfo["cpu"], properties) - telemetrySent = true end end if telemetrySent == true @@ -385,7 +428,7 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) rescue => errorStr $log.warn "Failed to retrieve node inventory: #{errorStr}" $log.debug_backtrace(errorStr.backtrace) - ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + @applicationInsightsUtility.sendExceptionTelemetry(errorStr) end $log.info "in_kube_nodes::parse_and_emit_records:End #{Time.now.utc.iso8601}" end @@ -414,7 +457,7 @@ def run_periodic $log.info("in_kube_nodes::run_periodic.enumerate.end #{Time.now.utc.iso8601}") rescue => errorStr $log.warn "in_kube_nodes::run_periodic: enumerate Failed to retrieve node inventory: #{errorStr}" - ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + @applicationInsightsUtility.sendExceptionTelemetry(errorStr) end end @mutex.lock @@ -428,8 +471,8 @@ def getNodeInventoryRecord(item, batchTime = Time.utc.iso8601) begin record["CollectionTime"] = batchTime #This is the time that is mapped to become TimeGenerated record["Computer"] = item["metadata"]["name"] - record["ClusterName"] = KubernetesApiClient.getClusterName - record["ClusterId"] = KubernetesApiClient.getClusterId + record["ClusterName"] = @kubernetesApiClient.getClusterName + record["ClusterId"] = @kubernetesApiClient.getClusterId record["CreationTimeStamp"] = item["metadata"]["creationTimestamp"] record["Labels"] = [item["metadata"]["labels"]] record["Status"] = "" diff --git a/source/plugins/ruby/in_kube_nodes_test.rb b/source/plugins/ruby/in_kube_nodes_test.rb new file mode 100644 index 000000000..8f4984c6c --- /dev/null +++ b/source/plugins/ruby/in_kube_nodes_test.rb @@ -0,0 +1,171 @@ +require 'minitest/autorun' + +require 'fluent/test' +require 'fluent/test/driver/input' +require 'fluent/test/helpers' + +require_relative 'in_kube_nodes.rb' + +class InKubeNodesTests < Minitest::Test + include Fluent::Test::Helpers + + def setup + Fluent::Test.setup + end + + def create_driver(conf = {}, kubernetesApiClient=nil, applicationInsightsUtility=nil, extensionUtils=nil, env=nil, telemetry_flush_interval=nil) + Fluent::Test::Driver::Input.new(Fluent::Plugin::Kube_nodeInventory_Input.new(kubernetesApiClient=kubernetesApiClient, + applicationInsightsUtility=applicationInsightsUtility, + extensionUtils=extensionUtils, + env=env)).configure(conf) + end + + # Collection time of scrapped data will always be different. Overwrite it in any records returned by in_kube_ndes.rb + def overwrite_collection_time(data) + if data.key?("CollectionTime") + data["CollectionTime"] = "~CollectionTime~" + end + if data.key?("Timestamp") + data["Timestamp"] = "~Timestamp~" + end + return data + end + + def test_basic_single_node + kubeApiClient = Minitest::Mock.new + appInsightsUtil = Minitest::Mock.new + extensionUtils = Minitest::Mock.new + env = {} + env["NODES_CHUNK_SIZE"] = "200" + + kubeApiClient.expect(:==, false, [nil]) + appInsightsUtil.expect(:==, false, [nil]) + extensionUtils.expect(:==, false, [nil]) + + # isAADMSIAuthMode() is called multiple times and we don't really care how many time it is called. 
+
+    nodes_api_response = eval(File.open("test/unit-tests/canned-api-responses/kube-nodes.txt").read)
+    kubeApiClient.expect(:getResourcesAndContinuationToken, [nil, nodes_api_response], ["nodes?limit=200"])
+    kubeApiClient.expect(:getClusterName, "/cluster-name")
+    kubeApiClient.expect(:getClusterId, "/cluster-id")
+
+    config = "run_interval 999999999" # only run once
+
+    d = create_driver(config, kubernetesApiClient=kubeApiClient, applicationInsightsUtility=appInsightsUtil, extensionUtils=extensionUtils, env=env)
+    d.instance.start
+    d.instance.enumerate
+    d.run(timeout: 99999) # Input plugins decide when to run, so we have to give it enough time to run
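+    # Each event captured by the test driver is a [tag, time, record] triple, e.g.
+    #   tag, time, record = d.events.first
+    # CollectionTime/Timestamp are wall-clock values, so every record is normalized with
+    # overwrite_collection_time before being compared against the canned hashes below.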
"KubeProxyVersion"=>"v1.19.11"})] => true, + ["oneagent.containerInsights.CONTAINER_NODE_INVENTORY_BLOB", overwrite_collection_time({"CollectionTime"=>"2021-08-17T20:24:18Z", "Computer"=>"aks-nodepool1-24816391-vmss000000", "OperatingSystem"=>"Ubuntu 18.04.5 LTS", "DockerVersion"=>"containerd://1.4.4+azure"})] => true, + ["oneagent.containerInsights.LINUX_PERF_BLOB", overwrite_collection_time({"Timestamp"=>"2021-08-17T20:24:18Z", "Host"=>"aks-nodepool1-24816391-vmss000000", "Computer"=>"aks-nodepool1-24816391-vmss000000", "ObjectName"=>"K8SNode", "InstanceName"=>"None/aks-nodepool1-24816391-vmss000000", "json_Collections"=>"[{\"CounterName\":\"cpuAllocatableNanoCores\",\"Value\":1900000000.0}]"})] => true, + ["oneagent.containerInsights.LINUX_PERF_BLOB", overwrite_collection_time({"Timestamp"=>"2021-08-17T20:24:18Z", "Host"=>"aks-nodepool1-24816391-vmss000000", "Computer"=>"aks-nodepool1-24816391-vmss000000", "ObjectName"=>"K8SNode", "InstanceName"=>"None/aks-nodepool1-24816391-vmss000000", "json_Collections"=>"[{\"CounterName\":\"memoryAllocatableBytes\",\"Value\":4787511296.0}]"})] => true, + ["oneagent.containerInsights.LINUX_PERF_BLOB", overwrite_collection_time({"Timestamp"=>"2021-08-17T20:24:18Z", "Host"=>"aks-nodepool1-24816391-vmss000000", "Computer"=>"aks-nodepool1-24816391-vmss000000", "ObjectName"=>"K8SNode", "InstanceName"=>"None/aks-nodepool1-24816391-vmss000000", "json_Collections"=>"[{\"CounterName\":\"cpuCapacityNanoCores\",\"Value\":2000000000.0}]"})] => true, + ["oneagent.containerInsights.LINUX_PERF_BLOB", overwrite_collection_time({"Timestamp"=>"2021-08-17T20:24:18Z", "Host"=>"aks-nodepool1-24816391-vmss000000", "Computer"=>"aks-nodepool1-24816391-vmss000000", "ObjectName"=>"K8SNode", "InstanceName"=>"None/aks-nodepool1-24816391-vmss000000", "json_Collections"=>"[{\"CounterName\":\"memoryCapacityBytes\",\"Value\":7291510784.0}]"})] => true} + + d.events.each do |tag, time, record| + cleaned_record = overwrite_collection_time record + if expected_responses.key?([tag, cleaned_record]) + expected_responses[[tag, cleaned_record]] = true + else + assert(false, "got unexpected record") + end + end + + expected_responses.each do |key, val| + assert(val, "expected record not emitted: #{key}") + end + + # make sure all mocked methods were called the expected number of times + kubeApiClient.verify + appInsightsUtil.verify + extensionUtils.verify + end + + # Sometimes customer tooling creates invalid node specs in the Kube API server (its happened more than once). + # This test makes sure that it doesn't creash the entire input plugin and other nodes are still collected + def test_malformed_node_spec + kubeApiClient = Minitest::Mock.new + appInsightsUtil = Minitest::Mock.new + extensionUtils = Minitest::Mock.new + env = {} + env["NODES_CHUNK_SIZE"] = "200" + + kubeApiClient.expect(:==, false, [nil]) + appInsightsUtil.expect(:==, false, [nil]) + extensionUtils.expect(:==, false, [nil]) + + # isAADMSIAuthMode() is called multiple times and we don't really care how many time it is called. This is the same as mocking + # but it doesn't track how many times isAADMSIAuthMode is called + def extensionUtils.isAADMSIAuthMode + false + end + + # Set up the KubernetesApiClient Mock. Note: most of the functions in KubernetesApiClient are pure (access no + # state other than their arguments), so there is no need to mock them (this test file would be far longer and + # more brittle). Instead, in_kube_nodes bypasses the mock and directly calls these functions in KubernetesApiClient. 
+    nodes_api_response = eval(File.open("test/unit-tests/canned-api-responses/kube-nodes-malformed.txt").read)
+    kubeApiClient.expect(:getResourcesAndContinuationToken, [nil, nodes_api_response], ["nodes?limit=200"])
+    kubeApiClient.expect(:getClusterName, "/cluster-name")
+    kubeApiClient.expect(:getClusterName, "/cluster-name")
+    kubeApiClient.expect(:getClusterId, "/cluster-id")
+    kubeApiClient.expect(:getClusterId, "/cluster-id")
+
+    # The malformed node has no status.capacity, so the plugin raises
+    # "undefined method `[]' for nil:NilClass" when it indexes into the missing hash;
+    # this stub tolerates exactly that exception and fails the test on anything else.
+    def appInsightsUtil.sendExceptionTelemetry(exception)
+      if exception.to_s != "undefined method `[]' for nil:NilClass"
+        raise "an unexpected exception has occurred"
+      end
+    end
+
+    # This test doesn't care whether metric telemetry is sent properly. Asserting on exact
+    # metric values here would make the test needlessly rigid.
+    def appInsightsUtil.sendMetricTelemetry(a, b, c)
+    end
+
+    config = "run_interval 999999999" # only run once
+
+    d = create_driver(config, kubernetesApiClient=kubeApiClient, applicationInsightsUtility=appInsightsUtil, extensionUtils=extensionUtils, env=env, telemetry_flush_interval=0)
+    d.instance.start
+
+    d.instance.enumerate
+    d.run(timeout: 99999) #TODO: is this necessary?
+
+    expected_responses = {
+      ["oneagent.containerInsights.KUBE_NODE_INVENTORY_BLOB", {"CollectionTime"=>"~CollectionTime~", "Computer"=>"correct-node", "ClusterName"=>"/cluster-name", "ClusterId"=>"/cluster-id", "CreationTimeStamp"=>"2021-07-21T23:40:14Z", "Labels"=>[{"agentpool"=>"nodepool1", "beta.kubernetes.io/arch"=>"amd64", "beta.kubernetes.io/instance-type"=>"Standard_DS2_v2", "beta.kubernetes.io/os"=>"linux", "failure-domain.beta.kubernetes.io/region"=>"westus2", "failure-domain.beta.kubernetes.io/zone"=>"0", "kubernetes.azure.com/cluster"=>"MC_davidaks16_davidaks16_westus2", "kubernetes.azure.com/mode"=>"system", "kubernetes.azure.com/node-image-version"=>"AKSUbuntu-1804gen2containerd-2021.07.03", "kubernetes.azure.com/os-sku"=>"Ubuntu", "kubernetes.azure.com/role"=>"agent", "kubernetes.io/arch"=>"amd64", "kubernetes.io/hostname"=>"correct-node", "kubernetes.io/os"=>"linux", "kubernetes.io/role"=>"agent", "node-role.kubernetes.io/agent"=>"", "node.kubernetes.io/instance-type"=>"Standard_DS2_v2", "storageprofile"=>"managed", "storagetier"=>"Premium_LRS", "topology.kubernetes.io/region"=>"westus2", "topology.kubernetes.io/zone"=>"0"}], "Status"=>"Ready", "KubernetesProviderID"=>"azure", "LastTransitionTimeReady"=>"2021-07-21T23:40:24Z", "KubeletVersion"=>"v1.19.11", "KubeProxyVersion"=>"v1.19.11"}] => false,
+      ["mdm.kubenodeinventory", {"CollectionTime"=>"~CollectionTime~", "Computer"=>"correct-node", "ClusterName"=>"/cluster-name", "ClusterId"=>"/cluster-id", "CreationTimeStamp"=>"2021-07-21T23:40:14Z", "Labels"=>[{"agentpool"=>"nodepool1", "beta.kubernetes.io/arch"=>"amd64", "beta.kubernetes.io/instance-type"=>"Standard_DS2_v2", "beta.kubernetes.io/os"=>"linux", "failure-domain.beta.kubernetes.io/region"=>"westus2", "failure-domain.beta.kubernetes.io/zone"=>"0", "kubernetes.azure.com/cluster"=>"MC_davidaks16_davidaks16_westus2", "kubernetes.azure.com/mode"=>"system", "kubernetes.azure.com/node-image-version"=>"AKSUbuntu-1804gen2containerd-2021.07.03", "kubernetes.azure.com/os-sku"=>"Ubuntu", "kubernetes.azure.com/role"=>"agent", "kubernetes.io/arch"=>"amd64", "kubernetes.io/hostname"=>"correct-node", "kubernetes.io/os"=>"linux", "kubernetes.io/role"=>"agent", "node-role.kubernetes.io/agent"=>"", "node.kubernetes.io/instance-type"=>"Standard_DS2_v2",
"storageprofile"=>"managed", "storagetier"=>"Premium_LRS", "topology.kubernetes.io/region"=>"westus2", "topology.kubernetes.io/zone"=>"0"}], "Status"=>"Ready", "KubernetesProviderID"=>"azure", "LastTransitionTimeReady"=>"2021-07-21T23:40:24Z", "KubeletVersion"=>"v1.19.11", "KubeProxyVersion"=>"v1.19.11"}] => false, + ["oneagent.containerInsights.CONTAINER_NODE_INVENTORY_BLOB", {"CollectionTime"=>"~CollectionTime~", "Computer"=>"correct-node", "OperatingSystem"=>"Ubuntu 18.04.5 LTS", "DockerVersion"=>"containerd://1.4.4+azure"}] => false, + ["oneagent.containerInsights.LINUX_PERF_BLOB", {"Timestamp"=>"~Timestamp~", "Host"=>"correct-node", "Computer"=>"correct-node", "ObjectName"=>"K8SNode", "InstanceName"=>"None/correct-node", "json_Collections"=>"[{\"CounterName\":\"cpuAllocatableNanoCores\",\"Value\":1000000.0}]"}] => false, + ["oneagent.containerInsights.LINUX_PERF_BLOB", {"Timestamp"=>"~Timestamp~", "Host"=>"correct-node", "Computer"=>"correct-node", "ObjectName"=>"K8SNode", "InstanceName"=>"None/correct-node", "json_Collections"=>"[{\"CounterName\":\"memoryAllocatableBytes\",\"Value\":444.0}]"}] => false, + ["oneagent.containerInsights.LINUX_PERF_BLOB", {"Timestamp"=>"~Timestamp~", "Host"=>"correct-node", "Computer"=>"correct-node", "ObjectName"=>"K8SNode", "InstanceName"=>"None/correct-node", "json_Collections"=>"[{\"CounterName\":\"cpuCapacityNanoCores\",\"Value\":2000000.0}]"}] => false, + ["oneagent.containerInsights.LINUX_PERF_BLOB", {"Timestamp"=>"~Timestamp~", "Host"=>"correct-node", "Computer"=>"correct-node", "ObjectName"=>"K8SNode", "InstanceName"=>"None/correct-node", "json_Collections"=>"[{\"CounterName\":\"memoryCapacityBytes\",\"Value\":555.0}]"}] => false, + + # these records are for the malformed node (it doesn't have limits or requests set so there are no PERF records) + ["oneagent.containerInsights.KUBE_NODE_INVENTORY_BLOB", {"CollectionTime"=>"~CollectionTime~", "Computer"=>"malformed-node", "ClusterName"=>"/cluster-name", "ClusterId"=>"/cluster-id", "CreationTimeStamp"=>"2021-07-21T23:40:14Z", "Labels"=>[{"agentpool"=>"nodepool1", "beta.kubernetes.io/arch"=>"amd64", "beta.kubernetes.io/instance-type"=>"Standard_DS2_v2", "beta.kubernetes.io/os"=>"linux", "failure-domain.beta.kubernetes.io/region"=>"westus2", "failure-domain.beta.kubernetes.io/zone"=>"0", "kubernetes.azure.com/cluster"=>"MC_davidaks16_davidaks16_westus2", "kubernetes.azure.com/mode"=>"system", "kubernetes.azure.com/node-image-version"=>"AKSUbuntu-1804gen2containerd-2021.07.03", "kubernetes.azure.com/os-sku"=>"Ubuntu", "kubernetes.azure.com/role"=>"agent", "kubernetes.io/arch"=>"amd64", "kubernetes.io/hostname"=>"malformed-node", "kubernetes.io/os"=>"linux", "kubernetes.io/role"=>"agent", "node-role.kubernetes.io/agent"=>"", "node.kubernetes.io/instance-type"=>"Standard_DS2_v2", "storageprofile"=>"managed", "storagetier"=>"Premium_LRS", "topology.kubernetes.io/region"=>"westus2", "topology.kubernetes.io/zone"=>"0"}], "Status"=>"Ready", "KubernetesProviderID"=>"azure", "LastTransitionTimeReady"=>"2021-07-21T23:40:24Z", "KubeletVersion"=>"v1.19.11", "KubeProxyVersion"=>"v1.19.11"}] => false, + ["mdm.kubenodeinventory", {"CollectionTime"=>"~CollectionTime~", "Computer"=>"malformed-node", "ClusterName"=>"/cluster-name", "ClusterId"=>"/cluster-id", "CreationTimeStamp"=>"2021-07-21T23:40:14Z", "Labels"=>[{"agentpool"=>"nodepool1", "beta.kubernetes.io/arch"=>"amd64", "beta.kubernetes.io/instance-type"=>"Standard_DS2_v2", "beta.kubernetes.io/os"=>"linux", "failure-domain.beta.kubernetes.io/region"=>"westus2", 
"failure-domain.beta.kubernetes.io/zone"=>"0", "kubernetes.azure.com/cluster"=>"MC_davidaks16_davidaks16_westus2", "kubernetes.azure.com/mode"=>"system", "kubernetes.azure.com/node-image-version"=>"AKSUbuntu-1804gen2containerd-2021.07.03", "kubernetes.azure.com/os-sku"=>"Ubuntu", "kubernetes.azure.com/role"=>"agent", "kubernetes.io/arch"=>"amd64", "kubernetes.io/hostname"=>"malformed-node", "kubernetes.io/os"=>"linux", "kubernetes.io/role"=>"agent", "node-role.kubernetes.io/agent"=>"", "node.kubernetes.io/instance-type"=>"Standard_DS2_v2", "storageprofile"=>"managed", "storagetier"=>"Premium_LRS", "topology.kubernetes.io/region"=>"westus2", "topology.kubernetes.io/zone"=>"0"}], "Status"=>"Ready", "KubernetesProviderID"=>"azure", "LastTransitionTimeReady"=>"2021-07-21T23:40:24Z", "KubeletVersion"=>"v1.19.11", "KubeProxyVersion"=>"v1.19.11"}] => false, + ["oneagent.containerInsights.CONTAINER_NODE_INVENTORY_BLOB", {"CollectionTime"=>"~CollectionTime~", "Computer"=>"malformed-node", "OperatingSystem"=>"Ubuntu 18.04.5 LTS", "DockerVersion"=>"containerd://1.4.4+azure"}] => false + } + + d.events.each do |tag, time, record| + cleaned_record = overwrite_collection_time record + if expected_responses.key?([tag, cleaned_record]) + expected_responses[[tag, cleaned_record]] = true + end + # don't do anything if an unexpected record was emitted. Since the node spec is malformed, there will be some partial data. + # we care more that the non-malformed data is still emitted + end + + expected_responses.each do |key, val| + assert(val, "expected record not emitted: #{key}") + end + + kubeApiClient.verify + appInsightsUtil.verify + extensionUtils.verify + end +end diff --git a/source/plugins/utils/oms_common.rb b/source/plugins/ruby/oms_common.rb similarity index 100% rename from source/plugins/utils/oms_common.rb rename to source/plugins/ruby/oms_common.rb diff --git a/source/plugins/utils/omslog.rb b/source/plugins/ruby/omslog.rb similarity index 100% rename from source/plugins/utils/omslog.rb rename to source/plugins/ruby/omslog.rb diff --git a/test/unit-tests/canned-api-responses/kube-nodes-malformed.txt b/test/unit-tests/canned-api-responses/kube-nodes-malformed.txt new file mode 100644 index 000000000..bb4c61ca5 --- /dev/null +++ b/test/unit-tests/canned-api-responses/kube-nodes-malformed.txt @@ -0,0 +1,1674 @@ +{ + "kind"=>"NodeList", + "apiVersion"=>"v1", + "metadata"=>{ + "selfLink"=>"/api/v1/nodes", + "resourceVersion"=>"5974879" + }, + "items"=>[ + { + "metadata"=>{ + "name"=>"malformed-node", + "selfLink"=>"/api/v1/nodes/malformed-node", + "uid"=>"fe073f0a-e6bf-4d68-b4e5-ffaa42b91528", + "resourceVersion"=>"5974522", + "creationTimestamp"=>"2021-07-21T23:40:14Z", + "labels"=>{ + "agentpool"=>"nodepool1", + "beta.kubernetes.io/arch"=>"amd64", + "beta.kubernetes.io/instance-type"=>"Standard_DS2_v2", + "beta.kubernetes.io/os"=>"linux", + "failure-domain.beta.kubernetes.io/region"=>"westus2", + "failure-domain.beta.kubernetes.io/zone"=>"0", + "kubernetes.azure.com/cluster"=>"MC_davidaks16_davidaks16_westus2", + "kubernetes.azure.com/mode"=>"system", + "kubernetes.azure.com/node-image-version"=>"AKSUbuntu-1804gen2containerd-2021.07.03", + "kubernetes.azure.com/os-sku"=>"Ubuntu", + "kubernetes.azure.com/role"=>"agent", + "kubernetes.io/arch"=>"amd64", + "kubernetes.io/hostname"=>"malformed-node", + "kubernetes.io/os"=>"linux", + "kubernetes.io/role"=>"agent", + "node-role.kubernetes.io/agent"=>"", + "node.kubernetes.io/instance-type"=>"Standard_DS2_v2", + "storageprofile"=>"managed", + 
"storagetier"=>"Premium_LRS", + "topology.kubernetes.io/region"=>"westus2", + "topology.kubernetes.io/zone"=>"0" + }, + "annotations"=>{ + "node.alpha.kubernetes.io/ttl"=>"0", + "volumes.kubernetes.io/controller-managed-attach-detach"=>"true" + }, + "managedFields"=>[ + { + "manager"=>"kube-controller-manager", + "operation"=>"Update", + "apiVersion"=>"v1", + "time"=>"2021-07-21T23:40:20Z", + "fieldsType"=>"FieldsV1", + "fieldsV1"=>{ + "f:metadata"=>{ + "f:annotations"=>{ + "f:node.alpha.kubernetes.io/ttl"=>{} + } + } + } + }, + { + "manager"=>"kubelet", + "operation"=>"Update", + "apiVersion"=>"v1", + "time"=>"2021-07-21T23:40:24Z", + "fieldsType"=>"FieldsV1", + "fieldsV1"=>{ + "f:metadata"=>{ + "f:annotations"=>{ + "."=>{}, + "f:volumes.kubernetes.io/controller-managed-attach-detach"=>{} + }, + "f:labels"=>{ + "."=>{}, + "f:agentpool"=>{}, + "f:beta.kubernetes.io/arch"=>{}, + "f:beta.kubernetes.io/instance-type"=>{}, + "f:beta.kubernetes.io/os"=>{}, + "f:failure-domain.beta.kubernetes.io/region"=>{}, + "f:failure-domain.beta.kubernetes.io/zone"=>{}, + "f:kubernetes.azure.com/cluster"=>{}, + "f:kubernetes.azure.com/mode"=>{}, + "f:kubernetes.azure.com/node-image-version"=>{}, + "f:kubernetes.azure.com/os-sku"=>{}, + "f:kubernetes.azure.com/role"=>{}, + "f:kubernetes.io/arch"=>{}, + "f:kubernetes.io/hostname"=>{}, + "f:kubernetes.io/os"=>{}, + "f:node.kubernetes.io/instance-type"=>{}, + "f:storageprofile"=>{}, + "f:storagetier"=>{}, + "f:topology.kubernetes.io/region"=>{}, + "f:topology.kubernetes.io/zone"=>{} + } + }, + "f:spec"=>{ + "f:providerID"=>{} + }, + "f:status"=>{ + "f:addresses"=>{ + "."=>{}, + "k:{\"type\":\"Hostname\"}"=>{ + "."=>{}, + "f:address"=>{}, + "f:type"=>{} + }, + "k:{\"type\":\"InternalIP\"}"=>{ + "."=>{}, + "f:address"=>{}, + "f:type"=>{} + } + }, + "f:allocatable"=>{ + "."=>{}, + "f:attachable-volumes-azure-disk"=>{}, + "f:cpu"=>{}, + "f:ephemeral-storage"=>{}, + "f:hugepages-1Gi"=>{}, + "f:hugepages-2Mi"=>{}, + "f:memory"=>{}, + "f:pods"=>{} + }, + "f:capacity"=>{ + "."=>{}, + "f:attachable-volumes-azure-disk"=>{}, + "f:cpu"=>{}, + "f:ephemeral-storage"=>{}, + "f:hugepages-1Gi"=>{}, + "f:hugepages-2Mi"=>{}, + "f:memory"=>{}, + "f:pods"=>{} + }, + "f:conditions"=>{ + "."=>{}, + "k:{\"type\":\"DiskPressure\"}"=>{ + "."=>{}, + "f:lastHeartbeatTime"=>{}, + "f:lastTransitionTime"=>{}, + "f:message"=>{}, + "f:reason"=>{}, + "f:status"=>{}, + "f:type"=>{} + }, + "k:{\"type\":\"MemoryPressure\"}"=>{ + "."=>{}, + "f:lastHeartbeatTime"=>{}, + "f:lastTransitionTime"=>{}, + "f:message"=>{}, + "f:reason"=>{}, + "f:status"=>{}, + "f:type"=>{} + }, + "k:{\"type\":\"PIDPressure\"}"=>{ + "."=>{}, + "f:lastHeartbeatTime"=>{}, + "f:lastTransitionTime"=>{}, + "f:message"=>{}, + "f:reason"=>{}, + "f:status"=>{}, + "f:type"=>{} + }, + "k:{\"type\":\"Ready\"}"=>{ + "."=>{}, + "f:lastHeartbeatTime"=>{}, + "f:lastTransitionTime"=>{}, + "f:message"=>{}, + "f:reason"=>{}, + "f:status"=>{}, + "f:type"=>{} + } + }, + "f:config"=>{}, + "f:daemonEndpoints"=>{ + "f:kubeletEndpoint"=>{ + "f:Port"=>{} + } + }, + "f:images"=>{}, + "f:nodeInfo"=>{ + "f:architecture"=>{}, + "f:bootID"=>{}, + "f:containerRuntimeVersion"=>{}, + "f:kernelVersion"=>{}, + "f:kubeProxyVersion"=>{}, + "f:kubeletVersion"=>{}, + "f:machineID"=>{}, + "f:operatingSystem"=>{}, + "f:osImage"=>{}, + "f:systemUUID"=>{} + } + } + } + }, + { + "manager"=>"kubectl-label", + "operation"=>"Update", + "apiVersion"=>"v1", + "time"=>"2021-07-21T23:40:53Z", + "fieldsType"=>"FieldsV1", + "fieldsV1"=>{ + "f:metadata"=>{ + "f:labels"=>{ + 
"f:kubernetes.io/role"=>{}, + "f:node-role.kubernetes.io/agent"=>{} + } + } + } + }, + { + "manager"=>"node-problem-detector", + "operation"=>"Update", + "apiVersion"=>"v1", + "time"=>"2021-08-10T18:10:02Z", + "fieldsType"=>"FieldsV1", + "fieldsV1"=>{ + "f:status"=>{ + "f:conditions"=>{ + "k:{\"type\":\"ContainerRuntimeProblem\"}"=>{ + "."=>{}, + "f:lastHeartbeatTime"=>{}, + "f:lastTransitionTime"=>{}, + "f:message"=>{}, + "f:reason"=>{}, + "f:status"=>{}, + "f:type"=>{} + }, + "k:{\"type\":\"FilesystemCorruptionProblem\"}"=>{ + "."=>{}, + "f:lastHeartbeatTime"=>{}, + "f:lastTransitionTime"=>{}, + "f:message"=>{}, + "f:reason"=>{}, + "f:status"=>{}, + "f:type"=>{} + }, + "k:{\"type\":\"FreezeScheduled\"}"=>{ + "."=>{}, + "f:lastHeartbeatTime"=>{}, + "f:lastTransitionTime"=>{}, + "f:message"=>{}, + "f:reason"=>{}, + "f:status"=>{}, + "f:type"=>{} + }, + "k:{\"type\":\"FrequentContainerdRestart\"}"=>{ + "."=>{}, + "f:lastHeartbeatTime"=>{}, + "f:lastTransitionTime"=>{}, + "f:message"=>{}, + "f:reason"=>{}, + "f:status"=>{}, + "f:type"=>{} + }, + "k:{\"type\":\"FrequentDockerRestart\"}"=>{ + "."=>{}, + "f:lastHeartbeatTime"=>{}, + "f:lastTransitionTime"=>{}, + "f:message"=>{}, + "f:reason"=>{}, + "f:status"=>{}, + "f:type"=>{} + }, + "k:{\"type\":\"FrequentKubeletRestart\"}"=>{ + "."=>{}, + "f:lastHeartbeatTime"=>{}, + "f:lastTransitionTime"=>{}, + "f:message"=>{}, + "f:reason"=>{}, + "f:status"=>{}, + "f:type"=>{} + }, + "k:{\"type\":\"FrequentUnregisterNetDevice\"}"=>{ + "."=>{}, + "f:lastHeartbeatTime"=>{}, + "f:lastTransitionTime"=>{}, + "f:message"=>{}, + "f:reason"=>{}, + "f:status"=>{}, + "f:type"=>{} + }, + "k:{\"type\":\"KernelDeadlock\"}"=>{ + "."=>{}, + "f:lastHeartbeatTime"=>{}, + "f:lastTransitionTime"=>{}, + "f:message"=>{}, + "f:reason"=>{}, + "f:status"=>{}, + "f:type"=>{} + }, + "k:{\"type\":\"KubeletProblem\"}"=>{ + "."=>{}, + "f:lastHeartbeatTime"=>{}, + "f:lastTransitionTime"=>{}, + "f:message"=>{}, + "f:reason"=>{}, + "f:status"=>{}, + "f:type"=>{} + }, + "k:{\"type\":\"PreemptScheduled\"}"=>{ + "."=>{}, + "f:lastHeartbeatTime"=>{}, + "f:lastTransitionTime"=>{}, + "f:message"=>{}, + "f:reason"=>{}, + "f:status"=>{}, + "f:type"=>{} + }, + "k:{\"type\":\"ReadonlyFilesystem\"}"=>{ + "."=>{}, + "f:lastHeartbeatTime"=>{}, + "f:lastTransitionTime"=>{}, + "f:message"=>{}, + "f:reason"=>{}, + "f:status"=>{}, + "f:type"=>{} + }, + "k:{\"type\":\"RebootScheduled\"}"=>{ + "."=>{}, + "f:lastHeartbeatTime"=>{}, + "f:lastTransitionTime"=>{}, + "f:message"=>{}, + "f:reason"=>{}, + "f:status"=>{}, + "f:type"=>{} + }, + "k:{\"type\":\"RedeployScheduled\"}"=>{ + "."=>{}, + "f:lastHeartbeatTime"=>{}, + "f:lastTransitionTime"=>{}, + "f:message"=>{}, + "f:reason"=>{}, + "f:status"=>{}, + "f:type"=>{} + }, + "k:{\"type\":\"TerminateScheduled\"}"=>{ + "."=>{}, + "f:lastHeartbeatTime"=>{}, + "f:lastTransitionTime"=>{}, + "f:message"=>{}, + "f:reason"=>{}, + "f:status"=>{}, + "f:type"=>{} + } + } + } + } + } + ] + }, + "spec"=>{ + "providerID"=>"azure:///subscriptions/3b875bf3-0eec-4d8c-bdee-25c7ccc1f130/resourceGroups/mc_davidaks16_davidaks16_westus2/providers/Microsoft.Compute/virtualMachineScaleSets/aks-nodepool1-24816391-vmss/virtualMachines/0" + }, + "status"=>{ + "conditions"=>[ + { + "type"=>"FrequentDockerRestart", + "status"=>"False", + "lastHeartbeatTime"=>"2021-08-17T19:25:56Z", + "lastTransitionTime"=>"2021-08-10T18:10:01Z", + "reason"=>"NoFrequentDockerRestart", + "message"=>"docker is functioning properly" + }, + { + "type"=>"FilesystemCorruptionProblem", + "status"=>"False", + 
"lastHeartbeatTime"=>"2021-08-17T19:25:56Z", + "lastTransitionTime"=>"2021-08-10T18:10:01Z", + "reason"=>"FilesystemIsOK", + "message"=>"Filesystem is healthy" + }, + { + "type"=>"KernelDeadlock", + "status"=>"False", + "lastHeartbeatTime"=>"2021-08-17T19:25:56Z", + "lastTransitionTime"=>"2021-08-10T18:10:01Z", + "reason"=>"KernelHasNoDeadlock", + "message"=>"kernel has no deadlock" + }, + { + "type"=>"FrequentContainerdRestart", + "status"=>"False", + "lastHeartbeatTime"=>"2021-08-17T19:25:56Z", + "lastTransitionTime"=>"2021-08-10T18:10:01Z", + "reason"=>"NoFrequentContainerdRestart", + "message"=>"containerd is functioning properly" + }, + { + "type"=>"FreezeScheduled", + "status"=>"False", + "lastHeartbeatTime"=>"2021-08-17T19:25:56Z", + "lastTransitionTime"=>"2021-08-11T23:25:04Z", + "reason"=>"NoFreezeScheduled", + "message"=>"VM has no scheduled Freeze event" + }, + { + "type"=>"FrequentUnregisterNetDevice", + "status"=>"False", + "lastHeartbeatTime"=>"2021-08-17T19:25:56Z", + "lastTransitionTime"=>"2021-08-10T18:10:01Z", + "reason"=>"NoFrequentUnregisterNetDevice", + "message"=>"node is functioning properly" + }, + { + "type"=>"TerminateScheduled", + "status"=>"False", + "lastHeartbeatTime"=>"2021-08-17T19:25:56Z", + "lastTransitionTime"=>"2021-08-10T18:10:01Z", + "reason"=>"NoTerminateScheduled", + "message"=>"VM has no scheduled Terminate event" + }, + { + "type"=>"ReadonlyFilesystem", + "status"=>"False", + "lastHeartbeatTime"=>"2021-08-17T19:25:56Z", + "lastTransitionTime"=>"2021-08-10T18:10:01Z", + "reason"=>"FilesystemIsNotReadOnly", + "message"=>"Filesystem is not read-only" + }, + { + "type"=>"RedeployScheduled", + "status"=>"False", + "lastHeartbeatTime"=>"2021-08-17T19:25:56Z", + "lastTransitionTime"=>"2021-08-10T18:10:01Z", + "reason"=>"NoRedeployScheduled", + "message"=>"VM has no scheduled Redeploy event" + }, + { + "type"=>"KubeletProblem", + "status"=>"False", + "lastHeartbeatTime"=>"2021-08-17T19:25:56Z", + "lastTransitionTime"=>"2021-08-10T18:10:01Z", + "reason"=>"KubeletIsUp", + "message"=>"kubelet service is up" + }, + { + "type"=>"PreemptScheduled", + "status"=>"False", + "lastHeartbeatTime"=>"2021-08-17T19:25:56Z", + "lastTransitionTime"=>"2021-08-10T18:11:11Z", + "reason"=>"NoPreemptScheduled", + "message"=>"VM has no scheduled Preempt event" + }, + { + "type"=>"RebootScheduled", + "status"=>"False", + "lastHeartbeatTime"=>"2021-08-17T19:25:56Z", + "lastTransitionTime"=>"2021-08-10T18:10:01Z", + "reason"=>"NoRebootScheduled", + "message"=>"VM has no scheduled Reboot event" + }, + { + "type"=>"ContainerRuntimeProblem", + "status"=>"False", + "lastHeartbeatTime"=>"2021-08-17T19:25:56Z", + "lastTransitionTime"=>"2021-08-10T18:10:01Z", + "reason"=>"ContainerRuntimeIsUp", + "message"=>"container runtime service is up" + }, + { + "type"=>"FrequentKubeletRestart", + "status"=>"False", + "lastHeartbeatTime"=>"2021-08-17T19:25:56Z", + "lastTransitionTime"=>"2021-08-10T18:10:01Z", + "reason"=>"NoFrequentKubeletRestart", + "message"=>"kubelet is functioning properly" + }, + { + "type"=>"MemoryPressure", + "status"=>"False", + "lastHeartbeatTime"=>"2021-08-17T19:28:21Z", + "lastTransitionTime"=>"2021-07-21T23:40:14Z", + "reason"=>"KubeletHasSufficientMemory", + "message"=>"kubelet has sufficient memory available" + }, + { + "type"=>"DiskPressure", + "status"=>"False", + "lastHeartbeatTime"=>"2021-08-17T19:28:21Z", + "lastTransitionTime"=>"2021-07-21T23:40:14Z", + "reason"=>"KubeletHasNoDiskPressure", + "message"=>"kubelet has no disk pressure" + }, + { + 
"type"=>"PIDPressure", + "status"=>"False", + "lastHeartbeatTime"=>"2021-08-17T19:28:21Z", + "lastTransitionTime"=>"2021-07-21T23:40:14Z", + "reason"=>"KubeletHasSufficientPID", + "message"=>"kubelet has sufficient PID available" + }, + { + "type"=>"Ready", + "status"=>"True", + "lastHeartbeatTime"=>"2021-08-17T19:28:21Z", + "lastTransitionTime"=>"2021-07-21T23:40:24Z", + "reason"=>"KubeletReady", + "message"=>"kubelet is posting ready status. AppArmor enabled" + } + ], + "addresses"=>[ + { + "type"=>"Hostname", + "address"=>"malformed-node" + }, + { + "type"=>"InternalIP", + "address"=>"10.240.0.4" + } + ], + "daemonEndpoints"=>{ + "kubeletEndpoint"=>{ + "Port"=>10250 + } + }, + "nodeInfo"=>{ + "machineID"=>"17a654260e2c4a9bb3a3eb4b4188e4b4", + "systemUUID"=>"7ff599e4-909e-4950-a044-ff8613af3af9", + "bootID"=>"02bb865b-a469-43cd-8b0b-5ceb4ecd80b0", + "kernelVersion"=>"5.4.0-1051-azure", + "osImage"=>"Ubuntu 18.04.5 LTS", + "containerRuntimeVersion"=>"containerd://1.4.4+azure", + "kubeletVersion"=>"v1.19.11", + "kubeProxyVersion"=>"v1.19.11", + "operatingSystem"=>"linux", + "architecture"=>"amd64" + }, + "images"=>[ + { + "names"=>[ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod06112021-1" + ], + "sizeBytes"=>331689060 + }, + { + "names"=>[ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod06112021" + ], + "sizeBytes"=>330099815 + }, + { + "names"=>[ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod05202021-hotfix" + ], + "sizeBytes"=>271471426 + }, + { + "names"=>[ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod05202021" + ], + "sizeBytes"=>269703297 + }, + { + "names"=>[ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod03262021" + ], + "sizeBytes"=>264732875 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/kubernetes/ingress/nginx-ingress-controller:0.19.0" + ], + "sizeBytes"=>166352383 + }, + { + "names"=>[ + "mcr.microsoft.com/aks/hcp/hcp-tunnel-front:master.210623.2" + ], + "sizeBytes"=>147750148 + }, + { + "names"=>[ + "mcr.microsoft.com/aks/hcp/hcp-tunnel-front:master.210524.1" + ], + "sizeBytes"=>146446618 + }, + { + "names"=>[ + "mcr.microsoft.com/aks/hcp/hcp-tunnel-front:master.210427.1" + ], + "sizeBytes"=>136242776 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/calico/node:v3.8.9.5" + ], + "sizeBytes"=>101794833 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/kubernetes/ingress/nginx-ingress-controller:0.47.0" + ], + "sizeBytes"=>101445696 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/kubernetes/autoscaler/cluster-proportional-autoscaler:1.3.0_v0.0.5" + ], + "sizeBytes"=>101194562 + }, + { + "names"=>[ + "mcr.microsoft.com/aks/hcp/tunnel-openvpn:master.210623.2" + ], + "sizeBytes"=>96125176 + }, + { + "names"=>[ + "mcr.microsoft.com/aks/hcp/tunnel-openvpn:master.210524.1" + ], + "sizeBytes"=>95879501 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/kubernetes/exechealthz:1.2_v0.0.5" + ], + "sizeBytes"=>94348102 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/calico/node:v3.8.9.2" + ], + "sizeBytes"=>93537927 + }, + { + "names"=>[ + "mcr.microsoft.com/aks/acc/sgx-attestation:2.0" + ], + "sizeBytes"=>91841669 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/kubernetes-csi/azurefile-csi:v1.4.0" + ], + "sizeBytes"=>91324193 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/kubernetes-csi/azurefile-csi:v1.2.0" + ], + "sizeBytes"=>89103171 + }, + { + "names"=>[ + "mcr.microsoft.com/azure-application-gateway/kubernetes-ingress:1.0.1-rc3" + ], + "sizeBytes"=>86839805 + }, + { + "names"=>[ + 
"mcr.microsoft.com/azure-application-gateway/kubernetes-ingress:1.2.0" + ], + "sizeBytes"=>86488586 + }, + { + "names"=>[ + "mcr.microsoft.com/aks/hcp/tunnel-openvpn:master.210427.1" + ], + "sizeBytes"=>86120048 + }, + { + "names"=>[ + "mcr.microsoft.com/azure-application-gateway/kubernetes-ingress:1.3.0" + ], + "sizeBytes"=>81252495 + }, + { + "names"=>[ + "mcr.microsoft.com/azure-application-gateway/kubernetes-ingress:1.4.0" + ], + "sizeBytes"=>79586703 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/kubernetes-csi/azuredisk-csi:v1.4.0" + ], + "sizeBytes"=>78795016 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/kubernetes-csi/azuredisk-csi:v1.2.0" + ], + "sizeBytes"=>76527179 + }, + { + "names"=>[ + "mcr.microsoft.com/containernetworking/azure-npm:v1.1.8" + ], + "sizeBytes"=>75025803 + }, + { + "names"=>[ + "mcr.microsoft.com/containernetworking/azure-npm:v1.2.2_hotfix" + ], + "sizeBytes"=>73533889 + }, + { + "names"=>[ + "mcr.microsoft.com/containernetworking/azure-npm:v1.3.1" + ], + "sizeBytes"=>72242894 + }, + { + "names"=>[ + "mcr.microsoft.com/containernetworking/azure-npm:v1.2.8" + ], + "sizeBytes"=>70622822 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/nvidia/k8s-device-plugin:v0.9.0" + ], + "sizeBytes"=>67291599 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/kubernetes/dashboard:v2.0.1" + ], + "sizeBytes"=>66415836 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/kubernetes/dashboard:v2.0.0-rc7" + ], + "sizeBytes"=>65965658 + }, + { + "names"=>[ + "mcr.microsoft.com/containernetworking/azure-npm:v1.2.1" + ], + "sizeBytes"=>64123775 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/calico/cni:v3.8.9.3" + ], + "sizeBytes"=>63581323 + }, + { + "names"=>[ + "mcr.microsoft.com/containernetworking/networkmonitor:v1.1.8" + ], + "sizeBytes"=>63154716 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/calico/cni:v3.8.9.2" + ], + "sizeBytes"=>61626312 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/calico/node:v3.18.1" + ], + "sizeBytes"=>60500885 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/calico/node:v3.17.2" + ], + "sizeBytes"=>58419768 + }, + { + "names"=>[ + "mcr.microsoft.com/containernetworking/networkmonitor:v1.1.8_hotfix", + "mcr.microsoft.com/containernetworking/networkmonitor:v1.1.8post2" + ], + "sizeBytes"=>56368756 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/kubernetes/kube-proxy@sha256:282543237a1aa3f407656290f454b7068a92e1abe2156082c750d5abfbcad90c", + "mcr.microsoft.com/oss/kubernetes/kube-proxy:v1.19.11-hotfix.20210526.2" + ], + "sizeBytes"=>56310724 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/calico/node:v3.19.0" + ], + "sizeBytes"=>55228749 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/kubernetes/kube-proxy:v1.19.11-hotfix.20210526.1" + ], + "sizeBytes"=>54692048 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/kubernetes/dashboard:v2.0.0-rc3" + ], + "sizeBytes"=>50803639 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/kubernetes-csi/secrets-store/driver:v0.0.19" + ], + "sizeBytes"=>49759361 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/azure/aad-pod-identity/nmi:v1.7.5" + ], + "sizeBytes"=>49704644 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/kubernetes-csi/secrets-store/driver:v0.0.21" + ], + "sizeBytes"=>49372390 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/kubernetes/kube-proxy@sha256:a64d3538b72905b07356881314755b02db3675ff47ee2bcc49dd7be856e285d5", + "mcr.microsoft.com/oss/kubernetes/kube-proxy:v1.19.11-hotfix.20210526" + ], + "sizeBytes"=>49322942 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/azure/aad-pod-identity/nmi:v1.7.4" + ], + "sizeBytes"=>48108311 + }, 
+ { + "names"=>[ + "mcr.microsoft.com/oss/kubernetes/kubernetes-dashboard:v1.10.1" + ], + "sizeBytes"=>44907744 + } + ], + "config"=>{} + } + }, + { + "metadata"=>{ + "name"=>"correct-node", + "selfLink"=>"/api/v1/nodes/correct-node", + "uid"=>"fe073f0a-e6bf-4d68-b4e5-ffaa42b91528", + "resourceVersion"=>"5974522", + "creationTimestamp"=>"2021-07-21T23:40:14Z", + "labels"=>{ + "agentpool"=>"nodepool1", + "beta.kubernetes.io/arch"=>"amd64", + "beta.kubernetes.io/instance-type"=>"Standard_DS2_v2", + "beta.kubernetes.io/os"=>"linux", + "failure-domain.beta.kubernetes.io/region"=>"westus2", + "failure-domain.beta.kubernetes.io/zone"=>"0", + "kubernetes.azure.com/cluster"=>"MC_davidaks16_davidaks16_westus2", + "kubernetes.azure.com/mode"=>"system", + "kubernetes.azure.com/node-image-version"=>"AKSUbuntu-1804gen2containerd-2021.07.03", + "kubernetes.azure.com/os-sku"=>"Ubuntu", + "kubernetes.azure.com/role"=>"agent", + "kubernetes.io/arch"=>"amd64", + "kubernetes.io/hostname"=>"correct-node", + "kubernetes.io/os"=>"linux", + "kubernetes.io/role"=>"agent", + "node-role.kubernetes.io/agent"=>"", + "node.kubernetes.io/instance-type"=>"Standard_DS2_v2", + "storageprofile"=>"managed", + "storagetier"=>"Premium_LRS", + "topology.kubernetes.io/region"=>"westus2", + "topology.kubernetes.io/zone"=>"0" + }, + "annotations"=>{ + "node.alpha.kubernetes.io/ttl"=>"0", + "volumes.kubernetes.io/controller-managed-attach-detach"=>"true" + }, + "managedFields"=>[ + { + "manager"=>"kube-controller-manager", + "operation"=>"Update", + "apiVersion"=>"v1", + "time"=>"2021-07-21T23:40:20Z", + "fieldsType"=>"FieldsV1", + "fieldsV1"=>{ + "f:metadata"=>{ + "f:annotations"=>{ + "f:node.alpha.kubernetes.io/ttl"=>{} + } + } + } + }, + { + "manager"=>"kubelet", + "operation"=>"Update", + "apiVersion"=>"v1", + "time"=>"2021-07-21T23:40:24Z", + "fieldsType"=>"FieldsV1", + "fieldsV1"=>{ + "f:metadata"=>{ + "f:annotations"=>{ + "."=>{}, + "f:volumes.kubernetes.io/controller-managed-attach-detach"=>{} + }, + "f:labels"=>{ + "."=>{}, + "f:agentpool"=>{}, + "f:beta.kubernetes.io/arch"=>{}, + "f:beta.kubernetes.io/instance-type"=>{}, + "f:beta.kubernetes.io/os"=>{}, + "f:failure-domain.beta.kubernetes.io/region"=>{}, + "f:failure-domain.beta.kubernetes.io/zone"=>{}, + "f:kubernetes.azure.com/cluster"=>{}, + "f:kubernetes.azure.com/mode"=>{}, + "f:kubernetes.azure.com/node-image-version"=>{}, + "f:kubernetes.azure.com/os-sku"=>{}, + "f:kubernetes.azure.com/role"=>{}, + "f:kubernetes.io/arch"=>{}, + "f:kubernetes.io/hostname"=>{}, + "f:kubernetes.io/os"=>{}, + "f:node.kubernetes.io/instance-type"=>{}, + "f:storageprofile"=>{}, + "f:storagetier"=>{}, + "f:topology.kubernetes.io/region"=>{}, + "f:topology.kubernetes.io/zone"=>{} + } + }, + "f:spec"=>{ + "f:providerID"=>{} + }, + "f:status"=>{ + "f:addresses"=>{ + "."=>{}, + "k:{\"type\":\"Hostname\"}"=>{ + "."=>{}, + "f:address"=>{}, + "f:type"=>{} + }, + "k:{\"type\":\"InternalIP\"}"=>{ + "."=>{}, + "f:address"=>{}, + "f:type"=>{} + } + }, + "f:allocatable"=>{ + "."=>{}, + "f:attachable-volumes-azure-disk"=>{}, + "f:cpu"=>{}, + "f:ephemeral-storage"=>{}, + "f:hugepages-1Gi"=>{}, + "f:hugepages-2Mi"=>{}, + "f:memory"=>{}, + "f:pods"=>{} + }, + "f:capacity"=>{ + "."=>{}, + "f:attachable-volumes-azure-disk"=>{}, + "f:cpu"=>{}, + "f:ephemeral-storage"=>{}, + "f:hugepages-1Gi"=>{}, + "f:hugepages-2Mi"=>{}, + "f:memory"=>{}, + "f:pods"=>{} + }, + "f:conditions"=>{ + "."=>{}, + "k:{\"type\":\"DiskPressure\"}"=>{ + "."=>{}, + "f:lastHeartbeatTime"=>{}, + "f:lastTransitionTime"=>{}, + 
"f:message"=>{}, + "f:reason"=>{}, + "f:status"=>{}, + "f:type"=>{} + }, + "k:{\"type\":\"MemoryPressure\"}"=>{ + "."=>{}, + "f:lastHeartbeatTime"=>{}, + "f:lastTransitionTime"=>{}, + "f:message"=>{}, + "f:reason"=>{}, + "f:status"=>{}, + "f:type"=>{} + }, + "k:{\"type\":\"PIDPressure\"}"=>{ + "."=>{}, + "f:lastHeartbeatTime"=>{}, + "f:lastTransitionTime"=>{}, + "f:message"=>{}, + "f:reason"=>{}, + "f:status"=>{}, + "f:type"=>{} + }, + "k:{\"type\":\"Ready\"}"=>{ + "."=>{}, + "f:lastHeartbeatTime"=>{}, + "f:lastTransitionTime"=>{}, + "f:message"=>{}, + "f:reason"=>{}, + "f:status"=>{}, + "f:type"=>{} + } + }, + "f:config"=>{}, + "f:daemonEndpoints"=>{ + "f:kubeletEndpoint"=>{ + "f:Port"=>{} + } + }, + "f:images"=>{}, + "f:nodeInfo"=>{ + "f:architecture"=>{}, + "f:bootID"=>{}, + "f:containerRuntimeVersion"=>{}, + "f:kernelVersion"=>{}, + "f:kubeProxyVersion"=>{}, + "f:kubeletVersion"=>{}, + "f:machineID"=>{}, + "f:operatingSystem"=>{}, + "f:osImage"=>{}, + "f:systemUUID"=>{} + } + } + } + }, + { + "manager"=>"kubectl-label", + "operation"=>"Update", + "apiVersion"=>"v1", + "time"=>"2021-07-21T23:40:53Z", + "fieldsType"=>"FieldsV1", + "fieldsV1"=>{ + "f:metadata"=>{ + "f:labels"=>{ + "f:kubernetes.io/role"=>{}, + "f:node-role.kubernetes.io/agent"=>{} + } + } + } + }, + { + "manager"=>"node-problem-detector", + "operation"=>"Update", + "apiVersion"=>"v1", + "time"=>"2021-08-10T18:10:02Z", + "fieldsType"=>"FieldsV1", + "fieldsV1"=>{ + "f:status"=>{ + "f:conditions"=>{ + "k:{\"type\":\"ContainerRuntimeProblem\"}"=>{ + "."=>{}, + "f:lastHeartbeatTime"=>{}, + "f:lastTransitionTime"=>{}, + "f:message"=>{}, + "f:reason"=>{}, + "f:status"=>{}, + "f:type"=>{} + }, + "k:{\"type\":\"FilesystemCorruptionProblem\"}"=>{ + "."=>{}, + "f:lastHeartbeatTime"=>{}, + "f:lastTransitionTime"=>{}, + "f:message"=>{}, + "f:reason"=>{}, + "f:status"=>{}, + "f:type"=>{} + }, + "k:{\"type\":\"FreezeScheduled\"}"=>{ + "."=>{}, + "f:lastHeartbeatTime"=>{}, + "f:lastTransitionTime"=>{}, + "f:message"=>{}, + "f:reason"=>{}, + "f:status"=>{}, + "f:type"=>{} + }, + "k:{\"type\":\"FrequentContainerdRestart\"}"=>{ + "."=>{}, + "f:lastHeartbeatTime"=>{}, + "f:lastTransitionTime"=>{}, + "f:message"=>{}, + "f:reason"=>{}, + "f:status"=>{}, + "f:type"=>{} + }, + "k:{\"type\":\"FrequentDockerRestart\"}"=>{ + "."=>{}, + "f:lastHeartbeatTime"=>{}, + "f:lastTransitionTime"=>{}, + "f:message"=>{}, + "f:reason"=>{}, + "f:status"=>{}, + "f:type"=>{} + }, + "k:{\"type\":\"FrequentKubeletRestart\"}"=>{ + "."=>{}, + "f:lastHeartbeatTime"=>{}, + "f:lastTransitionTime"=>{}, + "f:message"=>{}, + "f:reason"=>{}, + "f:status"=>{}, + "f:type"=>{} + }, + "k:{\"type\":\"FrequentUnregisterNetDevice\"}"=>{ + "."=>{}, + "f:lastHeartbeatTime"=>{}, + "f:lastTransitionTime"=>{}, + "f:message"=>{}, + "f:reason"=>{}, + "f:status"=>{}, + "f:type"=>{} + }, + "k:{\"type\":\"KernelDeadlock\"}"=>{ + "."=>{}, + "f:lastHeartbeatTime"=>{}, + "f:lastTransitionTime"=>{}, + "f:message"=>{}, + "f:reason"=>{}, + "f:status"=>{}, + "f:type"=>{} + }, + "k:{\"type\":\"KubeletProblem\"}"=>{ + "."=>{}, + "f:lastHeartbeatTime"=>{}, + "f:lastTransitionTime"=>{}, + "f:message"=>{}, + "f:reason"=>{}, + "f:status"=>{}, + "f:type"=>{} + }, + "k:{\"type\":\"PreemptScheduled\"}"=>{ + "."=>{}, + "f:lastHeartbeatTime"=>{}, + "f:lastTransitionTime"=>{}, + "f:message"=>{}, + "f:reason"=>{}, + "f:status"=>{}, + "f:type"=>{} + }, + "k:{\"type\":\"ReadonlyFilesystem\"}"=>{ + "."=>{}, + "f:lastHeartbeatTime"=>{}, + "f:lastTransitionTime"=>{}, + "f:message"=>{}, + "f:reason"=>{}, + 
"f:status"=>{}, + "f:type"=>{} + }, + "k:{\"type\":\"RebootScheduled\"}"=>{ + "."=>{}, + "f:lastHeartbeatTime"=>{}, + "f:lastTransitionTime"=>{}, + "f:message"=>{}, + "f:reason"=>{}, + "f:status"=>{}, + "f:type"=>{} + }, + "k:{\"type\":\"RedeployScheduled\"}"=>{ + "."=>{}, + "f:lastHeartbeatTime"=>{}, + "f:lastTransitionTime"=>{}, + "f:message"=>{}, + "f:reason"=>{}, + "f:status"=>{}, + "f:type"=>{} + }, + "k:{\"type\":\"TerminateScheduled\"}"=>{ + "."=>{}, + "f:lastHeartbeatTime"=>{}, + "f:lastTransitionTime"=>{}, + "f:message"=>{}, + "f:reason"=>{}, + "f:status"=>{}, + "f:type"=>{} + } + } + } + } + } + ] + }, + "spec"=>{ + "providerID"=>"azure:///subscriptions/3b875bf3-0eec-4d8c-bdee-25c7ccc1f130/resourceGroups/mc_davidaks16_davidaks16_westus2/providers/Microsoft.Compute/virtualMachineScaleSets/aks-nodepool1-24816391-vmss/virtualMachines/0" + }, + "status"=>{ + "capacity"=>{ + "attachable-volumes-azure-disk"=>"8", + "cpu"=>"2m", + "ephemeral-storage"=>"666", + "hugepages-1Gi"=>"0", + "hugepages-2Mi"=>"0", + "memory"=>"555", + "pods"=>"30" + }, + "allocatable"=>{ + "attachable-volumes-azure-disk"=>"8", + "cpu"=>"1m", + "ephemeral-storage"=>"333", + "hugepages-1Gi"=>"0", + "hugepages-2Mi"=>"0", + "memory"=>"444", + "pods"=>"30" + }, + "conditions"=>[ + { + "type"=>"FrequentDockerRestart", + "status"=>"False", + "lastHeartbeatTime"=>"2021-08-17T19:25:56Z", + "lastTransitionTime"=>"2021-08-10T18:10:01Z", + "reason"=>"NoFrequentDockerRestart", + "message"=>"docker is functioning properly" + }, + { + "type"=>"FilesystemCorruptionProblem", + "status"=>"False", + "lastHeartbeatTime"=>"2021-08-17T19:25:56Z", + "lastTransitionTime"=>"2021-08-10T18:10:01Z", + "reason"=>"FilesystemIsOK", + "message"=>"Filesystem is healthy" + }, + { + "type"=>"KernelDeadlock", + "status"=>"False", + "lastHeartbeatTime"=>"2021-08-17T19:25:56Z", + "lastTransitionTime"=>"2021-08-10T18:10:01Z", + "reason"=>"KernelHasNoDeadlock", + "message"=>"kernel has no deadlock" + }, + { + "type"=>"FrequentContainerdRestart", + "status"=>"False", + "lastHeartbeatTime"=>"2021-08-17T19:25:56Z", + "lastTransitionTime"=>"2021-08-10T18:10:01Z", + "reason"=>"NoFrequentContainerdRestart", + "message"=>"containerd is functioning properly" + }, + { + "type"=>"FreezeScheduled", + "status"=>"False", + "lastHeartbeatTime"=>"2021-08-17T19:25:56Z", + "lastTransitionTime"=>"2021-08-11T23:25:04Z", + "reason"=>"NoFreezeScheduled", + "message"=>"VM has no scheduled Freeze event" + }, + { + "type"=>"FrequentUnregisterNetDevice", + "status"=>"False", + "lastHeartbeatTime"=>"2021-08-17T19:25:56Z", + "lastTransitionTime"=>"2021-08-10T18:10:01Z", + "reason"=>"NoFrequentUnregisterNetDevice", + "message"=>"node is functioning properly" + }, + { + "type"=>"TerminateScheduled", + "status"=>"False", + "lastHeartbeatTime"=>"2021-08-17T19:25:56Z", + "lastTransitionTime"=>"2021-08-10T18:10:01Z", + "reason"=>"NoTerminateScheduled", + "message"=>"VM has no scheduled Terminate event" + }, + { + "type"=>"ReadonlyFilesystem", + "status"=>"False", + "lastHeartbeatTime"=>"2021-08-17T19:25:56Z", + "lastTransitionTime"=>"2021-08-10T18:10:01Z", + "reason"=>"FilesystemIsNotReadOnly", + "message"=>"Filesystem is not read-only" + }, + { + "type"=>"RedeployScheduled", + "status"=>"False", + "lastHeartbeatTime"=>"2021-08-17T19:25:56Z", + "lastTransitionTime"=>"2021-08-10T18:10:01Z", + "reason"=>"NoRedeployScheduled", + "message"=>"VM has no scheduled Redeploy event" + }, + { + "type"=>"KubeletProblem", + "status"=>"False", + "lastHeartbeatTime"=>"2021-08-17T19:25:56Z", + 
"lastTransitionTime"=>"2021-08-10T18:10:01Z", + "reason"=>"KubeletIsUp", + "message"=>"kubelet service is up" + }, + { + "type"=>"PreemptScheduled", + "status"=>"False", + "lastHeartbeatTime"=>"2021-08-17T19:25:56Z", + "lastTransitionTime"=>"2021-08-10T18:11:11Z", + "reason"=>"NoPreemptScheduled", + "message"=>"VM has no scheduled Preempt event" + }, + { + "type"=>"RebootScheduled", + "status"=>"False", + "lastHeartbeatTime"=>"2021-08-17T19:25:56Z", + "lastTransitionTime"=>"2021-08-10T18:10:01Z", + "reason"=>"NoRebootScheduled", + "message"=>"VM has no scheduled Reboot event" + }, + { + "type"=>"ContainerRuntimeProblem", + "status"=>"False", + "lastHeartbeatTime"=>"2021-08-17T19:25:56Z", + "lastTransitionTime"=>"2021-08-10T18:10:01Z", + "reason"=>"ContainerRuntimeIsUp", + "message"=>"container runtime service is up" + }, + { + "type"=>"FrequentKubeletRestart", + "status"=>"False", + "lastHeartbeatTime"=>"2021-08-17T19:25:56Z", + "lastTransitionTime"=>"2021-08-10T18:10:01Z", + "reason"=>"NoFrequentKubeletRestart", + "message"=>"kubelet is functioning properly" + }, + { + "type"=>"MemoryPressure", + "status"=>"False", + "lastHeartbeatTime"=>"2021-08-17T19:28:21Z", + "lastTransitionTime"=>"2021-07-21T23:40:14Z", + "reason"=>"KubeletHasSufficientMemory", + "message"=>"kubelet has sufficient memory available" + }, + { + "type"=>"DiskPressure", + "status"=>"False", + "lastHeartbeatTime"=>"2021-08-17T19:28:21Z", + "lastTransitionTime"=>"2021-07-21T23:40:14Z", + "reason"=>"KubeletHasNoDiskPressure", + "message"=>"kubelet has no disk pressure" + }, + { + "type"=>"PIDPressure", + "status"=>"False", + "lastHeartbeatTime"=>"2021-08-17T19:28:21Z", + "lastTransitionTime"=>"2021-07-21T23:40:14Z", + "reason"=>"KubeletHasSufficientPID", + "message"=>"kubelet has sufficient PID available" + }, + { + "type"=>"Ready", + "status"=>"True", + "lastHeartbeatTime"=>"2021-08-17T19:28:21Z", + "lastTransitionTime"=>"2021-07-21T23:40:24Z", + "reason"=>"KubeletReady", + "message"=>"kubelet is posting ready status. 
AppArmor enabled" + } + ], + "addresses"=>[ + { + "type"=>"Hostname", + "address"=>"correct-node" + }, + { + "type"=>"InternalIP", + "address"=>"10.240.0.4" + } + ], + "daemonEndpoints"=>{ + "kubeletEndpoint"=>{ + "Port"=>10250 + } + }, + "nodeInfo"=>{ + "machineID"=>"17a654260e2c4a9bb3a3eb4b4188e4b4", + "systemUUID"=>"7ff599e4-909e-4950-a044-ff8613af3af9", + "bootID"=>"02bb865b-a469-43cd-8b0b-5ceb4ecd80b0", + "kernelVersion"=>"5.4.0-1051-azure", + "osImage"=>"Ubuntu 18.04.5 LTS", + "containerRuntimeVersion"=>"containerd://1.4.4+azure", + "kubeletVersion"=>"v1.19.11", + "kubeProxyVersion"=>"v1.19.11", + "operatingSystem"=>"linux", + "architecture"=>"amd64" + }, + "images"=>[ + { + "names"=>[ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod06112021-1" + ], + "sizeBytes"=>331689060 + }, + { + "names"=>[ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod06112021" + ], + "sizeBytes"=>330099815 + }, + { + "names"=>[ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod05202021-hotfix" + ], + "sizeBytes"=>271471426 + }, + { + "names"=>[ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod05202021" + ], + "sizeBytes"=>269703297 + }, + { + "names"=>[ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod03262021" + ], + "sizeBytes"=>264732875 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/kubernetes/ingress/nginx-ingress-controller:0.19.0" + ], + "sizeBytes"=>166352383 + }, + { + "names"=>[ + "mcr.microsoft.com/aks/hcp/hcp-tunnel-front:master.210623.2" + ], + "sizeBytes"=>147750148 + }, + { + "names"=>[ + "mcr.microsoft.com/aks/hcp/hcp-tunnel-front:master.210524.1" + ], + "sizeBytes"=>146446618 + }, + { + "names"=>[ + "mcr.microsoft.com/aks/hcp/hcp-tunnel-front:master.210427.1" + ], + "sizeBytes"=>136242776 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/calico/node:v3.8.9.5" + ], + "sizeBytes"=>101794833 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/kubernetes/ingress/nginx-ingress-controller:0.47.0" + ], + "sizeBytes"=>101445696 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/kubernetes/autoscaler/cluster-proportional-autoscaler:1.3.0_v0.0.5" + ], + "sizeBytes"=>101194562 + }, + { + "names"=>[ + "mcr.microsoft.com/aks/hcp/tunnel-openvpn:master.210623.2" + ], + "sizeBytes"=>96125176 + }, + { + "names"=>[ + "mcr.microsoft.com/aks/hcp/tunnel-openvpn:master.210524.1" + ], + "sizeBytes"=>95879501 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/kubernetes/exechealthz:1.2_v0.0.5" + ], + "sizeBytes"=>94348102 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/calico/node:v3.8.9.2" + ], + "sizeBytes"=>93537927 + }, + { + "names"=>[ + "mcr.microsoft.com/aks/acc/sgx-attestation:2.0" + ], + "sizeBytes"=>91841669 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/kubernetes-csi/azurefile-csi:v1.4.0" + ], + "sizeBytes"=>91324193 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/kubernetes-csi/azurefile-csi:v1.2.0" + ], + "sizeBytes"=>89103171 + }, + { + "names"=>[ + "mcr.microsoft.com/azure-application-gateway/kubernetes-ingress:1.0.1-rc3" + ], + "sizeBytes"=>86839805 + }, + { + "names"=>[ + "mcr.microsoft.com/azure-application-gateway/kubernetes-ingress:1.2.0" + ], + "sizeBytes"=>86488586 + }, + { + "names"=>[ + "mcr.microsoft.com/aks/hcp/tunnel-openvpn:master.210427.1" + ], + "sizeBytes"=>86120048 + }, + { + "names"=>[ + "mcr.microsoft.com/azure-application-gateway/kubernetes-ingress:1.3.0" + ], + "sizeBytes"=>81252495 + }, + { + "names"=>[ + "mcr.microsoft.com/azure-application-gateway/kubernetes-ingress:1.4.0" + ], + "sizeBytes"=>79586703 + }, + { + 
"names"=>[ + "mcr.microsoft.com/oss/kubernetes-csi/azuredisk-csi:v1.4.0" + ], + "sizeBytes"=>78795016 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/kubernetes-csi/azuredisk-csi:v1.2.0" + ], + "sizeBytes"=>76527179 + }, + { + "names"=>[ + "mcr.microsoft.com/containernetworking/azure-npm:v1.1.8" + ], + "sizeBytes"=>75025803 + }, + { + "names"=>[ + "mcr.microsoft.com/containernetworking/azure-npm:v1.2.2_hotfix" + ], + "sizeBytes"=>73533889 + }, + { + "names"=>[ + "mcr.microsoft.com/containernetworking/azure-npm:v1.3.1" + ], + "sizeBytes"=>72242894 + }, + { + "names"=>[ + "mcr.microsoft.com/containernetworking/azure-npm:v1.2.8" + ], + "sizeBytes"=>70622822 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/nvidia/k8s-device-plugin:v0.9.0" + ], + "sizeBytes"=>67291599 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/kubernetes/dashboard:v2.0.1" + ], + "sizeBytes"=>66415836 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/kubernetes/dashboard:v2.0.0-rc7" + ], + "sizeBytes"=>65965658 + }, + { + "names"=>[ + "mcr.microsoft.com/containernetworking/azure-npm:v1.2.1" + ], + "sizeBytes"=>64123775 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/calico/cni:v3.8.9.3" + ], + "sizeBytes"=>63581323 + }, + { + "names"=>[ + "mcr.microsoft.com/containernetworking/networkmonitor:v1.1.8" + ], + "sizeBytes"=>63154716 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/calico/cni:v3.8.9.2" + ], + "sizeBytes"=>61626312 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/calico/node:v3.18.1" + ], + "sizeBytes"=>60500885 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/calico/node:v3.17.2" + ], + "sizeBytes"=>58419768 + }, + { + "names"=>[ + "mcr.microsoft.com/containernetworking/networkmonitor:v1.1.8_hotfix", + "mcr.microsoft.com/containernetworking/networkmonitor:v1.1.8post2" + ], + "sizeBytes"=>56368756 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/kubernetes/kube-proxy@sha256:282543237a1aa3f407656290f454b7068a92e1abe2156082c750d5abfbcad90c", + "mcr.microsoft.com/oss/kubernetes/kube-proxy:v1.19.11-hotfix.20210526.2" + ], + "sizeBytes"=>56310724 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/calico/node:v3.19.0" + ], + "sizeBytes"=>55228749 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/kubernetes/kube-proxy:v1.19.11-hotfix.20210526.1" + ], + "sizeBytes"=>54692048 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/kubernetes/dashboard:v2.0.0-rc3" + ], + "sizeBytes"=>50803639 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/kubernetes-csi/secrets-store/driver:v0.0.19" + ], + "sizeBytes"=>49759361 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/azure/aad-pod-identity/nmi:v1.7.5" + ], + "sizeBytes"=>49704644 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/kubernetes-csi/secrets-store/driver:v0.0.21" + ], + "sizeBytes"=>49372390 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/kubernetes/kube-proxy@sha256:a64d3538b72905b07356881314755b02db3675ff47ee2bcc49dd7be856e285d5", + "mcr.microsoft.com/oss/kubernetes/kube-proxy:v1.19.11-hotfix.20210526" + ], + "sizeBytes"=>49322942 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/azure/aad-pod-identity/nmi:v1.7.4" + ], + "sizeBytes"=>48108311 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/kubernetes/kubernetes-dashboard:v1.10.1" + ], + "sizeBytes"=>44907744 + } + ], + "config"=>{} + } + } + ] +} \ No newline at end of file diff --git a/test/unit-tests/canned-api-responses/kube-nodes.txt b/test/unit-tests/canned-api-responses/kube-nodes.txt new file mode 100644 index 000000000..ed411c2e5 --- /dev/null +++ b/test/unit-tests/canned-api-responses/kube-nodes.txt @@ -0,0 +1,851 @@ +{ + "kind"=>"NodeList", + 
"apiVersion"=>"v1", + "metadata"=>{ + "selfLink"=>"/api/v1/nodes", + "resourceVersion"=>"5974879" + }, + "items"=>[ + { + "metadata"=>{ + "name"=>"aks-nodepool1-24816391-vmss000000", + "selfLink"=>"/api/v1/nodes/aks-nodepool1-24816391-vmss000000", + "uid"=>"fe073f0a-e6bf-4d68-b4e5-ffaa42b91528", + "resourceVersion"=>"5974522", + "creationTimestamp"=>"2021-07-21T23:40:14Z", + "labels"=>{ + "agentpool"=>"nodepool1", + "beta.kubernetes.io/arch"=>"amd64", + "beta.kubernetes.io/instance-type"=>"Standard_DS2_v2", + "beta.kubernetes.io/os"=>"linux", + "failure-domain.beta.kubernetes.io/region"=>"westus2", + "failure-domain.beta.kubernetes.io/zone"=>"0", + "kubernetes.azure.com/cluster"=>"MC_davidaks16_davidaks16_westus2", + "kubernetes.azure.com/mode"=>"system", + "kubernetes.azure.com/node-image-version"=>"AKSUbuntu-1804gen2containerd-2021.07.03", + "kubernetes.azure.com/os-sku"=>"Ubuntu", + "kubernetes.azure.com/role"=>"agent", + "kubernetes.io/arch"=>"amd64", + "kubernetes.io/hostname"=>"aks-nodepool1-24816391-vmss000000", + "kubernetes.io/os"=>"linux", + "kubernetes.io/role"=>"agent", + "node-role.kubernetes.io/agent"=>"", + "node.kubernetes.io/instance-type"=>"Standard_DS2_v2", + "storageprofile"=>"managed", + "storagetier"=>"Premium_LRS", + "topology.kubernetes.io/region"=>"westus2", + "topology.kubernetes.io/zone"=>"0" + }, + "annotations"=>{ + "node.alpha.kubernetes.io/ttl"=>"0", + "volumes.kubernetes.io/controller-managed-attach-detach"=>"true" + }, + "managedFields"=>[ + { + "manager"=>"kube-controller-manager", + "operation"=>"Update", + "apiVersion"=>"v1", + "time"=>"2021-07-21T23:40:20Z", + "fieldsType"=>"FieldsV1", + "fieldsV1"=>{ + "f:metadata"=>{ + "f:annotations"=>{ + "f:node.alpha.kubernetes.io/ttl"=>{} + } + } + } + }, + { + "manager"=>"kubelet", + "operation"=>"Update", + "apiVersion"=>"v1", + "time"=>"2021-07-21T23:40:24Z", + "fieldsType"=>"FieldsV1", + "fieldsV1"=>{ + "f:metadata"=>{ + "f:annotations"=>{ + "."=>{}, + "f:volumes.kubernetes.io/controller-managed-attach-detach"=>{} + }, + "f:labels"=>{ + "."=>{}, + "f:agentpool"=>{}, + "f:beta.kubernetes.io/arch"=>{}, + "f:beta.kubernetes.io/instance-type"=>{}, + "f:beta.kubernetes.io/os"=>{}, + "f:failure-domain.beta.kubernetes.io/region"=>{}, + "f:failure-domain.beta.kubernetes.io/zone"=>{}, + "f:kubernetes.azure.com/cluster"=>{}, + "f:kubernetes.azure.com/mode"=>{}, + "f:kubernetes.azure.com/node-image-version"=>{}, + "f:kubernetes.azure.com/os-sku"=>{}, + "f:kubernetes.azure.com/role"=>{}, + "f:kubernetes.io/arch"=>{}, + "f:kubernetes.io/hostname"=>{}, + "f:kubernetes.io/os"=>{}, + "f:node.kubernetes.io/instance-type"=>{}, + "f:storageprofile"=>{}, + "f:storagetier"=>{}, + "f:topology.kubernetes.io/region"=>{}, + "f:topology.kubernetes.io/zone"=>{} + } + }, + "f:spec"=>{ + "f:providerID"=>{} + }, + "f:status"=>{ + "f:addresses"=>{ + "."=>{}, + "k:{\"type\":\"Hostname\"}"=>{ + "."=>{}, + "f:address"=>{}, + "f:type"=>{} + }, + "k:{\"type\":\"InternalIP\"}"=>{ + "."=>{}, + "f:address"=>{}, + "f:type"=>{} + } + }, + "f:allocatable"=>{ + "."=>{}, + "f:attachable-volumes-azure-disk"=>{}, + "f:cpu"=>{}, + "f:ephemeral-storage"=>{}, + "f:hugepages-1Gi"=>{}, + "f:hugepages-2Mi"=>{}, + "f:memory"=>{}, + "f:pods"=>{} + }, + "f:capacity"=>{ + "."=>{}, + "f:attachable-volumes-azure-disk"=>{}, + "f:cpu"=>{}, + "f:ephemeral-storage"=>{}, + "f:hugepages-1Gi"=>{}, + "f:hugepages-2Mi"=>{}, + "f:memory"=>{}, + "f:pods"=>{} + }, + "f:conditions"=>{ + "."=>{}, + "k:{\"type\":\"DiskPressure\"}"=>{ + "."=>{}, + "f:lastHeartbeatTime"=>{}, + 
"f:lastTransitionTime"=>{}, + "f:message"=>{}, + "f:reason"=>{}, + "f:status"=>{}, + "f:type"=>{} + }, + "k:{\"type\":\"MemoryPressure\"}"=>{ + "."=>{}, + "f:lastHeartbeatTime"=>{}, + "f:lastTransitionTime"=>{}, + "f:message"=>{}, + "f:reason"=>{}, + "f:status"=>{}, + "f:type"=>{} + }, + "k:{\"type\":\"PIDPressure\"}"=>{ + "."=>{}, + "f:lastHeartbeatTime"=>{}, + "f:lastTransitionTime"=>{}, + "f:message"=>{}, + "f:reason"=>{}, + "f:status"=>{}, + "f:type"=>{} + }, + "k:{\"type\":\"Ready\"}"=>{ + "."=>{}, + "f:lastHeartbeatTime"=>{}, + "f:lastTransitionTime"=>{}, + "f:message"=>{}, + "f:reason"=>{}, + "f:status"=>{}, + "f:type"=>{} + } + }, + "f:config"=>{}, + "f:daemonEndpoints"=>{ + "f:kubeletEndpoint"=>{ + "f:Port"=>{} + } + }, + "f:images"=>{}, + "f:nodeInfo"=>{ + "f:architecture"=>{}, + "f:bootID"=>{}, + "f:containerRuntimeVersion"=>{}, + "f:kernelVersion"=>{}, + "f:kubeProxyVersion"=>{}, + "f:kubeletVersion"=>{}, + "f:machineID"=>{}, + "f:operatingSystem"=>{}, + "f:osImage"=>{}, + "f:systemUUID"=>{} + } + } + } + }, + { + "manager"=>"kubectl-label", + "operation"=>"Update", + "apiVersion"=>"v1", + "time"=>"2021-07-21T23:40:53Z", + "fieldsType"=>"FieldsV1", + "fieldsV1"=>{ + "f:metadata"=>{ + "f:labels"=>{ + "f:kubernetes.io/role"=>{}, + "f:node-role.kubernetes.io/agent"=>{} + } + } + } + }, + { + "manager"=>"node-problem-detector", + "operation"=>"Update", + "apiVersion"=>"v1", + "time"=>"2021-08-10T18:10:02Z", + "fieldsType"=>"FieldsV1", + "fieldsV1"=>{ + "f:status"=>{ + "f:conditions"=>{ + "k:{\"type\":\"ContainerRuntimeProblem\"}"=>{ + "."=>{}, + "f:lastHeartbeatTime"=>{}, + "f:lastTransitionTime"=>{}, + "f:message"=>{}, + "f:reason"=>{}, + "f:status"=>{}, + "f:type"=>{} + }, + "k:{\"type\":\"FilesystemCorruptionProblem\"}"=>{ + "."=>{}, + "f:lastHeartbeatTime"=>{}, + "f:lastTransitionTime"=>{}, + "f:message"=>{}, + "f:reason"=>{}, + "f:status"=>{}, + "f:type"=>{} + }, + "k:{\"type\":\"FreezeScheduled\"}"=>{ + "."=>{}, + "f:lastHeartbeatTime"=>{}, + "f:lastTransitionTime"=>{}, + "f:message"=>{}, + "f:reason"=>{}, + "f:status"=>{}, + "f:type"=>{} + }, + "k:{\"type\":\"FrequentContainerdRestart\"}"=>{ + "."=>{}, + "f:lastHeartbeatTime"=>{}, + "f:lastTransitionTime"=>{}, + "f:message"=>{}, + "f:reason"=>{}, + "f:status"=>{}, + "f:type"=>{} + }, + "k:{\"type\":\"FrequentDockerRestart\"}"=>{ + "."=>{}, + "f:lastHeartbeatTime"=>{}, + "f:lastTransitionTime"=>{}, + "f:message"=>{}, + "f:reason"=>{}, + "f:status"=>{}, + "f:type"=>{} + }, + "k:{\"type\":\"FrequentKubeletRestart\"}"=>{ + "."=>{}, + "f:lastHeartbeatTime"=>{}, + "f:lastTransitionTime"=>{}, + "f:message"=>{}, + "f:reason"=>{}, + "f:status"=>{}, + "f:type"=>{} + }, + "k:{\"type\":\"FrequentUnregisterNetDevice\"}"=>{ + "."=>{}, + "f:lastHeartbeatTime"=>{}, + "f:lastTransitionTime"=>{}, + "f:message"=>{}, + "f:reason"=>{}, + "f:status"=>{}, + "f:type"=>{} + }, + "k:{\"type\":\"KernelDeadlock\"}"=>{ + "."=>{}, + "f:lastHeartbeatTime"=>{}, + "f:lastTransitionTime"=>{}, + "f:message"=>{}, + "f:reason"=>{}, + "f:status"=>{}, + "f:type"=>{} + }, + "k:{\"type\":\"KubeletProblem\"}"=>{ + "."=>{}, + "f:lastHeartbeatTime"=>{}, + "f:lastTransitionTime"=>{}, + "f:message"=>{}, + "f:reason"=>{}, + "f:status"=>{}, + "f:type"=>{} + }, + "k:{\"type\":\"PreemptScheduled\"}"=>{ + "."=>{}, + "f:lastHeartbeatTime"=>{}, + "f:lastTransitionTime"=>{}, + "f:message"=>{}, + "f:reason"=>{}, + "f:status"=>{}, + "f:type"=>{} + }, + "k:{\"type\":\"ReadonlyFilesystem\"}"=>{ + "."=>{}, + "f:lastHeartbeatTime"=>{}, + "f:lastTransitionTime"=>{}, + "f:message"=>{}, 
+ "f:reason"=>{}, + "f:status"=>{}, + "f:type"=>{} + }, + "k:{\"type\":\"RebootScheduled\"}"=>{ + "."=>{}, + "f:lastHeartbeatTime"=>{}, + "f:lastTransitionTime"=>{}, + "f:message"=>{}, + "f:reason"=>{}, + "f:status"=>{}, + "f:type"=>{} + }, + "k:{\"type\":\"RedeployScheduled\"}"=>{ + "."=>{}, + "f:lastHeartbeatTime"=>{}, + "f:lastTransitionTime"=>{}, + "f:message"=>{}, + "f:reason"=>{}, + "f:status"=>{}, + "f:type"=>{} + }, + "k:{\"type\":\"TerminateScheduled\"}"=>{ + "."=>{}, + "f:lastHeartbeatTime"=>{}, + "f:lastTransitionTime"=>{}, + "f:message"=>{}, + "f:reason"=>{}, + "f:status"=>{}, + "f:type"=>{} + } + } + } + } + } + ] + }, + "spec"=>{ + "providerID"=>"azure:///subscriptions/3b875bf3-0eec-4d8c-bdee-25c7ccc1f130/resourceGroups/mc_davidaks16_davidaks16_westus2/providers/Microsoft.Compute/virtualMachineScaleSets/aks-nodepool1-24816391-vmss/virtualMachines/0" + }, + "status"=>{ + "capacity"=>{ + "attachable-volumes-azure-disk"=>"8", + "cpu"=>"2", + "ephemeral-storage"=>"129900528Ki", + "hugepages-1Gi"=>"0", + "hugepages-2Mi"=>"0", + "memory"=>"7120616Ki", + "pods"=>"30" + }, + "allocatable"=>{ + "attachable-volumes-azure-disk"=>"8", + "cpu"=>"1900m", + "ephemeral-storage"=>"119716326407", + "hugepages-1Gi"=>"0", + "hugepages-2Mi"=>"0", + "memory"=>"4675304Ki", + "pods"=>"30" + }, + "conditions"=>[ + { + "type"=>"FrequentDockerRestart", + "status"=>"False", + "lastHeartbeatTime"=>"2021-08-17T19:25:56Z", + "lastTransitionTime"=>"2021-08-10T18:10:01Z", + "reason"=>"NoFrequentDockerRestart", + "message"=>"docker is functioning properly" + }, + { + "type"=>"FilesystemCorruptionProblem", + "status"=>"False", + "lastHeartbeatTime"=>"2021-08-17T19:25:56Z", + "lastTransitionTime"=>"2021-08-10T18:10:01Z", + "reason"=>"FilesystemIsOK", + "message"=>"Filesystem is healthy" + }, + { + "type"=>"KernelDeadlock", + "status"=>"False", + "lastHeartbeatTime"=>"2021-08-17T19:25:56Z", + "lastTransitionTime"=>"2021-08-10T18:10:01Z", + "reason"=>"KernelHasNoDeadlock", + "message"=>"kernel has no deadlock" + }, + { + "type"=>"FrequentContainerdRestart", + "status"=>"False", + "lastHeartbeatTime"=>"2021-08-17T19:25:56Z", + "lastTransitionTime"=>"2021-08-10T18:10:01Z", + "reason"=>"NoFrequentContainerdRestart", + "message"=>"containerd is functioning properly" + }, + { + "type"=>"FreezeScheduled", + "status"=>"False", + "lastHeartbeatTime"=>"2021-08-17T19:25:56Z", + "lastTransitionTime"=>"2021-08-11T23:25:04Z", + "reason"=>"NoFreezeScheduled", + "message"=>"VM has no scheduled Freeze event" + }, + { + "type"=>"FrequentUnregisterNetDevice", + "status"=>"False", + "lastHeartbeatTime"=>"2021-08-17T19:25:56Z", + "lastTransitionTime"=>"2021-08-10T18:10:01Z", + "reason"=>"NoFrequentUnregisterNetDevice", + "message"=>"node is functioning properly" + }, + { + "type"=>"TerminateScheduled", + "status"=>"False", + "lastHeartbeatTime"=>"2021-08-17T19:25:56Z", + "lastTransitionTime"=>"2021-08-10T18:10:01Z", + "reason"=>"NoTerminateScheduled", + "message"=>"VM has no scheduled Terminate event" + }, + { + "type"=>"ReadonlyFilesystem", + "status"=>"False", + "lastHeartbeatTime"=>"2021-08-17T19:25:56Z", + "lastTransitionTime"=>"2021-08-10T18:10:01Z", + "reason"=>"FilesystemIsNotReadOnly", + "message"=>"Filesystem is not read-only" + }, + { + "type"=>"RedeployScheduled", + "status"=>"False", + "lastHeartbeatTime"=>"2021-08-17T19:25:56Z", + "lastTransitionTime"=>"2021-08-10T18:10:01Z", + "reason"=>"NoRedeployScheduled", + "message"=>"VM has no scheduled Redeploy event" + }, + { + "type"=>"KubeletProblem", + "status"=>"False", + 
"lastHeartbeatTime"=>"2021-08-17T19:25:56Z", + "lastTransitionTime"=>"2021-08-10T18:10:01Z", + "reason"=>"KubeletIsUp", + "message"=>"kubelet service is up" + }, + { + "type"=>"PreemptScheduled", + "status"=>"False", + "lastHeartbeatTime"=>"2021-08-17T19:25:56Z", + "lastTransitionTime"=>"2021-08-10T18:11:11Z", + "reason"=>"NoPreemptScheduled", + "message"=>"VM has no scheduled Preempt event" + }, + { + "type"=>"RebootScheduled", + "status"=>"False", + "lastHeartbeatTime"=>"2021-08-17T19:25:56Z", + "lastTransitionTime"=>"2021-08-10T18:10:01Z", + "reason"=>"NoRebootScheduled", + "message"=>"VM has no scheduled Reboot event" + }, + { + "type"=>"ContainerRuntimeProblem", + "status"=>"False", + "lastHeartbeatTime"=>"2021-08-17T19:25:56Z", + "lastTransitionTime"=>"2021-08-10T18:10:01Z", + "reason"=>"ContainerRuntimeIsUp", + "message"=>"container runtime service is up" + }, + { + "type"=>"FrequentKubeletRestart", + "status"=>"False", + "lastHeartbeatTime"=>"2021-08-17T19:25:56Z", + "lastTransitionTime"=>"2021-08-10T18:10:01Z", + "reason"=>"NoFrequentKubeletRestart", + "message"=>"kubelet is functioning properly" + }, + { + "type"=>"MemoryPressure", + "status"=>"False", + "lastHeartbeatTime"=>"2021-08-17T19:28:21Z", + "lastTransitionTime"=>"2021-07-21T23:40:14Z", + "reason"=>"KubeletHasSufficientMemory", + "message"=>"kubelet has sufficient memory available" + }, + { + "type"=>"DiskPressure", + "status"=>"False", + "lastHeartbeatTime"=>"2021-08-17T19:28:21Z", + "lastTransitionTime"=>"2021-07-21T23:40:14Z", + "reason"=>"KubeletHasNoDiskPressure", + "message"=>"kubelet has no disk pressure" + }, + { + "type"=>"PIDPressure", + "status"=>"False", + "lastHeartbeatTime"=>"2021-08-17T19:28:21Z", + "lastTransitionTime"=>"2021-07-21T23:40:14Z", + "reason"=>"KubeletHasSufficientPID", + "message"=>"kubelet has sufficient PID available" + }, + { + "type"=>"Ready", + "status"=>"True", + "lastHeartbeatTime"=>"2021-08-17T19:28:21Z", + "lastTransitionTime"=>"2021-07-21T23:40:24Z", + "reason"=>"KubeletReady", + "message"=>"kubelet is posting ready status. 
AppArmor enabled" + } + ], + "addresses"=>[ + { + "type"=>"Hostname", + "address"=>"aks-nodepool1-24816391-vmss000000" + }, + { + "type"=>"InternalIP", + "address"=>"10.240.0.4" + } + ], + "daemonEndpoints"=>{ + "kubeletEndpoint"=>{ + "Port"=>10250 + } + }, + "nodeInfo"=>{ + "machineID"=>"17a654260e2c4a9bb3a3eb4b4188e4b4", + "systemUUID"=>"7ff599e4-909e-4950-a044-ff8613af3af9", + "bootID"=>"02bb865b-a469-43cd-8b0b-5ceb4ecd80b0", + "kernelVersion"=>"5.4.0-1051-azure", + "osImage"=>"Ubuntu 18.04.5 LTS", + "containerRuntimeVersion"=>"containerd://1.4.4+azure", + "kubeletVersion"=>"v1.19.11", + "kubeProxyVersion"=>"v1.19.11", + "operatingSystem"=>"linux", + "architecture"=>"amd64" + }, + "images"=>[ + { + "names"=>[ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod06112021-1" + ], + "sizeBytes"=>331689060 + }, + { + "names"=>[ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod06112021" + ], + "sizeBytes"=>330099815 + }, + { + "names"=>[ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod05202021-hotfix" + ], + "sizeBytes"=>271471426 + }, + { + "names"=>[ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod05202021" + ], + "sizeBytes"=>269703297 + }, + { + "names"=>[ + "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod03262021" + ], + "sizeBytes"=>264732875 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/kubernetes/ingress/nginx-ingress-controller:0.19.0" + ], + "sizeBytes"=>166352383 + }, + { + "names"=>[ + "mcr.microsoft.com/aks/hcp/hcp-tunnel-front:master.210623.2" + ], + "sizeBytes"=>147750148 + }, + { + "names"=>[ + "mcr.microsoft.com/aks/hcp/hcp-tunnel-front:master.210524.1" + ], + "sizeBytes"=>146446618 + }, + { + "names"=>[ + "mcr.microsoft.com/aks/hcp/hcp-tunnel-front:master.210427.1" + ], + "sizeBytes"=>136242776 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/calico/node:v3.8.9.5" + ], + "sizeBytes"=>101794833 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/kubernetes/ingress/nginx-ingress-controller:0.47.0" + ], + "sizeBytes"=>101445696 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/kubernetes/autoscaler/cluster-proportional-autoscaler:1.3.0_v0.0.5" + ], + "sizeBytes"=>101194562 + }, + { + "names"=>[ + "mcr.microsoft.com/aks/hcp/tunnel-openvpn:master.210623.2" + ], + "sizeBytes"=>96125176 + }, + { + "names"=>[ + "mcr.microsoft.com/aks/hcp/tunnel-openvpn:master.210524.1" + ], + "sizeBytes"=>95879501 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/kubernetes/exechealthz:1.2_v0.0.5" + ], + "sizeBytes"=>94348102 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/calico/node:v3.8.9.2" + ], + "sizeBytes"=>93537927 + }, + { + "names"=>[ + "mcr.microsoft.com/aks/acc/sgx-attestation:2.0" + ], + "sizeBytes"=>91841669 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/kubernetes-csi/azurefile-csi:v1.4.0" + ], + "sizeBytes"=>91324193 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/kubernetes-csi/azurefile-csi:v1.2.0" + ], + "sizeBytes"=>89103171 + }, + { + "names"=>[ + "mcr.microsoft.com/azure-application-gateway/kubernetes-ingress:1.0.1-rc3" + ], + "sizeBytes"=>86839805 + }, + { + "names"=>[ + "mcr.microsoft.com/azure-application-gateway/kubernetes-ingress:1.2.0" + ], + "sizeBytes"=>86488586 + }, + { + "names"=>[ + "mcr.microsoft.com/aks/hcp/tunnel-openvpn:master.210427.1" + ], + "sizeBytes"=>86120048 + }, + { + "names"=>[ + "mcr.microsoft.com/azure-application-gateway/kubernetes-ingress:1.3.0" + ], + "sizeBytes"=>81252495 + }, + { + "names"=>[ + "mcr.microsoft.com/azure-application-gateway/kubernetes-ingress:1.4.0" + ], + 
"sizeBytes"=>79586703 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/kubernetes-csi/azuredisk-csi:v1.4.0" + ], + "sizeBytes"=>78795016 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/kubernetes-csi/azuredisk-csi:v1.2.0" + ], + "sizeBytes"=>76527179 + }, + { + "names"=>[ + "mcr.microsoft.com/containernetworking/azure-npm:v1.1.8" + ], + "sizeBytes"=>75025803 + }, + { + "names"=>[ + "mcr.microsoft.com/containernetworking/azure-npm:v1.2.2_hotfix" + ], + "sizeBytes"=>73533889 + }, + { + "names"=>[ + "mcr.microsoft.com/containernetworking/azure-npm:v1.3.1" + ], + "sizeBytes"=>72242894 + }, + { + "names"=>[ + "mcr.microsoft.com/containernetworking/azure-npm:v1.2.8" + ], + "sizeBytes"=>70622822 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/nvidia/k8s-device-plugin:v0.9.0" + ], + "sizeBytes"=>67291599 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/kubernetes/dashboard:v2.0.1" + ], + "sizeBytes"=>66415836 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/kubernetes/dashboard:v2.0.0-rc7" + ], + "sizeBytes"=>65965658 + }, + { + "names"=>[ + "mcr.microsoft.com/containernetworking/azure-npm:v1.2.1" + ], + "sizeBytes"=>64123775 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/calico/cni:v3.8.9.3" + ], + "sizeBytes"=>63581323 + }, + { + "names"=>[ + "mcr.microsoft.com/containernetworking/networkmonitor:v1.1.8" + ], + "sizeBytes"=>63154716 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/calico/cni:v3.8.9.2" + ], + "sizeBytes"=>61626312 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/calico/node:v3.18.1" + ], + "sizeBytes"=>60500885 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/calico/node:v3.17.2" + ], + "sizeBytes"=>58419768 + }, + { + "names"=>[ + "mcr.microsoft.com/containernetworking/networkmonitor:v1.1.8_hotfix", + "mcr.microsoft.com/containernetworking/networkmonitor:v1.1.8post2" + ], + "sizeBytes"=>56368756 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/kubernetes/kube-proxy@sha256:282543237a1aa3f407656290f454b7068a92e1abe2156082c750d5abfbcad90c", + "mcr.microsoft.com/oss/kubernetes/kube-proxy:v1.19.11-hotfix.20210526.2" + ], + "sizeBytes"=>56310724 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/calico/node:v3.19.0" + ], + "sizeBytes"=>55228749 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/kubernetes/kube-proxy:v1.19.11-hotfix.20210526.1" + ], + "sizeBytes"=>54692048 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/kubernetes/dashboard:v2.0.0-rc3" + ], + "sizeBytes"=>50803639 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/kubernetes-csi/secrets-store/driver:v0.0.19" + ], + "sizeBytes"=>49759361 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/azure/aad-pod-identity/nmi:v1.7.5" + ], + "sizeBytes"=>49704644 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/kubernetes-csi/secrets-store/driver:v0.0.21" + ], + "sizeBytes"=>49372390 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/kubernetes/kube-proxy@sha256:a64d3538b72905b07356881314755b02db3675ff47ee2bcc49dd7be856e285d5", + "mcr.microsoft.com/oss/kubernetes/kube-proxy:v1.19.11-hotfix.20210526" + ], + "sizeBytes"=>49322942 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/azure/aad-pod-identity/nmi:v1.7.4" + ], + "sizeBytes"=>48108311 + }, + { + "names"=>[ + "mcr.microsoft.com/oss/kubernetes/kubernetes-dashboard:v1.10.1" + ], + "sizeBytes"=>44907744 + } + ], + "config"=>{} + } + } + ] +} \ No newline at end of file diff --git a/test/unit-tests/run_go_tests.sh b/test/unit-tests/run_go_tests.sh new file mode 100755 index 000000000..7036531fd --- /dev/null +++ b/test/unit-tests/run_go_tests.sh @@ -0,0 +1,12 @@ +set -e + +OLD_PATH=$(pwd) +SCRIPTPATH="$( cd -- "$(dirname 
"$0")" >/dev/null 2>&1 ; pwd -P )" +cd $SCRIPTPATH/../../source/plugins/go/src +echo "# Runnign go generate" +go generate + +echo "# Running go test ." +go test . + +cd $OLD_PATH diff --git a/test/unit-tests/run_ruby_tests.sh b/test/unit-tests/run_ruby_tests.sh new file mode 100755 index 000000000..824346eee --- /dev/null +++ b/test/unit-tests/run_ruby_tests.sh @@ -0,0 +1,13 @@ +# this script will exit with an error if any commands exit with an error +set -e + +# NOTE: to run a specific test (instead of all) use the following arguments: --name test_name +# ex: run_ruby_tests.sh --name test_basic_single_node + +OLD_PATH=$(pwd) +SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" +# cd $SCRIPTPATH/../../source/plugins/ruby +echo "# Running ruby $SCRIPTPATH/test_driver.rb $1 $2" +ruby $SCRIPTPATH/test_driver.rb $1 $2 + +cd $OLD_PATH diff --git a/test/unit-tests/test_driver.rb b/test/unit-tests/test_driver.rb new file mode 100644 index 000000000..32687cc99 --- /dev/null +++ b/test/unit-tests/test_driver.rb @@ -0,0 +1,13 @@ +$in_unit_test = true + +script_path = __dir__ +# go to the base directory of the repository +Dir.chdir(File.join(__dir__, "../..")) + +Dir.glob(File.join(script_path, "../../source/plugins/ruby/*_test.rb")) do |filename| + require_relative filename +end + +Dir.glob(File.join(script_path, "../../build/linux/installer/scripts/*_test.rb")) do |filename| + require_relative filename +end From 32f958b9db2820f662a399712422f8f519762365 Mon Sep 17 00:00:00 2001 From: David Michelman Date: Fri, 27 Aug 2021 11:56:27 -0700 Subject: [PATCH 146/194] run unit tests after a merge too (#634) --- .github/workflows/run_unit_tests.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/run_unit_tests.yml b/.github/workflows/run_unit_tests.yml index 29f5afc7a..94ac4371a 100644 --- a/.github/workflows/run_unit_tests.yml +++ b/.github/workflows/run_unit_tests.yml @@ -5,6 +5,10 @@ on: branches: - ci_dev - ci_prod + push: + branches: + - ci_dev + - ci_prod jobs: Golang-Tests: runs-on: ubuntu-latest From c4a3bbc76b0241d09257b6e6fe61a5cd4df58058 Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Mon, 30 Aug 2021 21:42:31 -0700 Subject: [PATCH 147/194] flag stale PRs & issues --- .github/workflows/stale.yml | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 .github/workflows/stale.yml diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml new file mode 100644 index 000000000..1d91df09d --- /dev/null +++ b/.github/workflows/stale.yml @@ -0,0 +1,28 @@ +name: Mark stale issues and pull requests + +on: + schedule: + - cron: "30 10 * * *" + +jobs: + stale: + + runs-on: ubuntu-latest + permissions: + issues: write + pull-requests: write + + steps: + - uses: actions/stale@v3 + with: + repo-token: ${{ secrets.GITHUB_TOKEN }} + days-before-issue-stale: 7 + days-before-pr-stale: 7 + stale-issue-message: 'This issue is stale because it has been open 7 days with no activity. Remove stale label or comment or this will be closed in 5 days.' + stale-pr-message: 'This PR is stale because it has been open 7 days with no activity. Remove stale label or comment or this will be closed in 5 days.' + close-issue-message: 'This issue was closed because it has been stalled for 12 days with no activity.' + close-pr-message: 'This PR was closed because it has been stalled for 12 days with no activity.' 
+        days-before-issue-close: 5
+        days-before-pr-close: 5
+        stale-issue-label: 'no-issue-activity'
+        stale-pr-label: 'no-pr-activity'

From beb7f424acfcd79e53bf7c58d951c642c0f949be Mon Sep 17 00:00:00 2001
From: David Michelman
Date: Tue, 31 Aug 2021 17:55:55 -0700
Subject: [PATCH 148/194] Adding script to collect logs (for troubleshooting) (#636)

* added script for collecting logs

* added windows daemonset and prometheus sidecar, as well as some explanatory prints

* added kubectl describe and kubectl logs output

* changed message to make it clearer that some errors are expected
---
 scripts/troubleshoot/collect_logs.sh | 54 ++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)
 create mode 100755 scripts/troubleshoot/collect_logs.sh

diff --git a/scripts/troubleshoot/collect_logs.sh b/scripts/troubleshoot/collect_logs.sh
new file mode 100755
index 000000000..99a9ad302
--- /dev/null
+++ b/scripts/troubleshoot/collect_logs.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+
+# This script pulls logs from the replicaset agent pod and a random daemonset pod, to make troubleshooting faster
+
+CYAN='\033[0;36m'
+NC='\033[0m' # No Color
+
+mkdir azure-monitor-logs-tmp
+cd azure-monitor-logs-tmp
+
+export ds_pod=$(kubectl get pods -n kube-system -o custom-columns=NAME:.metadata.name | grep -E omsagent-[a-z0-9]{5} | head -n 1)
+export ds_win_pod=$(kubectl get pods -n kube-system -o custom-columns=NAME:.metadata.name | grep -E omsagent-win-[a-z0-9]{5} | head -n 1)
+export rs_pod=$(kubectl get pods -n kube-system -o custom-columns=NAME:.metadata.name | grep -E omsagent-rs-[a-z0-9]{5} | head -n 1)
+
+echo -e "Collecting logs from ${ds_pod}, ${ds_win_pod}, and ${rs_pod}"
+echo -e "${CYAN}Note: some errors about pods and files not existing are expected in clusters without windows nodes or sidecar prometheus scraping. They can safely be disregarded ${NC}"
+
+# grab `kubectl describe` and `kubectl logs` output
+echo "collecting kubectl describe and kubectl logs output"
+
+kubectl describe pod ${ds_pod} --namespace=kube-system > describe_${ds_pod}.txt
+kubectl logs ${ds_pod} --container omsagent --namespace=kube-system > logs_${ds_pod}.txt
+kubectl logs ${ds_pod} --container omsagent-prometheus --namespace=kube-system > logs_${ds_pod}_prom.txt
+
+kubectl describe pod ${ds_win_pod} --namespace=kube-system > describe_${ds_win_pod}.txt
+kubectl logs ${ds_win_pod} --container omsagent-win --namespace=kube-system > logs_${ds_win_pod}.txt
+
+kubectl describe pod ${rs_pod} --namespace=kube-system > describe_${rs_pod}.txt
+kubectl logs ${rs_pod} --container omsagent --namespace=kube-system > logs_${rs_pod}.txt
+
+
+# now collect log files from inside the containers
+echo "Collecting log files from inside agent containers"
+
+kubectl cp ${ds_pod}:/var/opt/microsoft/docker-cimprov/log omsagent-daemonset --namespace=kube-system --container omsagent
+kubectl cp ${ds_pod}:/var/opt/microsoft/linuxmonagent/log omsagent-daemonset-mdsd --namespace=kube-system --container omsagent
+
+kubectl cp ${ds_pod}:/var/opt/microsoft/docker-cimprov/log omsagent-prom-daemonset --namespace=kube-system --container omsagent-prometheus
+kubectl cp ${ds_pod}:/var/opt/microsoft/linuxmonagent/log omsagent-prom-daemonset-mdsd --namespace=kube-system --container omsagent-prometheus
+
+# for some reason copying logs out of /etc/omsagentwindows doesn't work (gives a permission error), but exec then cat does work.
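+# A rough sketch of that exec-based workaround (untested here; <logfile> is a placeholder name):
+#   kubectl exec ${ds_win_pod} --namespace=kube-system -- cat /etc/omsagentwindows/<logfile> > <logfile>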
+# skip collecting these logs for now, would be good to come back and fix this next time a windows support case comes up +# kubectl cp ${ds_win_pod}:/etc/omsagentwindows omsagent-win-daemonset --namespace=kube-system +kubectl cp ${ds_win_pod}:/etc/fluent-bit omsagent-win-daemonset-fbit --namespace=kube-system + +kubectl cp ${rs_pod}:/var/opt/microsoft/docker-cimprov/log omsagent-replicaset --namespace=kube-system +kubectl cp ${rs_pod}:/var/opt/microsoft/linuxmonagent/log omsagent-replicaset-mdsd --namespace=kube-system + +zip -r -q ../azure-monitor-logs.zip * + +cd .. +rm -rf azure-monitor-logs-tmp +echo +echo "log files have been written to azure-monitor-logs.zip" From 01e8178925b3fa284952f49014e42ec3e5abb1bd Mon Sep 17 00:00:00 2001 From: sarahpeiffer <46665092+sarahpeiffer@users.noreply.github.com> Date: Fri, 10 Sep 2021 17:04:27 -0700 Subject: [PATCH 149/194] Sarah/ev2 (#640) * ev2 artifacts for release pipeline * update parameters reference * add artifacts tar file * changes to rollout and service model * change agentimage path * adding agentimage to artifact script * removing charts from tarball * change script to use blob storage * change blob variables * echo variables * change blob uri * use release id for blob prefix * change to delete blob file * add check for if blob storage file exists * fix script errors * update check for file in storage * change true check * comments and change storage account info to pipeline variables * Changes for windows tar file * PR changes --- .pipelines/build-linux.sh | 6 + .../ContainerInsights.Linux.Parameters.json | 84 +++++++++++++ .../ContainerInsights.Windows.Parameters.json | 84 +++++++++++++ .../RolloutSpecs/RolloutSpecs.json | 36 ++++++ .../ScopeBindings/Public.ScopeBindings.json | 48 ++++++++ .../Scripts/pushAgentToAcr.sh | 110 ++++++++++++++++++ .../ServiceModels/Public.ServiceModel.json | 56 +++++++++ .../ServiceGroupRoot/buildver.txt | 1 + 8 files changed, 425 insertions(+) create mode 100644 deployment/agent-deployment/ServiceGroupRoot/Parameters/ContainerInsights.Linux.Parameters.json create mode 100644 deployment/agent-deployment/ServiceGroupRoot/Parameters/ContainerInsights.Windows.Parameters.json create mode 100644 deployment/agent-deployment/ServiceGroupRoot/RolloutSpecs/RolloutSpecs.json create mode 100644 deployment/agent-deployment/ServiceGroupRoot/ScopeBindings/Public.ScopeBindings.json create mode 100644 deployment/agent-deployment/ServiceGroupRoot/Scripts/pushAgentToAcr.sh create mode 100644 deployment/agent-deployment/ServiceGroupRoot/ServiceModels/Public.ServiceModel.json create mode 100644 deployment/agent-deployment/ServiceGroupRoot/buildver.txt diff --git a/.pipelines/build-linux.sh b/.pipelines/build-linux.sh index 53f6a3a07..8dbf57fdc 100644 --- a/.pipelines/build-linux.sh +++ b/.pipelines/build-linux.sh @@ -15,7 +15,13 @@ echo "----------- Build Docker Provider -------------------------------" make cd $DIR +echo "------------ Bundle Shell Extension Scripts for Agent Release -------------------------" +cd $DIR/../deployment/agent-deployment/ServiceGroupRoot/Scripts +tar -czvf ../artifacts.tar.gz pushAgentToAcr.sh +cd $DIR + echo "------------ Bundle Shell Extension Scripts & HELM chart -------------------------" cd $DIR/../deployment/arc-k8s-extension/ServiceGroupRoot/Scripts tar -czvf ../artifacts.tar.gz ../../../../charts/azuremonitor-containers/ pushChartToAcr.sh + diff --git a/deployment/agent-deployment/ServiceGroupRoot/Parameters/ContainerInsights.Linux.Parameters.json 
b/deployment/agent-deployment/ServiceGroupRoot/Parameters/ContainerInsights.Linux.Parameters.json new file mode 100644 index 000000000..598ce9698 --- /dev/null +++ b/deployment/agent-deployment/ServiceGroupRoot/Parameters/ContainerInsights.Linux.Parameters.json @@ -0,0 +1,84 @@ +{ + "$schema": "http://schema.express.azure.com/schemas/2015-01-01-alpha/RolloutParameters.json", + "contentVersion": "1.0.0.0", + "wait": [ + { + "name": "waitSdpBakeTime", + "properties": { + "duration": "PT24H" + } + } + ], + "shellExtensions": [ + { + "name": "PushAgentToACR", + "type": "ShellExtensionType", + "properties": { + "maxexecutiontime": "PT1H" + }, + "package": { + "reference": { + "path": "artifacts.tar.gz" + } + }, + "launch": { + "command": [ + "/bin/bash", + "pushAgentToAcr.sh" + ], + "environmentVariables": [ + { + "name": "WINDOWS", + "value": "" + }, + { + "name": "AGENT_IMAGE_URI", + "value": "__CONTAINER_URI__" + }, + { + "name": "AGENT_IMAGE_SAS", + "value": "__CONTAINER_SAS_TOKEN__" + }, + { + "name": "STORAGE_CONTAINER_NAME", + "value": "__STORAGE_CONTAINER_NAME__" + }, + { + "name": "STORAGE_ACCOUNT_NAME", + "value": "__STORAGE_ACCOUNT_NAME__" + }, + { + "name": "AGENT_IMAGE_TAR_FILE_NAME", + "value": "agentimage.tar.gz" + }, + { + "name": "RELEASE_ID", + "value": "__RELEASE_ID__" + }, + { + "name": "ACR_NAME", + "value": "__ACR_NAME__" + }, + { + "name": "AGENT_RELEASE", + "value": "__AGENT_RELEASE__" + }, + { + "name": "AGENT_IMAGE_TAG_SUFFIX", + "value": "__AGENT_IMAGE_TAG_SUFFIX__" + }, + { + "name": "AGENT_IMAGE_FULL_PATH", + "value": "__ACR_NAME__/public/azuremonitor/containerinsights/__AGENT_RELEASE__:__AGENT_RELEASE____AGENT_IMAGE_TAG_SUFFIX__" + } + ], + "identity": { + "type": "userAssigned", + "userAssignedIdentities": [ + "__MANAGED_IDENTITY__" + ] + } + } + } + ] + } \ No newline at end of file diff --git a/deployment/agent-deployment/ServiceGroupRoot/Parameters/ContainerInsights.Windows.Parameters.json b/deployment/agent-deployment/ServiceGroupRoot/Parameters/ContainerInsights.Windows.Parameters.json new file mode 100644 index 000000000..9d208e0c6 --- /dev/null +++ b/deployment/agent-deployment/ServiceGroupRoot/Parameters/ContainerInsights.Windows.Parameters.json @@ -0,0 +1,84 @@ +{ + "$schema": "http://schema.express.azure.com/schemas/2015-01-01-alpha/RolloutParameters.json", + "contentVersion": "1.0.0.0", + "wait": [ + { + "name": "waitSdpBakeTime", + "properties": { + "duration": "PT24H" + } + } + ], + "shellExtensions": [ + { + "name": "PushAgentToACR", + "type": "ShellExtensionType", + "properties": { + "maxexecutiontime": "PT1H" + }, + "package": { + "reference": { + "path": "artifacts.tar.gz" + } + }, + "launch": { + "command": [ + "/bin/bash", + "pushAgentToAcr.sh" + ], + "environmentVariables": [ + { + "name": "WINDOWS", + "value": "win-" + }, + { + "name": "AGENT_IMAGE_URI", + "value": "__CONTAINER_URI__" + }, + { + "name": "AGENT_IMAGE_SAS", + "value": "__CONTAINER_SAS_TOKEN__" + }, + { + "name": "STORAGE_CONTAINER_NAME", + "value": "__STORAGE_CONTAINER_NAME__" + }, + { + "name": "STORAGE_ACCOUNT_NAME", + "value": "__STORAGE_ACCOUNT_NAME__" + }, + { + "name": "AGENT_IMAGE_TAR_FILE_NAME", + "value": "agentimage.tar.zip" + }, + { + "name": "RELEASE_ID", + "value": "__RELEASE_ID__" + }, + { + "name": "ACR_NAME", + "value": "__ACR_NAME__" + }, + { + "name": "AGENT_RELEASE", + "value": "__AGENT_RELEASE__" + }, + { + "name": "AGENT_IMAGE_TAG_SUFFIX", + "value": "__AGENT_IMAGE_TAG_SUFFIX__" + }, + { + "name": "AGENT_IMAGE_FULL_PATH", + "value": 
"__ACR_NAME__/public/azuremonitor/containerinsights/__AGENT_RELEASE__:win-__AGENT_RELEASE____AGENT_IMAGE_TAG_SUFFIX__" + } + ], + "identity": { + "type": "userAssigned", + "userAssignedIdentities": [ + "__MANAGED_IDENTITY__" + ] + } + } + } + ] + } \ No newline at end of file diff --git a/deployment/agent-deployment/ServiceGroupRoot/RolloutSpecs/RolloutSpecs.json b/deployment/agent-deployment/ServiceGroupRoot/RolloutSpecs/RolloutSpecs.json new file mode 100644 index 000000000..f015cf5d3 --- /dev/null +++ b/deployment/agent-deployment/ServiceGroupRoot/RolloutSpecs/RolloutSpecs.json @@ -0,0 +1,36 @@ +{ + "$schema": "https://ev2schema.azure.net/schemas/2020-01-01/rolloutSpecification.json", + "ContentVersion": "1.0.0.0", + "RolloutMetadata": { + "ServiceModelPath": "ServiceModels//Public.ServiceModel.json", + "ScopeBindingsPath": "ScopeBindings//Public.ScopeBindings.json", + "Name": "ContainerInsightsAgent", + "RolloutType": "Major", + "BuildSource": { + "Parameters": { + "VersionFile": "buildver.txt" + } + }, + "Notification": { + "Email": { + "To": "omscontainers@microsoft.com" + } + } + }, + "OrchestratedSteps": [ + { + "name": "PushLinuxAgent", + "targetType": "ServiceResource", + "targetName": "PushLinuxAgent", + "actions": [ "Shell/PushAgentToACR" ], + "dependsOn": [ ] + }, + { + "name": "PushWindowsAgent", + "targetType": "ServiceResource", + "targetName": "PushWindowsAgent", + "actions": [ "Shell/PushAgentToACR" ], + "dependsOn": [ ] + } + ] + } \ No newline at end of file diff --git a/deployment/agent-deployment/ServiceGroupRoot/ScopeBindings/Public.ScopeBindings.json b/deployment/agent-deployment/ServiceGroupRoot/ScopeBindings/Public.ScopeBindings.json new file mode 100644 index 000000000..99acfb68e --- /dev/null +++ b/deployment/agent-deployment/ServiceGroupRoot/ScopeBindings/Public.ScopeBindings.json @@ -0,0 +1,48 @@ +{ + "$schema": "https://ev2schema.azure.net/schemas/2020-01-01/scopeBindings.json", + "contentVersion": "0.0.0.1", + "scopeBindings": [ + { + "scopeTagName": "Global", + "bindings": [ + { + "find": "__ACR_NAME__", + "replaceWith": "$(ACRName)" + }, + { + "find": "__AGENT_RELEASE__", + "replaceWith": "$(AgentRelease)" + }, + { + "find": "__AGENT_IMAGE_TAG_SUFFIX__", + "replaceWith": "$(AgentImageTagSuffix)" + }, + { + "find": "__RELEASE_ID__", + "replaceWith": "$(Release.ReleaseId)" + }, + { + "find": "__MANAGED_IDENTITY__", + "replaceWith": "$(ManagedIdentity)" + }, + { + "find": "__CONTAINER_URI__", + "replaceWith": "$(Storage.StorageContainerUri)" + }, + { + "find": "__CONTAINER_SAS_TOKEN__", + "replaceWith": "$(Storage.StorageContainerSasToken)" + }, + { + "find": "__STORAGE_CONTAINER_NAME__", + "replaceWith": "$(StorageContainerName)" + }, + { + "find": "__STORAGE_ACCOUNT_NAME__", + "replaceWith": "$(StorageAccountName)" + } + + ] + } + ] +} \ No newline at end of file diff --git a/deployment/agent-deployment/ServiceGroupRoot/Scripts/pushAgentToAcr.sh b/deployment/agent-deployment/ServiceGroupRoot/Scripts/pushAgentToAcr.sh new file mode 100644 index 000000000..7e73a6230 --- /dev/null +++ b/deployment/agent-deployment/ServiceGroupRoot/Scripts/pushAgentToAcr.sh @@ -0,0 +1,110 @@ +#!/bin/bash +set -e + +# Note - This script used in the pipeline as inline script + +if [ -z $AGENT_IMAGE_TAG_SUFFIX ]; then + echo "-e error value of AGENT_IMAGE_TAG_SUFFIX variable shouldnt be empty. check release variables" + exit 1 +fi + +if [ -z $AGENT_RELEASE ]; then + echo "-e error AGENT_RELEASE shouldnt be empty. 
check release variables"
+    exit 1
+fi
+
+if [ -z $AGENT_IMAGE_URI ]; then
+    echo "-e error value of AGENT_IMAGE_URI shouldn't be empty. check output from file copy release task"
+    exit 1
+fi
+
+if [ -z $AGENT_IMAGE_SAS ]; then
+    echo "-e error value of AGENT_IMAGE_SAS shouldn't be empty. check output from file copy release task"
+    exit 1
+fi
+
+if [ -z $STORAGE_CONTAINER_NAME ]; then
+    echo "-e error value of STORAGE_CONTAINER_NAME shouldn't be empty. check release variables"
+    exit 1
+fi
+
+if [ -z $STORAGE_ACCOUNT_NAME ]; then
+    echo "-e error value of STORAGE_ACCOUNT_NAME shouldn't be empty. check release variables"
+    exit 1
+fi
+
+if [ -z $ACR_NAME ]; then
+    echo "-e error value of ACR_NAME shouldn't be empty. check release variables"
+    exit 1
+fi
+
+#Download agentimage tarball from blob storage account
+echo "Downloading tarball image from $WINDOWS $AGENT_IMAGE_URI"
+wget -O $AGENT_IMAGE_TAR_FILE_NAME "${AGENT_IMAGE_URI}${WINDOWS}${RELEASE_ID}${AGENT_IMAGE_SAS}"
+
+
+if [ ! -f $AGENT_IMAGE_TAR_FILE_NAME ]; then
+    echo "Agent tarfile: ${AGENT_IMAGE_TAR_FILE_NAME} does not exist, unable to continue"
+    exit 1
+fi
+
+#Install crane
+echo "Installing crane"
+wget -O crane.tar.gz https://github.com/google/go-containerregistry/releases/download/v0.4.0/go-containerregistry_Linux_x86_64.tar.gz
+if [ $? -eq 0 ]; then
+    echo "crane downloaded successfully"
+else
+    echo "-e error crane download failed"
+    exit 1
+fi
+tar xzvf crane.tar.gz
+echo "Installed crane"
+
+
+#Login to az cli and authenticate to acr
+echo "Login cli using managed identity"
+az login --identity
+
+echo "Getting acr credentials"
+TOKEN_QUERY_RES=$(az acr login -n "$ACR_NAME" -t)
+TOKEN=$(echo "$TOKEN_QUERY_RES" | jq -r '.accessToken')
+if [ -z $TOKEN ]; then
+    echo "-e error failed to get az acr login token"
+    exit 1
+fi
+
+DESTINATION_ACR=$(echo "$TOKEN_QUERY_RES" | jq -r '.loginServer')
+if [ -z $DESTINATION_ACR ]; then
+    echo "-e error value of DESTINATION_ACR shouldn't be empty"
+    exit 1
+fi
+
+./crane auth login "$DESTINATION_ACR" -u "00000000-0000-0000-0000-000000000000" -p "$TOKEN"
+
+#Prepare tarball and push to acr
+if [[ "$AGENT_IMAGE_TAR_FILE_NAME" == *"tar.gz"* ]]; then
+    gunzip $AGENT_IMAGE_TAR_FILE_NAME
+fi
+
+if [[ "$AGENT_IMAGE_TAR_FILE_NAME" == *"tar.zip"* ]]; then
+    unzip $AGENT_IMAGE_TAR_FILE_NAME
+fi
+
+echo "Pushing agent image tarball to $AGENT_IMAGE_FULL_PATH"
+./crane push *.tar "$AGENT_IMAGE_FULL_PATH"
+
+
+#Delete agentimage tarball from blob storage to prevent future conflicts
+echo "Deleting agentimage copy from blob storage"
+
+BLOB_EXIST_RESULT=$(az storage blob exists --container-name $STORAGE_CONTAINER_NAME --name $WINDOWS$RELEASE_ID --account-name $STORAGE_ACCOUNT_NAME --sas-token $AGENT_IMAGE_SAS)
+BLOB_EXIST=$(echo "$BLOB_EXIST_RESULT" | jq -r '.exists')
+echo $BLOB_EXIST_RESULT
+echo $BLOB_EXIST
+
+if $BLOB_EXIST; then
+    az storage blob delete --container-name "${STORAGE_CONTAINER_NAME}" --name "${WINDOWS}${RELEASE_ID}" --account-name "${STORAGE_ACCOUNT_NAME}" --sas-token "${AGENT_IMAGE_SAS}"
+    echo "Deleted agentimage copy from blob storage"
+else
+    echo "Agentimage has already been deleted from blob storage"
+fi
\ No newline at end of file
diff --git a/deployment/agent-deployment/ServiceGroupRoot/ServiceModels/Public.ServiceModel.json b/deployment/agent-deployment/ServiceGroupRoot/ServiceModels/Public.ServiceModel.json
new file mode 100644
index 000000000..b7bd4aa26
--- /dev/null
+++ b/deployment/agent-deployment/ServiceGroupRoot/ServiceModels/Public.ServiceModel.json
@@ -0,0 +1,56
@@ +{ + "$schema": "https://ev2schema.azure.net/schemas/2020-01-01/serviceModel.json", + "contentVersion": "1.0.0.2", + "ServiceMetadata": { + "ServiceGroup": "ContainerInsightsAgent", + "Environment": "Dev" + }, + "ServiceResourceGroupDefinitions": [ + { + "Name": "CI-Agent-ServiceResourceGroupDefinition", + "ServiceResourceDefinitions": [ + { + "Name": "ShellExtension", + "ComposedOf": { + "Extension": { + "Shell": [ + { + "type": "ShellExtensionType", + "properties": { + "imageName": "adm-ubuntu-1804-l", + "imageVersion": "v18" + } + } + ] + } + } + } + ] + } + ], + "ServiceResourceGroups": [ + { + "AzureResourceGroupName": "ContainerInsightsAgent-Global-Release", + "Location": "eastus2", + "InstanceOf": "CI-Agent-ServiceResourceGroupDefinition", + "AzureSubscriptionId": "728bbd23-3b47-40c1-8c9a-c6c5ccd674fc", + "ScopeTags": [ + { + "Name": "Global" + } + ], + "ServiceResources": [ + { + "Name": "PushLinuxAgent", + "InstanceOf": "ShellExtension", + "RolloutParametersPath": "Parameters\\ContainerInsights.Linux.Parameters.json" + }, + { + "Name": "PushWindowsAgent", + "InstanceOf": "ShellExtension", + "RolloutParametersPath": "Parameters\\ContainerInsights.Windows.Parameters.json" + } + ] + } + ] + } \ No newline at end of file diff --git a/deployment/agent-deployment/ServiceGroupRoot/buildver.txt b/deployment/agent-deployment/ServiceGroupRoot/buildver.txt new file mode 100644 index 000000000..bd2666abb --- /dev/null +++ b/deployment/agent-deployment/ServiceGroupRoot/buildver.txt @@ -0,0 +1 @@ +1.0.0.0 \ No newline at end of file From ef7cb89f7e20e7c89e5f154abe1671c0517cd2ab Mon Sep 17 00:00:00 2001 From: David Michelman Date: Mon, 13 Sep 2021 12:27:28 -0700 Subject: [PATCH 150/194] documenting fbit tail plugin configmap settings. (#638) * documenting fbit tail plugin configmap settings. --- kubernetes/container-azm-ms-agentconfig.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kubernetes/container-azm-ms-agentconfig.yaml b/kubernetes/container-azm-ms-agentconfig.yaml index 21b31f76f..328acb201 100644 --- a/kubernetes/container-azm-ms-agentconfig.yaml +++ b/kubernetes/container-azm-ms-agentconfig.yaml @@ -144,6 +144,14 @@ data: tcp_listener_chunk_size = 10 tcp_listener_buffer_size = 10 tcp_listener_mem_buf_limit = 200 + + # The following settings are "undocumented", we don't recommend uncommenting them unless directed by Microsoft. + # They increase the maximum stdout/stderr log collection rate but will also cause higher cpu/memory usage. 
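+    # (For reference, these appear to map onto fluent-bit's [SERVICE] Flush interval and
+    # the tail input's Mem_Buf_Limit / Buffer_Chunk_Size / Buffer_Max_Size settings.)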
+    # [agent_settings.fbit_config]
+    # log_flush_interval_secs = "1"                 # default value is 15
+    # tail_mem_buf_limit_megabytes = "10"           # default value is 10
+    # tail_buf_chunksize_megabytes = "1"            # default value is 32kb (comment out this line for default)
+    # tail_buf_maxsize_megabytes = "1"              # default value is 32kb (comment out this line for default)
 metadata:
   name: container-azm-ms-agentconfig

From 6b42f139165f3fc055d966446d2d1b8ba239356d Mon Sep 17 00:00:00 2001
From: sarahpeiffer <46665092+sarahpeiffer@users.noreply.github.com>
Date: Mon, 13 Sep 2021 13:04:41 -0700
Subject: [PATCH 151/194] Install unzip package on shell extension (#642)

---
 .../Parameters/ContainerInsights.Linux.Parameters.json | 4 ----
 .../ServiceGroupRoot/Scripts/pushAgentToAcr.sh          | 8 +++++++-
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/deployment/agent-deployment/ServiceGroupRoot/Parameters/ContainerInsights.Linux.Parameters.json b/deployment/agent-deployment/ServiceGroupRoot/Parameters/ContainerInsights.Linux.Parameters.json
index 598ce9698..be9ddb6d6 100644
--- a/deployment/agent-deployment/ServiceGroupRoot/Parameters/ContainerInsights.Linux.Parameters.json
+++ b/deployment/agent-deployment/ServiceGroupRoot/Parameters/ContainerInsights.Linux.Parameters.json
@@ -27,10 +27,6 @@
             "pushAgentToAcr.sh"
           ],
           "environmentVariables": [
-            {
-              "name": "WINDOWS",
-              "value": ""
-            },
             {
               "name": "AGENT_IMAGE_URI",
               "value": "__CONTAINER_URI__"
diff --git a/deployment/agent-deployment/ServiceGroupRoot/Scripts/pushAgentToAcr.sh b/deployment/agent-deployment/ServiceGroupRoot/Scripts/pushAgentToAcr.sh
index 7e73a6230..3d4062c91 100644
--- a/deployment/agent-deployment/ServiceGroupRoot/Scripts/pushAgentToAcr.sh
+++ b/deployment/agent-deployment/ServiceGroupRoot/Scripts/pushAgentToAcr.sh
@@ -87,7 +87,13 @@ if [[ "$AGENT_IMAGE_TAR_FILE_NAME" == *"tar.gz"* ]]; then
 fi
 
 if [[ "$AGENT_IMAGE_TAR_FILE_NAME" == *"tar.zip"* ]]; then
-    unzip $AGENT_IMAGE_TAR_FILE_NAME
+    sudo apt-get install unzip
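+    # (unzip isn't guaranteed on the shell-extension host image; the service model pins an
+    # Ubuntu 18.04 image, so apt-get is assumed to be available here.)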
+    if [ $? -eq 0 ]; then
+        unzip $AGENT_IMAGE_TAR_FILE_NAME
+    else
+        echo "-e error failed to install unzip package and cannot unzip windows agent tarball"
+        exit 1
+    fi
 fi

From 7ef07e12d032058a4577e6b67b9ad00618859feb Mon Sep 17 00:00:00 2001
From: sarahpeiffer <46665092+sarahpeiffer@users.noreply.github.com>
Date: Mon, 13 Sep 2021 17:26:22 -0700
Subject: [PATCH 152/194] Changing installation in ev2 script (#644)

---
 .../agent-deployment/ServiceGroupRoot/Scripts/pushAgentToAcr.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deployment/agent-deployment/ServiceGroupRoot/Scripts/pushAgentToAcr.sh b/deployment/agent-deployment/ServiceGroupRoot/Scripts/pushAgentToAcr.sh
index 3d4062c91..bafd62b05 100644
--- a/deployment/agent-deployment/ServiceGroupRoot/Scripts/pushAgentToAcr.sh
+++ b/deployment/agent-deployment/ServiceGroupRoot/Scripts/pushAgentToAcr.sh
@@ -87,7 +87,7 @@ if [[ "$AGENT_IMAGE_TAR_FILE_NAME" == *"tar.gz"* ]]; then
 fi
 
 if [[ "$AGENT_IMAGE_TAR_FILE_NAME" == *"tar.zip"* ]]; then
-    sudo apt-get install unzip
+    apt-get -y install unzip
     if [ $?
export_to_artifact_path: 'agentimage.tar.zip' # path for exported image and use this instead of fixed tag diff --git a/deployment/agent-deployment/ServiceGroupRoot/Parameters/ContainerInsights.Linux.Parameters.json b/deployment/agent-deployment/ServiceGroupRoot/Parameters/ContainerInsights.Linux.Parameters.json index be9ddb6d6..b9ca8c407 100644 --- a/deployment/agent-deployment/ServiceGroupRoot/Parameters/ContainerInsights.Linux.Parameters.json +++ b/deployment/agent-deployment/ServiceGroupRoot/Parameters/ContainerInsights.Linux.Parameters.json @@ -27,30 +27,6 @@ "pushAgentToAcr.sh" ], "environmentVariables": [ - { - "name": "AGENT_IMAGE_URI", - "value": "__CONTAINER_URI__" - }, - { - "name": "AGENT_IMAGE_SAS", - "value": "__CONTAINER_SAS_TOKEN__" - }, - { - "name": "STORAGE_CONTAINER_NAME", - "value": "__STORAGE_CONTAINER_NAME__" - }, - { - "name": "STORAGE_ACCOUNT_NAME", - "value": "__STORAGE_ACCOUNT_NAME__" - }, - { - "name": "AGENT_IMAGE_TAR_FILE_NAME", - "value": "agentimage.tar.gz" - }, - { - "name": "RELEASE_ID", - "value": "__RELEASE_ID__" - }, { "name": "ACR_NAME", "value": "__ACR_NAME__" @@ -66,6 +42,22 @@ { "name": "AGENT_IMAGE_FULL_PATH", "value": "__ACR_NAME__/public/azuremonitor/containerinsights/__AGENT_RELEASE__:__AGENT_RELEASE____AGENT_IMAGE_TAG_SUFFIX__" + }, + { + "name": "CDPX_ACR", + "value": "__CDPX_LINUX_ACR__" + }, + { + "name": "CDPX_REGISTRY", + "value": "__CDPX_LINUX_REGISTRY__" + }, + { + "name": "CDPX_REPO_NAME", + "value": "__CDPX_LINUX_REPO_NAME__" + }, + { + "name": "CDPX_TAG", + "value": "__CDPX_LINUX_TAG__" } ], "identity": { diff --git a/deployment/agent-deployment/ServiceGroupRoot/Parameters/ContainerInsights.Windows.Parameters.json b/deployment/agent-deployment/ServiceGroupRoot/Parameters/ContainerInsights.Windows.Parameters.json index 9d208e0c6..f7f12218f 100644 --- a/deployment/agent-deployment/ServiceGroupRoot/Parameters/ContainerInsights.Windows.Parameters.json +++ b/deployment/agent-deployment/ServiceGroupRoot/Parameters/ContainerInsights.Windows.Parameters.json @@ -27,34 +27,6 @@ "pushAgentToAcr.sh" ], "environmentVariables": [ - { - "name": "WINDOWS", - "value": "win-" - }, - { - "name": "AGENT_IMAGE_URI", - "value": "__CONTAINER_URI__" - }, - { - "name": "AGENT_IMAGE_SAS", - "value": "__CONTAINER_SAS_TOKEN__" - }, - { - "name": "STORAGE_CONTAINER_NAME", - "value": "__STORAGE_CONTAINER_NAME__" - }, - { - "name": "STORAGE_ACCOUNT_NAME", - "value": "__STORAGE_ACCOUNT_NAME__" - }, - { - "name": "AGENT_IMAGE_TAR_FILE_NAME", - "value": "agentimage.tar.zip" - }, - { - "name": "RELEASE_ID", - "value": "__RELEASE_ID__" - }, { "name": "ACR_NAME", "value": "__ACR_NAME__" @@ -70,6 +42,22 @@ { "name": "AGENT_IMAGE_FULL_PATH", "value": "__ACR_NAME__/public/azuremonitor/containerinsights/__AGENT_RELEASE__:win-__AGENT_RELEASE____AGENT_IMAGE_TAG_SUFFIX__" + }, + { + "name": "CDPX_ACR", + "value": "__CDPX_WINDOWS_ACR__" + }, + { + "name": "CDPX_REGISTRY", + "value": "__CDPX_WINDOWS_REGISTRY__" + }, + { + "name": "CDPX_REPO_NAME", + "value": "__CDPX_WINDOWS_REPO_NAME__" + }, + { + "name": "CDPX_TAG", + "value": "__CDPX_WINDOWS_TAG__" } ], "identity": { diff --git a/deployment/agent-deployment/ServiceGroupRoot/ScopeBindings/Public.ScopeBindings.json b/deployment/agent-deployment/ServiceGroupRoot/ScopeBindings/Public.ScopeBindings.json index 99acfb68e..82a1fae73 100644 --- a/deployment/agent-deployment/ServiceGroupRoot/ScopeBindings/Public.ScopeBindings.json +++ b/deployment/agent-deployment/ServiceGroupRoot/ScopeBindings/Public.ScopeBindings.json @@ -17,31 +17,42 
@@ "find": "__AGENT_IMAGE_TAG_SUFFIX__", "replaceWith": "$(AgentImageTagSuffix)" }, - { - "find": "__RELEASE_ID__", - "replaceWith": "$(Release.ReleaseId)" - }, { "find": "__MANAGED_IDENTITY__", "replaceWith": "$(ManagedIdentity)" }, { - "find": "__CONTAINER_URI__", - "replaceWith": "$(Storage.StorageContainerUri)" - }, + "find": "__CDPX_LINUX_ACR__", + "replaceWith": "$(CDPXLinuxACR)" + }, { - "find": "__CONTAINER_SAS_TOKEN__", - "replaceWith": "$(Storage.StorageContainerSasToken)" - }, + "find": "__CDPX_WINDOWS_ACR__", + "replaceWith": "$(CDPXWindowsACR)" + }, + { + "find": "__CDPX_LINUX_REGISTRY__", + "replaceWith": "$(CDPXLinuxRegistry)" + }, + { + "find": "__CDPX_WINDOWS_REGISTRY__", + "replaceWith": "$(CDPXWindowsRegistry)" + }, + { + "find": "__CDPX_LINUX_TAG__", + "replaceWith": "$(CDPXLinuxTag)" + }, + { + "find": "__CDPX_WINDOWS_TAG__", + "replaceWith": "$(CDPXWindowsTag)" + }, { - "find": "__STORAGE_CONTAINER_NAME__", - "replaceWith": "$(StorageContainerName)" + "find": "__CDPX_LINUX_REPO_NAME__", + "replaceWith": "$(CDPXLinuxRepoName)" }, { - "find": "__STORAGE_ACCOUNT_NAME__", - "replaceWith": "$(StorageAccountName)" + "find": "__CDPX_WINDOWS_REPO_NAME__", + "replaceWith": "$(CDPXWindowsRepoName)" } - ] } ] diff --git a/deployment/agent-deployment/ServiceGroupRoot/Scripts/pushAgentToAcr.sh b/deployment/agent-deployment/ServiceGroupRoot/Scripts/pushAgentToAcr.sh index bafd62b05..f319c3bbe 100644 --- a/deployment/agent-deployment/ServiceGroupRoot/Scripts/pushAgentToAcr.sh +++ b/deployment/agent-deployment/ServiceGroupRoot/Scripts/pushAgentToAcr.sh @@ -12,24 +12,29 @@ if [ -z $AGENT_RELEASE ]; then echo "-e error AGENT_RELEASE shouldnt be empty. check release variables" exit 1 fi +#! +if [ -z $AGENT_IMAGE_FULL_PATH ]; then + echo "-e error AGENT_IMAGE_FULL_PATH shouldnt be empty. check release variables" + exit 1 +fi -if [ -z $AGENT_IMAGE_URI ]; then - echo "-e error value of AGENT_IMAGE_URI shouldn't be empty. check output from file copy release task" +if [ -z $CDPX_ACR ]; then + echo "-e error value of CDPX_ACR shouldn't be empty. check release variables" exit 1 fi -if [ -z $AGENT_IMAGE_SAS ]; then - echo "-e error value of AGENT_IMAGE_SAS shouldn't be empty. check output from file copy release task" +if [ -z $CDPX_TAG ]; then + echo "-e error value of CDPX_TAG shouldn't be empty. check release variables" exit 1 fi -if [ -z $STORAGE_CONTAINER_NAME ]; then - echo "-e error value of STORAGE_CONTAINER_NAME shouldn't be empty. check release variables" +if [ -z $CDPX_REGISTRY ]; then + echo "-e error value of CDPX_REGISTRY shouldn't be empty. check release variables" exit 1 fi -if [ -z $STORAGE_ACCOUNT_NAME ]; then - echo "-e error value of STORAGE_ACCOUNT_NAME shouldn't be empty. check release variables" +if [ -z $CDPX_REPO_NAME ]; then + echo "-e error value of CDPX_REPO_NAME shouldn't be empty. check release variables" exit 1 fi @@ -38,79 +43,22 @@ if [ -z $ACR_NAME ]; then exit 1 fi -#Download agentimage tarball from blob storage account -echo "Downloading tarball image from $WINDOWS $AGENT_IMAGE_URI" -wget -O $AGENT_IMAGE_TAR_FILE_NAME "${AGENT_IMAGE_URI}${WINDOWS}${RELEASE_ID}${AGENT_IMAGE_SAS}" - - -if [ ! -f $AGENT_IMAGE_TAR_FILE_NAME ]; then - echo "Agent tarfile: ${AGENT_IMAGE_TAR_FILE_NAME} does not exist, unable to continue" - exit 1 -fi - -#Install crane -echo "Installing crane" -wget -O crane.tar.gz https://github.com/google/go-containerregistry/releases/download/v0.4.0/go-containerregistry_Linux_x86_64.tar.gz -if [ $? 
-eq 0 ]; then
-    echo "crane downloaded successfully"
-else
-    echo "-e error crane download failed"
-    exit 1
-fi
-tar xzvf crane.tar.gz
-echo "Installed crane"
-
-
 #Login to az cli and authenticate to acr
 echo "Login cli using managed identity"
 az login --identity
-
-echo "Getting acr credentials"
-TOKEN_QUERY_RES=$(az acr login -n "$ACR_NAME" -t)
-TOKEN=$(echo "$TOKEN_QUERY_RES" | jq -r '.accessToken')
-if [ -z $TOKEN ]; then
-    echo "-e error failed to get az acr login token"
+if [ $? -eq 0 ]; then
+    echo "Logged in successfully"
+else
+    echo "-e error failed to login to az with managed identity credentials"
     exit 1
-fi
-
-DESTINATION_ACR=$(echo "$TOKEN_QUERY_RES" | jq -r '.loginServer')
-if [ -z $DESTINATION_ACR ]; then
-    echo "-e error value of DESTINATION_ACR shouldn't be empty"
+fi
+
+echo "Pushing ${AGENT_IMAGE_FULL_PATH} to ${ACR_NAME}"
+az acr import --name $ACR_NAME --registry $CDPX_REGISTRY --source ${CDPX_ACR}/official/${CDPX_REPO_NAME}:${CDPX_TAG} --image $AGENT_IMAGE_FULL_PATH
+if [ $? -eq 0 ]; then
+    echo "Retagged and pushed image successfully"
+else
+    echo "-e error failed to retag and push image to destination ACR"
     exit 1
-fi
-
-./crane auth login "$DESTINATION_ACR" -u "00000000-0000-0000-0000-000000000000" -p "$TOKEN"
-
-#Prepare tarball and push to acr
-if [[ "$AGENT_IMAGE_TAR_FILE_NAME" == *"tar.gz"* ]]; then
-    gunzip $AGENT_IMAGE_TAR_FILE_NAME
-fi
-
-if [[ "$AGENT_IMAGE_TAR_FILE_NAME" == *"tar.zip"* ]]; then
-    apt-get -y install unzip
-    if [ $? -eq 0 ]; then
-        unzip $AGENT_IMAGE_TAR_FILE_NAME
-    else
-        echo "-e error failed to install unzip package and cannot unzip windows agent tarball"
-        exit 1
-    fi
-fi
-
-echo "Pushing agent image tarball to $AGENT_IMAGE_FULL_PATH"
-./crane push *.tar "$AGENT_IMAGE_FULL_PATH"
-
-
-#Delete agentimage tarball from blob storage to prevent future conflicts
-echo "Deleting agentimage copy from blob storage"
-
-BLOB_EXIST_RESULT=$(az storage blob exists --container-name $STORAGE_CONTAINER_NAME --name $WINDOWS$RELEASE_ID --account-name $STORAGE_ACCOUNT_NAME --sas-token $AGENT_IMAGE_SAS)
-BLOB_EXIST=$(echo "$BLOB_EXIST_RESULT" | jq -r '.exists')
-echo $BLOB_EXIST_RESULT
-echo $BLOB_EXIST
-
-if $BLOB_EXIST; then
-    az storage blob delete --container-name "${STORAGE_CONTAINER_NAME}" --name "${WINDOWS}${RELEASE_ID}" --account-name "${STORAGE_ACCOUNT_NAME}" --sas-token "${AGENT_IMAGE_SAS}"
-    echo "Deleted agentimage copy from blob storage"
-else
-    echo "Agentimage has already been deleted from blob storage"
-fi
\ No newline at end of file
+fi
\ No newline at end of file

From 5e379473ce4db7b81c17966624c2dd90f8119868 Mon Sep 17 00:00:00 2001
From: sarahpeiffer <46665092+sarahpeiffer@users.noreply.github.com>
Date: Wed, 22 Sep 2021 16:14:55 -0700
Subject: [PATCH 154/194] Sarah/ev2 prod (#649)

* Ev2 changes for prod
---
 ....linux.official.all_tag.all_phase.all_config.ci_prod.yml | 3 +++
 ...indows.official.all_tag.all_phase.all_config.ci_prod.yml | 3 +++
 .../ServiceGroupRoot/ServiceModels/Public.ServiceModel.json | 6 +++---
 3 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/.pipelines/pipeline.user.linux.official.all_tag.all_phase.all_config.ci_prod.yml b/.pipelines/pipeline.user.linux.official.all_tag.all_phase.all_config.ci_prod.yml
index 1e9909ee8..4f73d7c71 100644
--- a/.pipelines/pipeline.user.linux.official.all_tag.all_phase.all_config.ci_prod.yml
+++ b/.pipelines/pipeline.user.linux.official.all_tag.all_phase.all_config.ci_prod.yml
@@ -42,4 +42,7 @@ package:
       repository_name: 'cdpxlinux' # only supported ones are cdpx acr repos
       tag: 'ciprod' # OPTIONAL: Defaults to
latest. The tag for the built image. Final tag will be 1.0.0alpha, 1.0.0-timestamp-commitID. latest: false # OPTIONAL: Defaults to false. If tag is not set to latest and this flag is set, then tag as latest as well and push latest as well. + publish_unique_tag: true # If set, the image in the registry is tagged with the unique tag generated by CDPx + metadata_file: + artifact_path: 'linux-image-meta.json' # If defined, the drop outputs relative path to the file into which JSON metadata about the created image is emitted. export_to_artifact_path: 'agentimage.tar.gz' # path for exported image and use this instead of fixed tag diff --git a/.pipelines/pipeline.user.windows.official.all_tag.all_phase.all_config.ci_prod.yml b/.pipelines/pipeline.user.windows.official.all_tag.all_phase.all_config.ci_prod.yml index 0dc0a47c5..1caf60b7b 100644 --- a/.pipelines/pipeline.user.windows.official.all_tag.all_phase.all_config.ci_prod.yml +++ b/.pipelines/pipeline.user.windows.official.all_tag.all_phase.all_config.ci_prod.yml @@ -53,4 +53,7 @@ package: repository_name: 'cdpxwin1809' # only supported ones are cdpx acr repos tag: 'win-ciprod' # OPTIONAL: Defaults to latest. The tag for the built image. Final tag will be 1.0.0alpha, 1.0.0-timestamp-commitID. latest: false # OPTIONAL: Defaults to false. If tag is not set to latest and this flag is set, then tag as latest as well and push latest as well. + publish_unique_tag: true # If set, the image in the registry is tagged with the unique tag generated by CDPx + metadata_file: + artifact_path: 'windows-image-meta.json' # If defined, the drop outputs relative path to the file into which JSON metadata about the created image is emitted. export_to_artifact_path: 'agentimage.tar.zip' # path for exported image and use this instead of fixed tag diff --git a/deployment/agent-deployment/ServiceGroupRoot/ServiceModels/Public.ServiceModel.json b/deployment/agent-deployment/ServiceGroupRoot/ServiceModels/Public.ServiceModel.json index b7bd4aa26..8c5c7c1b6 100644 --- a/deployment/agent-deployment/ServiceGroupRoot/ServiceModels/Public.ServiceModel.json +++ b/deployment/agent-deployment/ServiceGroupRoot/ServiceModels/Public.ServiceModel.json @@ -3,7 +3,7 @@ "contentVersion": "1.0.0.2", "ServiceMetadata": { "ServiceGroup": "ContainerInsightsAgent", - "Environment": "Dev" + "Environment": "Prod" }, "ServiceResourceGroupDefinitions": [ { @@ -30,10 +30,10 @@ ], "ServiceResourceGroups": [ { - "AzureResourceGroupName": "ContainerInsightsAgent-Global-Release", + "AzureResourceGroupName": "ContainerInsights-Agent-Release", "Location": "eastus2", "InstanceOf": "CI-Agent-ServiceResourceGroupDefinition", - "AzureSubscriptionId": "728bbd23-3b47-40c1-8c9a-c6c5ccd674fc", + "AzureSubscriptionId": "30c56c3a-54da-46ea-b004-06eb33432687", "ScopeTags": [ { "Name": "Global" From 5e379473ce4db7b81c17966624c2dd90f8119868 Mon Sep 17 00:00:00 2001 From: sarahpeiffer <46665092+sarahpeiffer@users.noreply.github.com> Date: Thu, 23 Sep 2021 13:25:05 -0700 Subject: [PATCH 155/194] CDPX repo naming change (#652) --- .../Parameters/ContainerInsights.Linux.Parameters.json | 4 ---- .../Parameters/ContainerInsights.Windows.Parameters.json | 4 ---- .../ScopeBindings/Public.ScopeBindings.json | 8 -------- .../ServiceGroupRoot/Scripts/pushAgentToAcr.sh | 7 +------ 4 files changed, 1 insertion(+), 22 deletions(-) diff --git a/deployment/agent-deployment/ServiceGroupRoot/Parameters/ContainerInsights.Linux.Parameters.json 
b/deployment/agent-deployment/ServiceGroupRoot/Parameters/ContainerInsights.Linux.Parameters.json index b9ca8c407..6104609a6 100644 --- a/deployment/agent-deployment/ServiceGroupRoot/Parameters/ContainerInsights.Linux.Parameters.json +++ b/deployment/agent-deployment/ServiceGroupRoot/Parameters/ContainerInsights.Linux.Parameters.json @@ -43,10 +43,6 @@ "name": "AGENT_IMAGE_FULL_PATH", "value": "__ACR_NAME__/public/azuremonitor/containerinsights/__AGENT_RELEASE__:__AGENT_RELEASE____AGENT_IMAGE_TAG_SUFFIX__" }, - { - "name": "CDPX_ACR", - "value": "__CDPX_LINUX_ACR__" - }, { "name": "CDPX_REGISTRY", "value": "__CDPX_LINUX_REGISTRY__" diff --git a/deployment/agent-deployment/ServiceGroupRoot/Parameters/ContainerInsights.Windows.Parameters.json b/deployment/agent-deployment/ServiceGroupRoot/Parameters/ContainerInsights.Windows.Parameters.json index f7f12218f..de0bbfe1c 100644 --- a/deployment/agent-deployment/ServiceGroupRoot/Parameters/ContainerInsights.Windows.Parameters.json +++ b/deployment/agent-deployment/ServiceGroupRoot/Parameters/ContainerInsights.Windows.Parameters.json @@ -43,10 +43,6 @@ "name": "AGENT_IMAGE_FULL_PATH", "value": "__ACR_NAME__/public/azuremonitor/containerinsights/__AGENT_RELEASE__:win-__AGENT_RELEASE____AGENT_IMAGE_TAG_SUFFIX__" }, - { - "name": "CDPX_ACR", - "value": "__CDPX_WINDOWS_ACR__" - }, { "name": "CDPX_REGISTRY", "value": "__CDPX_WINDOWS_REGISTRY__" diff --git a/deployment/agent-deployment/ServiceGroupRoot/ScopeBindings/Public.ScopeBindings.json b/deployment/agent-deployment/ServiceGroupRoot/ScopeBindings/Public.ScopeBindings.json index 82a1fae73..cbc6db8b3 100644 --- a/deployment/agent-deployment/ServiceGroupRoot/ScopeBindings/Public.ScopeBindings.json +++ b/deployment/agent-deployment/ServiceGroupRoot/ScopeBindings/Public.ScopeBindings.json @@ -21,14 +21,6 @@ "find": "__MANAGED_IDENTITY__", "replaceWith": "$(ManagedIdentity)" }, - { - "find": "__CDPX_LINUX_ACR__", - "replaceWith": "$(CDPXLinuxACR)" - }, - { - "find": "__CDPX_WINDOWS_ACR__", - "replaceWith": "$(CDPXWindowsACR)" - }, { "find": "__CDPX_LINUX_REGISTRY__", "replaceWith": "$(CDPXLinuxRegistry)" diff --git a/deployment/agent-deployment/ServiceGroupRoot/Scripts/pushAgentToAcr.sh b/deployment/agent-deployment/ServiceGroupRoot/Scripts/pushAgentToAcr.sh index f319c3bbe..7d1b6c27e 100644 --- a/deployment/agent-deployment/ServiceGroupRoot/Scripts/pushAgentToAcr.sh +++ b/deployment/agent-deployment/ServiceGroupRoot/Scripts/pushAgentToAcr.sh @@ -18,11 +18,6 @@ if [ -z $AGENT_IMAGE_FULL_PATH ]; then exit 1 fi -if [ -z $CDPX_ACR ]; then - echo "-e error value of CDPX_ACR shouldn't be empty. check release variables" - exit 1 -fi - if [ -z $CDPX_TAG ]; then echo "-e error value of CDPX_TAG shouldn't be empty. check release variables" exit 1 @@ -55,7 +50,7 @@ else fi echo "Pushing ${AGENT_IMAGE_FULL_PATH} to ${ACR_NAME}" -az acr import --name $ACR_NAME --registry $CDPX_REGISTRY --source ${CDPX_ACR}/official/${CDPX_REPO_NAME}:${CDPX_TAG} --image $AGENT_IMAGE_FULL_PATH +az acr import --name $ACR_NAME --registry $CDPX_REGISTRY --source official/${CDPX_REPO_NAME}:${CDPX_TAG} --image $AGENT_IMAGE_FULL_PATH if [ $? 
-eq 0 ]; then echo "Retagged and pushed image successfully" else From a36d8dfe004f009b0645f54781a2539104ff8768 Mon Sep 17 00:00:00 2001 From: sarahpeiffer <46665092+sarahpeiffer@users.noreply.github.com> Date: Mon, 27 Sep 2021 15:19:29 -0700 Subject: [PATCH 156/194] Sarah/ev2 update (#654) * remove acr name from repo path * add check to make sure tag does not exist in mcr repo --- .../ContainerInsights.Linux.Parameters.json | 2 +- .../ContainerInsights.Windows.Parameters.json | 2 +- .../ServiceGroupRoot/Scripts/pushAgentToAcr.sh | 15 ++++++++++++++- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/deployment/agent-deployment/ServiceGroupRoot/Parameters/ContainerInsights.Linux.Parameters.json b/deployment/agent-deployment/ServiceGroupRoot/Parameters/ContainerInsights.Linux.Parameters.json index 6104609a6..70d0950a2 100644 --- a/deployment/agent-deployment/ServiceGroupRoot/Parameters/ContainerInsights.Linux.Parameters.json +++ b/deployment/agent-deployment/ServiceGroupRoot/Parameters/ContainerInsights.Linux.Parameters.json @@ -41,7 +41,7 @@ }, { "name": "AGENT_IMAGE_FULL_PATH", - "value": "__ACR_NAME__/public/azuremonitor/containerinsights/__AGENT_RELEASE__:__AGENT_RELEASE____AGENT_IMAGE_TAG_SUFFIX__" + "value": "public/azuremonitor/containerinsights/__AGENT_RELEASE__:__AGENT_RELEASE____AGENT_IMAGE_TAG_SUFFIX__" }, { "name": "CDPX_REGISTRY", diff --git a/deployment/agent-deployment/ServiceGroupRoot/Parameters/ContainerInsights.Windows.Parameters.json b/deployment/agent-deployment/ServiceGroupRoot/Parameters/ContainerInsights.Windows.Parameters.json index de0bbfe1c..b6a31ed10 100644 --- a/deployment/agent-deployment/ServiceGroupRoot/Parameters/ContainerInsights.Windows.Parameters.json +++ b/deployment/agent-deployment/ServiceGroupRoot/Parameters/ContainerInsights.Windows.Parameters.json @@ -41,7 +41,7 @@ }, { "name": "AGENT_IMAGE_FULL_PATH", - "value": "__ACR_NAME__/public/azuremonitor/containerinsights/__AGENT_RELEASE__:win-__AGENT_RELEASE____AGENT_IMAGE_TAG_SUFFIX__" + "value": "public/azuremonitor/containerinsights/__AGENT_RELEASE__:win-__AGENT_RELEASE____AGENT_IMAGE_TAG_SUFFIX__" }, { "name": "CDPX_REGISTRY", diff --git a/deployment/agent-deployment/ServiceGroupRoot/Scripts/pushAgentToAcr.sh b/deployment/agent-deployment/ServiceGroupRoot/Scripts/pushAgentToAcr.sh index 7d1b6c27e..c3f092d90 100644 --- a/deployment/agent-deployment/ServiceGroupRoot/Scripts/pushAgentToAcr.sh +++ b/deployment/agent-deployment/ServiceGroupRoot/Scripts/pushAgentToAcr.sh @@ -8,11 +8,24 @@ if [ -z $AGENT_IMAGE_TAG_SUFFIX ]; then exit 1 fi +#Make sure that tag being pushed will not overwrite an existing tag in mcr +MCR_TAG_RESULT="`wget -qO- https://mcr.microsoft.com/v2/azuremonitor/containerinsights/ciprod/tags/list`" +if [ $? -ne 0 ]; then + echo "-e error unable to get list of mcr tags for azuremonitor/containerinsights/ciprod repository" + exit 1 +fi +TAG_EXISTS=$(echo $MCR_TAG_RESULT | jq '.tags | contains(["'"$AGENT_IMAGE_TAG_SUFFIX"'"])') + +if $TAG_EXISTS; then + echo "-e error ${AGENT_IMAGE_TAG_SUFFIX} already exists in mcr. make sure the image tag is unique" + exit 1 +fi + if [ -z $AGENT_RELEASE ]; then echo "-e error AGENT_RELEASE shouldnt be empty. check release variables" exit 1 fi -#! + if [ -z $AGENT_IMAGE_FULL_PATH ]; then echo "-e error AGENT_IMAGE_FULL_PATH shouldnt be empty. 
check release variables" exit 1 From fdc99f6e56aa6316fd38a9d569f69a32995c8210 Mon Sep 17 00:00:00 2001 From: sarahpeiffer <46665092+sarahpeiffer@users.noreply.github.com> Date: Mon, 27 Sep 2021 17:10:08 -0700 Subject: [PATCH 157/194] change tag syntax for mcr repo check (#655) --- .../ServiceGroupRoot/Scripts/pushAgentToAcr.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/deployment/agent-deployment/ServiceGroupRoot/Scripts/pushAgentToAcr.sh b/deployment/agent-deployment/ServiceGroupRoot/Scripts/pushAgentToAcr.sh index c3f092d90..d39cedde0 100644 --- a/deployment/agent-deployment/ServiceGroupRoot/Scripts/pushAgentToAcr.sh +++ b/deployment/agent-deployment/ServiceGroupRoot/Scripts/pushAgentToAcr.sh @@ -8,24 +8,24 @@ if [ -z $AGENT_IMAGE_TAG_SUFFIX ]; then exit 1 fi +if [ -z $AGENT_RELEASE ]; then + echo "-e error AGENT_RELEASE shouldnt be empty. check release variables" + exit 1 +fi + #Make sure that tag being pushed will not overwrite an existing tag in mcr MCR_TAG_RESULT="`wget -qO- https://mcr.microsoft.com/v2/azuremonitor/containerinsights/ciprod/tags/list`" if [ $? -ne 0 ]; then echo "-e error unable to get list of mcr tags for azuremonitor/containerinsights/ciprod repository" exit 1 fi -TAG_EXISTS=$(echo $MCR_TAG_RESULT | jq '.tags | contains(["'"$AGENT_IMAGE_TAG_SUFFIX"'"])') +TAG_EXISTS=$(echo $MCR_TAG_RESULT | jq '.tags | contains(["'"$AGENT_RELEASE$AGENT_IMAGE_TAG_SUFFIX"'"])') if $TAG_EXISTS; then echo "-e error ${AGENT_IMAGE_TAG_SUFFIX} already exists in mcr. make sure the image tag is unique" exit 1 fi -if [ -z $AGENT_RELEASE ]; then - echo "-e error AGENT_RELEASE shouldnt be empty. check release variables" - exit 1 -fi - if [ -z $AGENT_IMAGE_FULL_PATH ]; then echo "-e error AGENT_IMAGE_FULL_PATH shouldnt be empty. check release variables" exit 1 From 6292218d9bbf5885ae854a0c02be30f5587980ac Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 27 Sep 2021 18:09:44 -0700 Subject: [PATCH 158/194] Gangams/optimize win livenessprobe (#653) * livenessprobe optimization * optimize windows agent liveness probe * optimize windows agent liveness probe * optimize windows agent liveness probe * optimize windows agent liveness probe * optimize windows agent liveness probe * optimize windows agent liveness probe * optimize windows agent liveness probe * optimize windows agent liveness probe --- build/windows/Makefile.ps1 | 19 ++- .../installer/livenessprobe/livenessprobe.cpp | 137 ++++++++++++++++++ .../installer/scripts/livenessprobe.cmd | 36 ----- kubernetes/omsagent.yaml | 6 +- kubernetes/windows/Dockerfile | 2 +- 5 files changed, 159 insertions(+), 41 deletions(-) create mode 100644 build/windows/installer/livenessprobe/livenessprobe.cpp delete mode 100644 build/windows/installer/scripts/livenessprobe.cmd diff --git a/build/windows/Makefile.ps1 b/build/windows/Makefile.ps1 index b9bd1f3e4..9f3c438b0 100644 --- a/build/windows/Makefile.ps1 +++ b/build/windows/Makefile.ps1 @@ -3,6 +3,7 @@ # 1. Builds the certificate generator code in .NET and copy the binaries in zip file to ..\..\kubernetes\windows\omsagentwindows # 2. Builds the out_oms plugin code in go lang into the shared object(.so) file and copy the out_oms.so file to ..\..\kubernetes\windows\omsagentwindows # 3. copy the files under installer directory to ..\..\kubernetes\windows\omsagentwindows +# 4. 
Builds the livenessprobe cpp code and copies the executable to the directory ..\..\kubernetes\windows\omsagentwindows $dotnetcoreframework = "netcoreapp3.1" @@ -157,7 +158,7 @@ if ($isCDPxEnvironment) { Write-Host("getting latest go modules ...") go get - Write-Host("successfyullt got latest go modules") -ForegroundColor Green + Write-Host("successfully got latest go modules") -ForegroundColor Green go build -ldflags "-X 'main.revision=$buildVersionString' -X 'main.builddate=$buildVersionDate'" -buildmode=c-shared -o out_oms.so . } @@ -167,16 +168,28 @@ Write-Host("copying out_oms.so file to : $publishdir") Copy-Item -Path (Join-path -Path $outomsgoplugindir -ChildPath "out_oms.so") -Destination $publishdir -Force Write-Host("successfully copied out_oms.so file to : $publishdir") -ForegroundColor Green +# compile and build the liveness probe cpp code +Write-Host("Start:build livenessprobe cpp code") +$livenessprobesrcpath = Join-Path -Path $builddir -ChildPath "windows\installer\livenessprobe\livenessprobe.cpp" +$livenessprobeexepath = Join-Path -Path $builddir -ChildPath "windows\installer\livenessprobe\livenessprobe.exe" +g++ $livenessprobesrcpath -o $livenessprobeexepath -municode +Write-Host("End:build livenessprobe cpp code") +if (Test-Path -Path $livenessprobeexepath){ + Write-Host("livenessprobe.exe exists, which indicates the cpp build step succeeded") -ForegroundColor Green +} else { + Write-Host("livenessprobe.exe doesn't exist, which indicates the cpp build step failed") -ForegroundColor Red + exit 1 +} $installerdir = Join-Path -Path $builddir -ChildPath "common\installer" Write-Host("copying common installer files conf and scripts from :" + $installerdir + " to :" + $publishdir + " ...") -$exclude = @('*.cs','*.csproj') +$exclude = @('*.cs','*.csproj', '*.cpp') Copy-Item -Path $installerdir -Destination $publishdir -Recurse -Force -Exclude $exclude Write-Host("successfully copied installer files conf and scripts from :" + $installerdir + " to :" + $publishdir + " ") -ForegroundColor Green $installerdir = Join-Path -Path $builddir -ChildPath "windows\installer" Write-Host("copying installer files conf and scripts from :" + $installerdir + " to :" + $publishdir + " ...") -$exclude = @('*.cs','*.csproj') +$exclude = @('*.cs','*.csproj', '*.cpp') Copy-Item -Path $installerdir -Destination $publishdir -Recurse -Force -Exclude $exclude Write-Host("successfully copied installer files conf and scripts from :" + $installerdir + " to :" + $publishdir + " ") -ForegroundColor Green diff --git a/build/windows/installer/livenessprobe/livenessprobe.cpp b/build/windows/installer/livenessprobe/livenessprobe.cpp new file mode 100644 index 000000000..eea792686 --- /dev/null +++ b/build/windows/installer/livenessprobe/livenessprobe.cpp @@ -0,0 +1,137 @@ +#ifndef UNICODE +#define UNICODE +#endif + +#ifndef _UNICODE +#define _UNICODE +#endif + +#include <windows.h> +#include <tlhelp32.h> +#include <stdio.h> + +#define SUCCESS 0x00000000 +#define NO_FLUENT_BIT_PROCESS 0x00000001 +#define FILESYSTEM_WATCHER_FILE_EXISTS 0x00000002 +#define CERTIFICATE_RENEWAL_REQUIRED 0x00000003 +#define FLUENTDWINAKS_SERVICE_NOT_RUNNING 0x00000004 +#define UNEXPECTED_ERROR 0xFFFFFFFF + +/* + check whether a process with the given exe file name is running +*/ +bool IsProcessRunning(const wchar_t *const executableName) +{ + PROCESSENTRY32 entry; + entry.dwSize = sizeof(PROCESSENTRY32); + + const auto snapshot = CreateToolhelp32Snapshot(TH32CS_SNAPPROCESS, NULL); + + if (!Process32First(snapshot, &entry)) + { + CloseHandle(snapshot); + wprintf_s(L"ERROR:IsProcessRunning::Process32First failed"); + return false; + } + + do + { + if (!_wcsicmp(entry.szExeFile, executableName)) + { + CloseHandle(snapshot); + return true; + } + } while (Process32Next(snapshot, &entry)); + + CloseHandle(snapshot); + return false; +} + +/* + check if the file exists +*/ +bool IsFileExists(const wchar_t *const fileName) +{ + DWORD dwAttrib = GetFileAttributes(fileName); // GetFileAttributes returns INVALID_FILE_ATTRIBUTES when the file cannot be found + return dwAttrib != INVALID_FILE_ATTRIBUTES; +} + +/* + Get the status of the service with the given service name +*/ +int GetServiceStatus(const wchar_t *const serviceName) +{ + SC_HANDLE theService, scm; + SERVICE_STATUS_PROCESS ssStatus; + DWORD dwBytesNeeded; + + scm = OpenSCManager(nullptr, nullptr, SC_MANAGER_ENUMERATE_SERVICE); + if (!scm) + { + wprintf_s(L"ERROR:GetServiceStatus::OpenSCManager failed"); + return UNEXPECTED_ERROR; + } + + theService = OpenService(scm, serviceName, SERVICE_QUERY_STATUS); + if (!theService) + { + CloseServiceHandle(scm); + wprintf_s(L"ERROR:GetServiceStatus::OpenService failed"); + return UNEXPECTED_ERROR; + } + + auto result = QueryServiceStatusEx(theService, SC_STATUS_PROCESS_INFO, + reinterpret_cast<LPBYTE>(&ssStatus), sizeof(SERVICE_STATUS_PROCESS), + &dwBytesNeeded); + + CloseServiceHandle(theService); + CloseServiceHandle(scm); + + if (result == 0) + { + wprintf_s(L"ERROR:GetServiceStatus:QueryServiceStatusEx failed"); + return UNEXPECTED_ERROR; + } + + return ssStatus.dwCurrentState; +} + +/* + Entry point. Expects four arguments: the fluent-bit process name, the fluentd service name, the filesystem watcher marker file and the certificate renewal marker file. +*/ +int _tmain(int argc, wchar_t *argv[]) +{ + if (argc < 5) + { + wprintf_s(L"ERROR:unexpected number of arguments; expected 5"); + return UNEXPECTED_ERROR; + } + + if (!IsProcessRunning(argv[1])) + { + wprintf_s(L"ERROR:Process:%s is not running\n", argv[1]); + return NO_FLUENT_BIT_PROCESS; + } + + DWORD dwStatus = GetServiceStatus(argv[2]); + + if (dwStatus != SERVICE_RUNNING) + { + wprintf_s(L"ERROR:Service:%s is not running\n", argv[2]); + return FLUENTDWINAKS_SERVICE_NOT_RUNNING; + } + + if (IsFileExists(argv[3])) + { + wprintf_s(L"INFO:File:%s exists, which indicates the Config Map was updated since the agent started.\n", argv[3]); + return FILESYSTEM_WATCHER_FILE_EXISTS; + } + + if (IsFileExists(argv[4])) + { + wprintf_s(L"INFO:File:%s exists, which indicates the certificate needs to be renewed.\n", argv[4]); + return CERTIFICATE_RENEWAL_REQUIRED; + } + + return SUCCESS; +} diff --git a/build/windows/installer/scripts/livenessprobe.cmd b/build/windows/installer/scripts/livenessprobe.cmd deleted file mode 100644 index 19d0b69d7..000000000 --- a/build/windows/installer/scripts/livenessprobe.cmd +++ /dev/null @@ -1,36 +0,0 @@ -REM "Checking if fluent-bit is running" - -tasklist /fi "imagename eq fluent-bit.exe" /fo "table" | findstr fluent-bit - -IF ERRORLEVEL 1 ( - echo "Fluent-Bit is not running" - exit /b 1 -) - -REM "Checking if config map has been updated since agent start" - -IF EXIST C:\etc\omsagentwindows\filesystemwatcher.txt ( - echo "Config Map Updated since agent started" - exit /b 1 -) - -REM "Checking if certificate needs to be renewed (aka agent restart required)" - -IF EXIST C:\etc\omsagentwindows\renewcertificate.txt ( - echo "Certificate needs to be renewed" - exit /b 1 -) - -REM "Checking if fluentd service is running" -sc query fluentdwinaks | findstr /i STATE | findstr RUNNING - -IF ERRORLEVEL 1 ( - echo "Fluentd Service is NOT Running" - exit /b 1 -) - -exit /b 0 - - - - diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index d84e46701..98621b5f0 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -833,7 +833,11 @@ spec:
command: - cmd - /c - - C:\opt\omsagentwindows\scripts\cmd\livenessProbe.cmd + - C:\opt\omsagentwindows\scripts\cmd\livenessprobe.exe + - fluent-bit.exe + - fluentdwinaks + - "C:\\etc\\omsagentwindows\\filesystemwatcher.txt" + - "C:\\etc\\omsagentwindows\\renewcertificate.txt" periodSeconds: 60 initialDelaySeconds: 180 timeoutSeconds: 15 diff --git a/kubernetes/windows/Dockerfile b/kubernetes/windows/Dockerfile index 290deef40..aa756b8b8 100644 --- a/kubernetes/windows/Dockerfile +++ b/kubernetes/windows/Dockerfile @@ -46,7 +46,7 @@ RUN ./setup.ps1 COPY main.ps1 /opt/omsagentwindows/scripts/powershell COPY ./omsagentwindows/installer/scripts/filesystemwatcher.ps1 /opt/omsagentwindows/scripts/powershell -COPY ./omsagentwindows/installer/scripts/livenessprobe.cmd /opt/omsagentwindows/scripts/cmd/ +COPY ./omsagentwindows/installer/livenessprobe/livenessprobe.exe /opt/omsagentwindows/scripts/cmd/ COPY setdefaulttelegrafenvvariables.ps1 /opt/omsagentwindows/scripts/powershell # copy ruby scripts to /opt folder From cfacf39842a7f621621196f7049293fb1d493846 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 28 Sep 2021 19:45:48 -0700 Subject: [PATCH 159/194] Gangams/addon token adapter image tag to telemetry (#656) * addon token adapter image tag * addon token adapter image tag --- source/plugins/ruby/ApplicationInsightsUtility.rb | 3 +++ source/plugins/ruby/in_containerinventory.rb | 13 +++++++++++++ 2 files changed, 16 insertions(+) diff --git a/source/plugins/ruby/ApplicationInsightsUtility.rb b/source/plugins/ruby/ApplicationInsightsUtility.rb index eaa1d903d..7691304a6 100644 --- a/source/plugins/ruby/ApplicationInsightsUtility.rb +++ b/source/plugins/ruby/ApplicationInsightsUtility.rb @@ -243,6 +243,9 @@ def sendTelemetry(pluginName, properties) getContainerRuntimeInfo() end @@CustomProperties["Computer"] = properties["Computer"] + if !properties["addonTokenAdapterImageTag"].nil? && !properties["addonTokenAdapterImageTag"].empty? + @@CustomProperties["addonTokenAdapterImageTag"] = properties["addonTokenAdapterImageTag"] + end sendHeartBeatEvent(pluginName) sendLastProcessedContainerInventoryCountMetric(pluginName, properties) rescue => errorStr diff --git a/source/plugins/ruby/in_containerinventory.rb b/source/plugins/ruby/in_containerinventory.rb index 9fcb7ab90..f52ed4026 100644 --- a/source/plugins/ruby/in_containerinventory.rb +++ b/source/plugins/ruby/in_containerinventory.rb @@ -57,6 +57,7 @@ def enumerate containerInventory = Array.new eventStream = Fluent::MultiEventStream.new hostName = "" + addonTokenAdapterImageTag = "" $log.info("in_container_inventory::enumerate : Begin processing @ #{Time.now.utc.iso8601}") if ExtensionUtils.isAADMSIAuthMode() $log.info("in_container_inventory::enumerate: AAD AUTH MSI MODE") @@ -82,6 +83,15 @@ def enumerate if hostName.empty? && !containerRecord["Computer"].empty? hostName = containerRecord["Computer"] end + if addonTokenAdapterImageTag.empty? && ExtensionUtils.isAADMSIAuthMode() + if !containerRecord["ElementName"].nil? && !containerRecord["ElementName"].empty? && + containerRecord["ElementName"].include?("kube-system") && + containerRecord["ElementName"].include?("addon-token-adapter_omsagent") + if !containerRecord["ImageTag"].nil? && !containerRecord["ImageTag"].empty? 
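+ # this record belongs to the addon-token-adapter container, which runs alongside omsagent in kube-system when MSI auth is enabled; remember its image tag so it can be sent once with the heartbeat telemetry below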
+ addonTokenAdapterImageTag = containerRecord["ImageTag"] + end + end + end containerIds.push containerRecord["InstanceID"] containerInventory.push containerRecord end @@ -117,6 +127,9 @@ def enumerate telemetryProperties = {} telemetryProperties["Computer"] = hostName telemetryProperties["ContainerCount"] = containerInventory.length + if !addonTokenAdapterImageTag.empty? + telemetryProperties["addonTokenAdapterImageTag"] = addonTokenAdapterImageTag + end ApplicationInsightsUtility.sendTelemetry(@@PluginName, telemetryProperties) end rescue => errorStr From ae9ebd7647758b7d1dc6938da0a61733a4185f44 Mon Sep 17 00:00:00 2001 From: sarahpeiffer <46665092+sarahpeiffer@users.noreply.github.com> Date: Thu, 30 Sep 2021 09:46:32 -0700 Subject: [PATCH 160/194] Sarah/ev2 helm (#658) * Use MSI for Arc Release * Use CIPROD_ACR AME subscription for shell extension * remove extra line endings --- ...ContainerInsightsExtension.Parameters.json | 30 +++++-------------- .../Public.Canary.RolloutSpec.json | 2 +- .../ScopeBindings/Public.ScopeBindings.json | 30 ++++++++++++++++++- .../Scripts/pushChartToAcr.sh | 20 +++++++++++-- .../ServiceModels/Public.ServiceModel.json | 16 +++++----- 5 files changed, 64 insertions(+), 34 deletions(-) diff --git a/deployment/arc-k8s-extension/ServiceGroupRoot/Parameters/ContainerInsightsExtension.Parameters.json b/deployment/arc-k8s-extension/ServiceGroupRoot/Parameters/ContainerInsightsExtension.Parameters.json index a8a99e9f6..69e1bcf35 100644 --- a/deployment/arc-k8s-extension/ServiceGroupRoot/Parameters/ContainerInsightsExtension.Parameters.json +++ b/deployment/arc-k8s-extension/ServiceGroupRoot/Parameters/ContainerInsightsExtension.Parameters.json @@ -31,26 +31,6 @@ "name": "RELEASE_STAGE", "value": "__RELEASE_STAGE__" }, - { - "name": "ACR_APP_ID", - "reference": { - "provider": "AzureKeyVault", - "parameters": { - "secretId": "https://cibuildandreleasekv.vault.azure.net/secrets/ciprodacrappid/e8f47bf7505741ebaf65a4db16ff9fa7" - } - }, - "asSecureValue": "true" - }, - { - "name": "ACR_APP_SECRET", - "reference": { - "provider": "AzureKeyVault", - "parameters": { - "secretId": "https://cibuildandreleasekv.vault.azure.net/secrets/ciprodacrappsecret/8718afcdac114accb8b26f613cef1e1e" - } - }, - "asSecureValue": "true" - }, { "name": "ACR_NAME", "value": "__ACR_NAME__" @@ -59,8 +39,14 @@ "name": "CHART_VERSION", "value": "__CHART_VERSION__" } - ] + ], + "identity": { + "type": "userAssigned", + "userAssignedIdentities": [ + "__MANAGED_IDENTITY__" + ] + } } } ] -} +} \ No newline at end of file diff --git a/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.Canary.RolloutSpec.json b/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.Canary.RolloutSpec.json index cde103633..2d0149e24 100644 --- a/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.Canary.RolloutSpec.json +++ b/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.Canary.RolloutSpec.json @@ -26,4 +26,4 @@ "dependsOn": [ ] } ] -} +} \ No newline at end of file diff --git a/deployment/arc-k8s-extension/ServiceGroupRoot/ScopeBindings/Public.ScopeBindings.json b/deployment/arc-k8s-extension/ServiceGroupRoot/ScopeBindings/Public.ScopeBindings.json index 516eba3e2..bf61ab7fd 100644 --- a/deployment/arc-k8s-extension/ServiceGroupRoot/ScopeBindings/Public.ScopeBindings.json +++ b/deployment/arc-k8s-extension/ServiceGroupRoot/ScopeBindings/Public.ScopeBindings.json @@ -16,6 +16,10 @@ { "find": "__CHART_VERSION__", "replaceWith": "$(ChartVersion)" + }, + { + 
"find": "__MANAGED_IDENTITY__", + "replaceWith": "$(ManagedIdentity)" } ] }, @@ -33,6 +37,10 @@ { "find": "__CHART_VERSION__", "replaceWith": "$(ChartVersion)" + }, + { + "find": "__MANAGED_IDENTITY__", + "replaceWith": "$(ManagedIdentity)" } ] }, @@ -50,6 +58,10 @@ { "find": "__CHART_VERSION__", "replaceWith": "$(ChartVersion)" + }, + { + "find": "__MANAGED_IDENTITY__", + "replaceWith": "$(ManagedIdentity)" } ] }, @@ -67,6 +79,10 @@ { "find": "__CHART_VERSION__", "replaceWith": "$(ChartVersion)" + }, + { + "find": "__MANAGED_IDENTITY__", + "replaceWith": "$(ManagedIdentity)" } ] }, @@ -84,6 +100,10 @@ { "find": "__CHART_VERSION__", "replaceWith": "$(ChartVersion)" + }, + { + "find": "__MANAGED_IDENTITY__", + "replaceWith": "$(ManagedIdentity)" } ] }, @@ -101,6 +121,10 @@ { "find": "__CHART_VERSION__", "replaceWith": "$(ChartVersion)" + }, + { + "find": "__MANAGED_IDENTITY__", + "replaceWith": "$(ManagedIdentity)" } ] }, @@ -118,8 +142,12 @@ { "find": "__CHART_VERSION__", "replaceWith": "$(ChartVersion)" + }, + { + "find": "__MANAGED_IDENTITY__", + "replaceWith": "$(ManagedIdentity)" } ] } ] -} +} \ No newline at end of file diff --git a/deployment/arc-k8s-extension/ServiceGroupRoot/Scripts/pushChartToAcr.sh b/deployment/arc-k8s-extension/ServiceGroupRoot/Scripts/pushChartToAcr.sh index 520557592..99421b122 100644 --- a/deployment/arc-k8s-extension/ServiceGroupRoot/Scripts/pushChartToAcr.sh +++ b/deployment/arc-k8s-extension/ServiceGroupRoot/Scripts/pushChartToAcr.sh @@ -106,8 +106,24 @@ echo "START - Release stage : ${RELEASE_STAGE}" echo "Using acr : ${ACR_NAME}" echo "Using acr repo type: ${REPO_TYPE}" +#Login to az cli and authenticate to acr +echo "Login cli using managed identity" +az login --identity +if [ $? -eq 0 ]; then + echo "Logged in successfully" +else + echo "-e error az login with managed identity credentials failed. Please review the Ev2 pipeline logs for more details on the error." + exit 1 +fi + +ACCESS_TOKEN=$(az acr login --name ${ACR_NAME} --expose-token --output tsv --query accessToken) +if [ $? -ne 0 ]; then + echo "-e error az acr login failed. Please review the Ev2 pipeline logs for more details on the error." + exit 1 +fi + echo "login to acr:${ACR_NAME} using helm ..." -echo $ACR_APP_SECRET | helm registry login $ACR_NAME --username $ACR_APP_ID --password-stdin +echo $ACCESS_TOKEN | helm registry login $ACR_NAME -u 00000000-0000-0000-0000-000000000000 --password-stdin if [ $? -eq 0 ]; then echo "login to acr:${ACR_NAME} using helm completed successfully." 
else @@ -178,4 +194,4 @@ case $RELEASE_STAGE in ;; esac -echo "END - Release stage : ${RELEASE_STAGE}" +echo "END - Release stage : ${RELEASE_STAGE}" \ No newline at end of file diff --git a/deployment/arc-k8s-extension/ServiceGroupRoot/ServiceModels/Public.ServiceModel.json b/deployment/arc-k8s-extension/ServiceGroupRoot/ServiceModels/Public.ServiceModel.json index 71081661a..6f565d4c4 100644 --- a/deployment/arc-k8s-extension/ServiceGroupRoot/ServiceModels/Public.ServiceModel.json +++ b/deployment/arc-k8s-extension/ServiceGroupRoot/ServiceModels/Public.ServiceModel.json @@ -33,7 +33,7 @@ "AzureResourceGroupName": "ContainerInsightsExtension-Canary-Release", "Location": "eastus2", "InstanceOf": "ARC-Extension-ServiceResourceGroupDefinition", - "AzureSubscriptionId": "5fab7b6f-6150-42fe-89e1-0f07a0a9a46f", + "AzureSubscriptionId": "30c56c3a-54da-46ea-b004-06eb33432687", "ScopeTags": [ { "Name": "Canary" @@ -51,7 +51,7 @@ "AzureResourceGroupName": "ContainerInsightsExtension-Pilot-Release", "Location": "eastus2", "InstanceOf": "ARC-Extension-ServiceResourceGroupDefinition", - "AzureSubscriptionId": "5fab7b6f-6150-42fe-89e1-0f07a0a9a46f", + "AzureSubscriptionId": "30c56c3a-54da-46ea-b004-06eb33432687", "ScopeTags": [ { "Name": "Pilot" @@ -69,7 +69,7 @@ "AzureResourceGroupName": "ContainerInsightsExtension-LightLoad-Release", "Location": "eastus2", "InstanceOf": "ARC-Extension-ServiceResourceGroupDefinition", - "AzureSubscriptionId": "5fab7b6f-6150-42fe-89e1-0f07a0a9a46f", + "AzureSubscriptionId": "30c56c3a-54da-46ea-b004-06eb33432687", "ScopeTags": [ { "Name": "LightLoad" @@ -87,7 +87,7 @@ "AzureResourceGroupName": "ContainerInsightsExtension-MediumLoad-Release", "Location": "eastus2", "InstanceOf": "ARC-Extension-ServiceResourceGroupDefinition", - "AzureSubscriptionId": "5fab7b6f-6150-42fe-89e1-0f07a0a9a46f", + "AzureSubscriptionId": "30c56c3a-54da-46ea-b004-06eb33432687", "ScopeTags": [ { "Name": "MediumLoad" @@ -105,7 +105,7 @@ "AzureResourceGroupName": "ContainerInsightsExtension-HighLoad-Release", "Location": "eastus2", "InstanceOf": "ARC-Extension-ServiceResourceGroupDefinition", - "AzureSubscriptionId": "5fab7b6f-6150-42fe-89e1-0f07a0a9a46f", + "AzureSubscriptionId": "30c56c3a-54da-46ea-b004-06eb33432687", "ScopeTags": [ { "Name": "HighLoad" @@ -123,7 +123,7 @@ "AzureResourceGroupName": "ContainerInsightsExtension-FF-Release", "Location": "eastus2", "InstanceOf": "ARC-Extension-ServiceResourceGroupDefinition", - "AzureSubscriptionId": "5fab7b6f-6150-42fe-89e1-0f07a0a9a46f", + "AzureSubscriptionId": "30c56c3a-54da-46ea-b004-06eb33432687", "ScopeTags": [ { "Name": "FF" @@ -141,7 +141,7 @@ "AzureResourceGroupName": "ContainerInsightsExtension-MC-Release", "Location": "eastus2", "InstanceOf": "ARC-Extension-ServiceResourceGroupDefinition", - "AzureSubscriptionId": "5fab7b6f-6150-42fe-89e1-0f07a0a9a46f", + "AzureSubscriptionId": "30c56c3a-54da-46ea-b004-06eb33432687", "ScopeTags": [ { "Name": "MC" @@ -156,4 +156,4 @@ ] } ] - } + } \ No newline at end of file From a6c6c4a9bf5e0c676f164761da836941a3f41995 Mon Sep 17 00:00:00 2001 From: sarahpeiffer <46665092+sarahpeiffer@users.noreply.github.com> Date: Mon, 4 Oct 2021 15:37:17 -0700 Subject: [PATCH 161/194] Sarah/ev2 pipeline (#661) * testing build artifact dir changes * add .pipelines directory and omsagent.yaml to build artifacts --- ...fficial.all_tag.all_phase.all_config.ci_prod.yml | 13 +++++++++++++ .pipelines/pipeline.user.linux.yml | 8 ++++++++ 2 files changed, 21 insertions(+) diff --git 
a/.pipelines/pipeline.user.linux.official.all_tag.all_phase.all_config.ci_prod.yml b/.pipelines/pipeline.user.linux.official.all_tag.all_phase.all_config.ci_prod.yml index 4f73d7c71..9aed01213 100644 --- a/.pipelines/pipeline.user.linux.official.all_tag.all_phase.all_config.ci_prod.yml +++ b/.pipelines/pipeline.user.linux.official.all_tag.all_phase.all_config.ci_prod.yml @@ -28,6 +28,19 @@ build: name: 'Build Docker Provider Shell Bundle' command: '.pipelines/build-linux.sh' fail_on_stderr: false + artifacts: + - from: 'deployment' + to: 'build' + include: + - '**' + - from: '.pipelines' + to: 'build' + include: + - '*.sh' + - from: 'kubernetes' + to: 'build' + include: + - '*.yaml' package: commands: diff --git a/.pipelines/pipeline.user.linux.yml b/.pipelines/pipeline.user.linux.yml index a1175263e..7acd7da74 100644 --- a/.pipelines/pipeline.user.linux.yml +++ b/.pipelines/pipeline.user.linux.yml @@ -33,6 +33,14 @@ build: to: 'build' include: - '**' + - from: '.pipelines' + to: 'build' + include: + - '*.sh' + - from: 'kubernetes' + to: 'build' + include: + - '*.yaml' package: commands: From 9e2df4d3d1193374b49d69e332fa2e7df7b55c2d Mon Sep 17 00:00:00 2001 From: sarahpeiffer <46665092+sarahpeiffer@users.noreply.github.com> Date: Mon, 4 Oct 2021 16:13:06 -0700 Subject: [PATCH 162/194] add charts directory to build artifacts (#662) --- ...er.linux.official.all_tag.all_phase.all_config.ci_prod.yml | 4 ++++ .pipelines/pipeline.user.linux.yml | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/.pipelines/pipeline.user.linux.official.all_tag.all_phase.all_config.ci_prod.yml b/.pipelines/pipeline.user.linux.official.all_tag.all_phase.all_config.ci_prod.yml index 9aed01213..a199bd860 100644 --- a/.pipelines/pipeline.user.linux.official.all_tag.all_phase.all_config.ci_prod.yml +++ b/.pipelines/pipeline.user.linux.official.all_tag.all_phase.all_config.ci_prod.yml @@ -41,6 +41,10 @@ build: to: 'build' include: - '*.yaml' + - from: 'charts' + to: 'build' + include: + - '**' package: commands: diff --git a/.pipelines/pipeline.user.linux.yml b/.pipelines/pipeline.user.linux.yml index 7acd7da74..60c1f7640 100644 --- a/.pipelines/pipeline.user.linux.yml +++ b/.pipelines/pipeline.user.linux.yml @@ -41,6 +41,10 @@ build: to: 'build' include: - '*.yaml' + - from: 'charts' + to: 'build' + include: + - '**' package: commands: From f1d0e4334c5b035cc15b9a9d93e905febf3dde2c Mon Sep 17 00:00:00 2001 From: sarahpeiffer <46665092+sarahpeiffer@users.noreply.github.com> Date: Wed, 6 Oct 2021 16:05:00 -0700 Subject: [PATCH 163/194] Sarah/remove cdpx creds (#664) * don't use cdpx acr creds from kv * add e2etest.yaml to build output * keep cdpx creds for now --- ...er.linux.official.all_tag.all_phase.all_config.ci_prod.yml | 4 ++++ .pipelines/pipeline.user.linux.yml | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/.pipelines/pipeline.user.linux.official.all_tag.all_phase.all_config.ci_prod.yml b/.pipelines/pipeline.user.linux.official.all_tag.all_phase.all_config.ci_prod.yml index a199bd860..61785f38d 100644 --- a/.pipelines/pipeline.user.linux.official.all_tag.all_phase.all_config.ci_prod.yml +++ b/.pipelines/pipeline.user.linux.official.all_tag.all_phase.all_config.ci_prod.yml @@ -45,6 +45,10 @@ build: to: 'build' include: - '**' + - from: 'test/e2e' + to: 'build' + include: + - '*.yaml' package: commands: diff --git a/.pipelines/pipeline.user.linux.yml b/.pipelines/pipeline.user.linux.yml index 60c1f7640..4c39fad5a 100644 --- a/.pipelines/pipeline.user.linux.yml +++ b/.pipelines/pipeline.user.linux.yml @@ 
-45,6 +45,10 @@ build: to: 'build' include: - '**' + - from: 'test/e2e' + to: 'build' + include: + - '*.yaml' package: commands: From 6ff747cb2d2dc47933823ee2948118fbf9d6df41 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Thu, 7 Oct 2021 13:09:30 -0700 Subject: [PATCH 164/194] chart updates for rbac api version change (#660) * chart updates for rbac api version change * include windows ds for arc --- .../templates/omsagent-crd.yaml | 2 +- .../templates/omsagent-daemonset-windows.yaml | 4 ++-- .../templates/omsagent-rbac.yaml | 10 +++++++++- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/charts/azuremonitor-containers/templates/omsagent-crd.yaml b/charts/azuremonitor-containers/templates/omsagent-crd.yaml index bbaf89a52..46c5341cc 100644 --- a/charts/azuremonitor-containers/templates/omsagent-crd.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-crd.yaml @@ -1,4 +1,4 @@ -{{- if semverCompare "<1.19-0" .Capabilities.KubeVersion.GitVersion }} +{{- if semverCompare "<1.19-0" .Capabilities.KubeVersion.Version }} apiVersion: apiextensions.k8s.io/v1beta1 kind: CustomResourceDefinition metadata: diff --git a/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml b/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml index 580ef9d15..efed76f7d 100644 --- a/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml @@ -1,4 +1,4 @@ -{{- if and (ne .Values.omsagent.secret.key "") (ne .Values.omsagent.secret.wsid "") (or (ne .Values.omsagent.env.clusterName "") (ne .Values.omsagent.env.clusterId ""))}} +{{- if and (ne .Values.omsagent.secret.key "") (ne .Values.omsagent.secret.wsid "") (or (ne .Values.omsagent.env.clusterName "") (ne .Values.omsagent.env.clusterId "") (ne .Values.Azure.Cluster.ResourceId "") )}} apiVersion: apps/v1 kind: DaemonSet metadata: @@ -32,7 +32,7 @@ spec: options: - name: ndots value: "3" -{{- if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion }} +{{- if semverCompare ">=1.14-0" .Capabilities.KubeVersion.Version }} nodeSelector: kubernetes.io/os: windows {{- else }} diff --git a/charts/azuremonitor-containers/templates/omsagent-rbac.yaml b/charts/azuremonitor-containers/templates/omsagent-rbac.yaml index c0a6e3722..d9bca069d 100644 --- a/charts/azuremonitor-containers/templates/omsagent-rbac.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-rbac.yaml @@ -10,7 +10,11 @@ metadata: heritage: {{ .Release.Service }} --- kind: ClusterRole +{{- if .Capabilities.APIVersions.Has "rbac.authorization.k8s.io/v1" }} +apiVersion: rbac.authorization.k8s.io/v1 +{{- else }} apiVersion: rbac.authorization.k8s.io/v1beta1 +{{- end }} metadata: name: omsagent-reader labels: @@ -33,7 +37,7 @@ rules: verbs: ["get", "create", "patch"] - nonResourceURLs: ["/metrics"] verbs: ["get"] -#arc k8s extension model grants access as part of the extension msi +#arc k8s extension model grants access as part of the extension msi #remove this explicit permission once the extension available in public preview {{- if (empty .Values.Azure.Extension.Name) }} - apiGroups: [""] @@ -43,7 +47,11 @@ rules: {{- end }} --- kind: ClusterRoleBinding +{{- if .Capabilities.APIVersions.Has "rbac.authorization.k8s.io/v1" }} +apiVersion: rbac.authorization.k8s.io/v1 +{{- else }} apiVersion: rbac.authorization.k8s.io/v1beta1 +{{- end }} metadata: name: omsagentclusterrolebinding labels: From f77587a3f3470de8593fa89809ffcf3ae10afb83 Mon Sep 17 
00:00:00 2001 From: David Michelman Date: Fri, 8 Oct 2021 10:44:32 -0700 Subject: [PATCH 165/194] proxy support (for non-aks) (#665) * changes related to aad msi auth feature * use existing envvars * fix imds token expiry interval * initial proxy support * merge? * cleaning up some files which should've merged differently * proxy should be working, but most tables don't have any data. About to merge, maybe whatever was wrong is now fixed * linux AMA proxy works * about to merge * proxy support appears to be working, final mdsd build location will still change * removing some unnecessary changes * forgot to remove one last change * redirected mdsd stderr to stdout instead of stdin * addressing proxy password location comment Co-authored-by: Ganga Mahesh Siddem --- kubernetes/linux/main.sh | 17 ++++++++++++++++- kubernetes/linux/setup.sh | 4 ++-- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index 4986e3113..a9184ab53 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -195,6 +195,21 @@ if [ -e "/etc/omsagent-secret/WSID" ]; then else echo "successfully validated provided proxy endpoint is valid and expected format" fi + + echo $pwd > /opt/microsoft/docker-cimprov/proxy_password + + export MDSD_PROXY_MODE=application + echo "export MDSD_PROXY_MODE=$MDSD_PROXY_MODE" >> ~/.bashrc + export MDSD_PROXY_ADDRESS=$proto$hostport + echo "export MDSD_PROXY_ADDRESS=$MDSD_PROXY_ADDRESS" >> ~/.bashrc + export MDSD_PROXY_USERNAME=$user + echo "export MDSD_PROXY_USERNAME=$MDSD_PROXY_USERNAME" >> ~/.bashrc + export MDSD_PROXY_PASSWORD_FILE=/opt/microsoft/docker-cimprov/proxy_password + echo "export MDSD_PROXY_PASSWORD_FILE=$MDSD_PROXY_PASSWORD_FILE" >> ~/.bashrc + + #TODO: Compression + proxy creates a deserialization error in ODS. This needs a fix in MDSD + export MDSD_ODS_COMPRESSION_LEVEL=0 + echo "export MDSD_ODS_COMPRESSION_LEVEL=$MDSD_ODS_COMPRESSION_LEVEL" >> ~/.bashrc fi if [ ! -z "$PROXY_ENDPOINT" ]; then @@ -563,7 +578,7 @@ if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then else echo "starting mdsd mode in main container..." 
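# mdsd picks up the MDSD_PROXY_MODE/ADDRESS/USERNAME/PASSWORD_FILE and MDSD_ODS_COMPRESSION_LEVEL settings exported to the environment earlier in this script; only stderr is discarded below, so the err/warn/info/qos files passed via -e/-w/-o/-q remain the primary diagnostics.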
# add -T 0xFFFF for full traces - mdsd ${MDSD_AAD_MSI_AUTH_ARGS} -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos & + mdsd ${MDSD_AAD_MSI_AUTH_ARGS} -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos 2>> /dev/null & fi # Set up a cron job for logrotation diff --git a/kubernetes/linux/setup.sh b/kubernetes/linux/setup.sh index c14007d35..371d26fa5 100644 --- a/kubernetes/linux/setup.sh +++ b/kubernetes/linux/setup.sh @@ -9,8 +9,8 @@ sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen && \ dpkg-reconfigure --frontend=noninteractive locales && \ update-locale LANG=en_US.UTF-8 -#install oneagent - Official bits (08/04/2021) -wget https://github.com/microsoft/Docker-Provider/releases/download/06242021-oneagent/azure-mdsd_1.10.3-build.master.257_x86_64.deb +#install oneagent - Official bits (10/7/2021) +wget https://github.com/microsoft/Docker-Provider/releases/download/1.14/azure-mdsd_1.14.0-build.master.279_x86_64.deb /usr/bin/dpkg -i $TMPDIR/azure-mdsd*.deb cp -f $TMPDIR/mdsd.xml /etc/mdsd.d From 34f5c52e5968e44793d31969ddd7c820b6722553 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Fri, 8 Oct 2021 14:23:25 -0700 Subject: [PATCH 166/194] Gangams/agent release ciprod10082021 & win-ciprod10082021 (#666) * updates for the release ciprod10082021 and win-ciprod10082021 * updates for the release ciprod10082021 and win-ciprod10082021 * updates for the release ciprod10082021 and win-ciprod10082021 * updates for the release ciprod10082021 and win-ciprod10082021 --- ReleaseNotes.md | 31 ++++++++++++++++++++ kubernetes/linux/Dockerfile | 2 +- kubernetes/omsagent.yaml | 16 ++++++---- kubernetes/windows/Dockerfile | 2 +- source/plugins/ruby/in_containerinventory.rb | 12 ++++---- 5 files changed, 50 insertions(+), 13 deletions(-) diff --git a/ReleaseNotes.md b/ReleaseNotes.md index dc42e7d51..0fd0f7948 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -11,6 +11,37 @@ additional questions or comments. 
Note : The agent version(s) below has dates (ciprod), which indicate the agent build dates (not release dates) +### 10/08/2021 - +##### Version microsoft/oms:ciprod10082021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10082021 (linux) +##### Version microsoft/oms:win-ciprod10082021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod10082021 (windows) +##### Code change log +- Linux Agent + - MDSD Proxy support for non-AKS + - log rotation for mdsd log files {err,warn, info & qos} + - Onboarding status + - AAD Auth MSI changes (not usable externally yet) + - Upgrade k8s and adx go packages to fix vulnerabilities + - Fix missing telegraf metrics (TelegrafMetricsSentCount & TelegrafMetricsSendErrorCount) in mdsd route + - Improve fluentd liveness probe checks to handle both supervisor and worker process + - Fix telegraf startup issue when endpoint is unreachable +- Windows Agent + - Windows liveness probe optimization +- Common + - Add new metrics to MDM for allocatable % calculation of cpu and memory usage +- Other changes + - Helm chart updates for removal of rbac api version and deprecation of.Capabilities.KubeVersion.GitVersion to .Capabilities.KubeVersion.Version + - Updates to build and release ev2 + - Scripts to collect troubleshooting logs + - Unit test tooling + - Yaml updates in parity with aks rp yaml + - upgrade golang version for windows in pipelines + - Conformance test updates + +### 09/02/2021 - +##### Version microsoft/oms:ciprod08052021-1 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod08052021-1 (linux) +##### Code change log +- Bumping image tag for some tooling (no code changes except the IMAGE_TAG environment variable) + ### 08/05/2021 - ##### Version microsoft/oms:ciprod08052021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod08052021 (linux) ##### Code change log diff --git a/kubernetes/linux/Dockerfile b/kubernetes/linux/Dockerfile index 07af7f4a7..fd408b9b2 100644 --- a/kubernetes/linux/Dockerfile +++ b/kubernetes/linux/Dockerfile @@ -2,7 +2,7 @@ FROM ubuntu:18.04 MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="Azure Monitor for containers" -ARG IMAGE_TAG=ciprod08052021 +ARG IMAGE_TAG=ciprod10082021 ENV AGENT_VERSION ${IMAGE_TAG} ENV tmpdir /opt ENV APPLICATIONINSIGHTS_AUTH NzAwZGM5OGYtYTdhZC00NThkLWI5NWMtMjA3ZjM3NmM3YmRi diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 98621b5f0..97e32c0e1 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -368,16 +368,22 @@ spec: value: "3" containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod08052021" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10082021" imagePullPolicy: IfNotPresent resources: limits: cpu: 500m - memory: 600Mi + memory: 750Mi requests: cpu: 75m - memory: 225Mi + memory: 325Mi env: + - name: FBIT_SERVICE_FLUSH_INTERVAL + value: "15" + - name: FBIT_TAIL_BUFFER_CHUNK_SIZE + value: "1" + - name: FBIT_TAIL_BUFFER_MAX_SIZE + value: "1" # azure devops pipeline uses AKS_RESOURCE_ID and AKS_REGION hence ensure to uncomment these - name: AKS_RESOURCE_ID value: "VALUE_AKS_RESOURCE_ID_VALUE" @@ -597,7 +603,7 @@ spec: serviceAccountName: omsagent containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod08052021" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10082021" imagePullPolicy: IfNotPresent 
resources: limits: @@ -770,7 +776,7 @@ spec: value: "3" containers: - name: omsagent-win - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod06112021-2" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod10082021" imagePullPolicy: IfNotPresent resources: limits: diff --git a/kubernetes/windows/Dockerfile b/kubernetes/windows/Dockerfile index aa756b8b8..76667f389 100644 --- a/kubernetes/windows/Dockerfile +++ b/kubernetes/windows/Dockerfile @@ -3,7 +3,7 @@ MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="Azure Monitor for containers" -ARG IMAGE_TAG=win-ciprod06112021-2 +ARG IMAGE_TAG=win-ciprod10082021 # Do not split this into multiple RUN! # Docker creates a layer for every RUN-Statement diff --git a/source/plugins/ruby/in_containerinventory.rb b/source/plugins/ruby/in_containerinventory.rb index f52ed4026..c8ffe7d05 100644 --- a/source/plugins/ruby/in_containerinventory.rb +++ b/source/plugins/ruby/in_containerinventory.rb @@ -19,6 +19,7 @@ def initialize require_relative "CAdvisorMetricsAPIClient" require_relative "kubernetes_container_inventory" require_relative "extension_utils" + @addonTokenAdapterImageTag = "" end config_param :run_interval, :time, :default => 60 @@ -57,7 +58,6 @@ def enumerate containerInventory = Array.new eventStream = Fluent::MultiEventStream.new hostName = "" - addonTokenAdapterImageTag = "" $log.info("in_container_inventory::enumerate : Begin processing @ #{Time.now.utc.iso8601}") if ExtensionUtils.isAADMSIAuthMode() $log.info("in_container_inventory::enumerate: AAD AUTH MSI MODE") @@ -83,12 +83,12 @@ def enumerate if hostName.empty? && !containerRecord["Computer"].empty? hostName = containerRecord["Computer"] end - if addonTokenAdapterImageTag.empty? && ExtensionUtils.isAADMSIAuthMode() + if @addonTokenAdapterImageTag.empty? && ExtensionUtils.isAADMSIAuthMode() if !containerRecord["ElementName"].nil? && !containerRecord["ElementName"].empty? && - containerRecord["ElementName"].include?("kube-system") && + containerRecord["ElementName"].include?("_kube-system_") && containerRecord["ElementName"].include?("addon-token-adapter_omsagent") if !containerRecord["ImageTag"].nil? && !containerRecord["ImageTag"].empty? - addonTokenAdapterImageTag = containerRecord["ImageTag"] + @addonTokenAdapterImageTag = containerRecord["ImageTag"] end end end @@ -127,8 +127,8 @@ def enumerate telemetryProperties = {} telemetryProperties["Computer"] = hostName telemetryProperties["ContainerCount"] = containerInventory.length - if !addonTokenAdapterImageTag.empty? - telemetryProperties["addonTokenAdapterImageTag"] = addonTokenAdapterImageTag + if !@addonTokenAdapterImageTag.empty? 
+ telemetryProperties["addonTokenAdapterImageTag"] = @addonTokenAdapterImageTag end ApplicationInsightsUtility.sendTelemetry(@@PluginName, telemetryProperties) end From c4d22548d7280591db3f45241a09ff8727aa7297 Mon Sep 17 00:00:00 2001 From: sarahpeiffer <46665092+sarahpeiffer@users.noreply.github.com> Date: Fri, 8 Oct 2021 15:03:49 -0700 Subject: [PATCH 167/194] use buildcommand for prod pipeline (#668) --- ...user.linux.official.all_tag.all_phase.all_config.ci_prod.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pipelines/pipeline.user.linux.official.all_tag.all_phase.all_config.ci_prod.yml b/.pipelines/pipeline.user.linux.official.all_tag.all_phase.all_config.ci_prod.yml index 61785f38d..97390298c 100644 --- a/.pipelines/pipeline.user.linux.official.all_tag.all_phase.all_config.ci_prod.yml +++ b/.pipelines/pipeline.user.linux.official.all_tag.all_phase.all_config.ci_prod.yml @@ -24,7 +24,7 @@ restore: build: commands: - - !!defaultcommand + - !!buildcommand name: 'Build Docker Provider Shell Bundle' command: '.pipelines/build-linux.sh' fail_on_stderr: false From 3b008e5e0ce0c62c0a1d015bb029019d47cc2da5 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 11 Oct 2021 10:29:16 -0700 Subject: [PATCH 168/194] fixed merge issues. (#671) (#672) * fix merge conflicts * update with newimage tag --- ReleaseNotes.md | 4 ++-- kubernetes/linux/Dockerfile | 2 +- kubernetes/omsagent.yaml | 6 +++--- kubernetes/windows/Dockerfile | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/ReleaseNotes.md b/ReleaseNotes.md index 0fd0f7948..98b1ef3ce 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -12,8 +12,8 @@ additional questions or comments. Note : The agent version(s) below has dates (ciprod), which indicate the agent build dates (not release dates) ### 10/08/2021 - -##### Version microsoft/oms:ciprod10082021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10082021 (linux) -##### Version microsoft/oms:win-ciprod10082021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod10082021 (windows) +##### Version microsoft/oms:ciprod10092021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10092021 (linux) +##### Version microsoft/oms:win-ciprod10092021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod10092021 (windows) ##### Code change log - Linux Agent - MDSD Proxy support for non-AKS diff --git a/kubernetes/linux/Dockerfile b/kubernetes/linux/Dockerfile index fd408b9b2..c3f952d4e 100644 --- a/kubernetes/linux/Dockerfile +++ b/kubernetes/linux/Dockerfile @@ -2,7 +2,7 @@ FROM ubuntu:18.04 MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="Azure Monitor for containers" -ARG IMAGE_TAG=ciprod10082021 +ARG IMAGE_TAG=ciprod10092021 ENV AGENT_VERSION ${IMAGE_TAG} ENV tmpdir /opt ENV APPLICATIONINSIGHTS_AUTH NzAwZGM5OGYtYTdhZC00NThkLWI5NWMtMjA3ZjM3NmM3YmRi diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 97e32c0e1..e7b632d04 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -368,7 +368,7 @@ spec: value: "3" containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10082021" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10092021" imagePullPolicy: IfNotPresent resources: limits: @@ -603,7 +603,7 @@ spec: serviceAccountName: omsagent containers: - name: omsagent - image: 
"mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10082021" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10092021" imagePullPolicy: IfNotPresent resources: limits: @@ -776,7 +776,7 @@ spec: value: "3" containers: - name: omsagent-win - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod10082021" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod10092021" imagePullPolicy: IfNotPresent resources: limits: diff --git a/kubernetes/windows/Dockerfile b/kubernetes/windows/Dockerfile index 76667f389..0e6591e3f 100644 --- a/kubernetes/windows/Dockerfile +++ b/kubernetes/windows/Dockerfile @@ -3,7 +3,7 @@ MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="Azure Monitor for containers" -ARG IMAGE_TAG=win-ciprod10082021 +ARG IMAGE_TAG=win-ciprod10092021 # Do not split this into multiple RUN! # Docker creates a layer for every RUN-Statement From d16d84b9e6afa81c169d638aeab948e9b5c8d418 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 11 Oct 2021 14:26:47 -0700 Subject: [PATCH 169/194] changes related to mdsd version update (#673) (#674) --- ReleaseNotes.md | 6 +++--- kubernetes/linux/Dockerfile | 2 +- kubernetes/linux/setup.sh | 2 +- kubernetes/omsagent.yaml | 6 +++--- kubernetes/windows/Dockerfile | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/ReleaseNotes.md b/ReleaseNotes.md index 98b1ef3ce..3e08481ee 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -11,9 +11,9 @@ additional questions or comments. Note : The agent version(s) below has dates (ciprod), which indicate the agent build dates (not release dates) -### 10/08/2021 - -##### Version microsoft/oms:ciprod10092021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10092021 (linux) -##### Version microsoft/oms:win-ciprod10092021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod10092021 (windows) +### 10/11/2021 - +##### Version microsoft/oms:ciprod10112021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10112021 (linux) +##### Version microsoft/oms:win-ciprod10112021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod10112021 (windows) ##### Code change log - Linux Agent - MDSD Proxy support for non-AKS diff --git a/kubernetes/linux/Dockerfile b/kubernetes/linux/Dockerfile index c3f952d4e..9b2241c7b 100644 --- a/kubernetes/linux/Dockerfile +++ b/kubernetes/linux/Dockerfile @@ -2,7 +2,7 @@ FROM ubuntu:18.04 MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="Azure Monitor for containers" -ARG IMAGE_TAG=ciprod10092021 +ARG IMAGE_TAG=ciprod10112021 ENV AGENT_VERSION ${IMAGE_TAG} ENV tmpdir /opt ENV APPLICATIONINSIGHTS_AUTH NzAwZGM5OGYtYTdhZC00NThkLWI5NWMtMjA3ZjM3NmM3YmRi diff --git a/kubernetes/linux/setup.sh b/kubernetes/linux/setup.sh index 371d26fa5..7baae7954 100644 --- a/kubernetes/linux/setup.sh +++ b/kubernetes/linux/setup.sh @@ -10,7 +10,7 @@ sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen && \ update-locale LANG=en_US.UTF-8 #install oneagent - Official bits (10/7/2021) -wget https://github.com/microsoft/Docker-Provider/releases/download/1.14/azure-mdsd_1.14.0-build.master.279_x86_64.deb +wget https://github.com/microsoft/Docker-Provider/releases/download/1.14/azure-mdsd_1.14.1-build.master.283_x86_64.deb /usr/bin/dpkg -i $TMPDIR/azure-mdsd*.deb cp -f $TMPDIR/mdsd.xml /etc/mdsd.d diff --git a/kubernetes/omsagent.yaml 
b/kubernetes/omsagent.yaml index e7b632d04..a608b8f0c 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -368,7 +368,7 @@ spec: value: "3" containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10092021" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10112021" imagePullPolicy: IfNotPresent resources: limits: @@ -603,7 +603,7 @@ spec: serviceAccountName: omsagent containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10092021" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10112021" imagePullPolicy: IfNotPresent resources: limits: @@ -776,7 +776,7 @@ spec: value: "3" containers: - name: omsagent-win - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod10092021" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod10112021" imagePullPolicy: IfNotPresent resources: limits: diff --git a/kubernetes/windows/Dockerfile b/kubernetes/windows/Dockerfile index 0e6591e3f..5b187d91a 100644 --- a/kubernetes/windows/Dockerfile +++ b/kubernetes/windows/Dockerfile @@ -3,7 +3,7 @@ MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="Azure Monitor for containers" -ARG IMAGE_TAG=win-ciprod10092021 +ARG IMAGE_TAG=win-ciprod10112021 # Do not split this into multiple RUN! # Docker creates a layer for every RUN-Statement From ce65f2cbfe4e37dd5f203dd8f0372b782d346920 Mon Sep 17 00:00:00 2001 From: sarahpeiffer <46665092+sarahpeiffer@users.noreply.github.com> Date: Tue, 12 Oct 2021 12:48:55 -0700 Subject: [PATCH 170/194] Sarah/enable metrics (#675) * add user assigned msi to yaml for pipeline * update placeholders --- .pipelines/update-place-holders-in-yaml.sh | 5 +++++ kubernetes/omsagent.yaml | 8 ++++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/.pipelines/update-place-holders-in-yaml.sh b/.pipelines/update-place-holders-in-yaml.sh index 906303667..6b962bf72 100755 --- a/.pipelines/update-place-holders-in-yaml.sh +++ b/.pipelines/update-place-holders-in-yaml.sh @@ -10,6 +10,7 @@ do case "$KEY" in ClusterResourceId) ClusterResourceId=$VALUE ;; ClusterRegion) ClusterRegion=$VALUE ;; + UserAssignedIdentityClientId) UserAssignedIdentityClientId=$VALUE ;; CIRelease) CI_RELEASE=$VALUE ;; CIImageTagSuffix) CI_IMAGE_TAG_SUFFIX=$VALUE ;; *) @@ -24,6 +25,10 @@ echo "clusterRegion:$ClusterRegion" echo "replace cluster region" sed -i "s/VALUE_AKS_RESOURCE_REGION_VALUE/$ClusterRegion/g" omsagent.yaml +echo "userAssignedIdentityClientId:$UserAssignedIdentityClientId" +echo "replace user assigned identity client id" +sed -i "s=VALUE_USER_ASSIGNED_IDENTITY_CLIENT_ID_VALUE=$UserAssignedIdentityClientId=g" omsagent.yaml + echo "replace linux agent image" linuxAgentImageTag=$CI_RELEASE$CI_IMAGE_TAG_SUFFIX echo "Linux Agent Image Tag:"$linuxAgentImageTag diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index a608b8f0c..616dcc889 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -403,7 +403,7 @@ spec: fieldPath: status.hostIP # Update this with the user assigned msi client id for omsagent - name: USER_ASSIGNED_IDENTITY_CLIENT_ID - value: "" + value: "VALUE_USER_ASSIGNED_IDENTITY_CLIENT_ID_VALUE" - name: AZMON_CONTAINERLOGS_ONEAGENT_REGIONS value: "koreacentral,norwayeast,eastus2" - name: USING_AAD_MSI_AUTH @@ -486,7 +486,7 @@ spec: # fieldPath: status.hostIP # # Update this with the user assigned msi client id for omsagent # - 
name: USER_ASSIGNED_IDENTITY_CLIENT_ID - # value: "" + # value: "VALUE_USER_ASSIGNED_IDENTITY_CLIENT_ID_VALUE" # - name: USING_AAD_MSI_AUTH # value: "false" # securityContext: @@ -631,7 +631,7 @@ spec: fieldPath: status.hostIP # Update this with the user assigned msi client id for omsagent - name: USER_ASSIGNED_IDENTITY_CLIENT_ID - value: "" + value: "VALUE_USER_ASSIGNED_IDENTITY_CLIENT_ID_VALUE" # Add the below environment variable to true only in sidecar enabled regions, else set it to false - name: SIDECAR_SCRAPING_ENABLED value: "false" @@ -808,7 +808,7 @@ spec: value: "false" # Update this with the user assigned msi client id for omsagent - name: USER_ASSIGNED_IDENTITY_CLIENT_ID - value: "" + value: "VALUE_USER_ASSIGNED_IDENTITY_CLIENT_ID_VALUE" # Add this only for clouds that require cert bootstrapping # - name: REQUIRES_CERT_BOOTSTRAP # value: "true" From 608f92e9e8ba8d6223d7428630748da0ebfe4df9 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 12 Oct 2021 17:58:14 -0700 Subject: [PATCH 171/194] Gangams/chart updates oct2021 release (#676) * chart updates for oct2021 release * wip * wip * wip --- charts/azuremonitor-containers/Chart.yaml | 2 +- .../templates/omsagent-daemonset-windows.yaml | 8 ++- .../templates/omsagent-daemonset.yaml | 67 +++++++++++++++++++ .../templates/omsagent-deployment.yaml | 8 +-- charts/azuremonitor-containers/values.yaml | 29 +++++--- 5 files changed, 97 insertions(+), 17 deletions(-) diff --git a/charts/azuremonitor-containers/Chart.yaml b/charts/azuremonitor-containers/Chart.yaml index 00f3f49ed..4dd6623bf 100644 --- a/charts/azuremonitor-containers/Chart.yaml +++ b/charts/azuremonitor-containers/Chart.yaml @@ -2,7 +2,7 @@ apiVersion: v1 appVersion: 7.0.0-1 description: Helm chart for deploying Azure Monitor container monitoring agent in Kubernetes name: azuremonitor-containers -version: 2.8.3 +version: 2.9.0 kubeVersion: "^1.10.0-0" keywords: - monitoring diff --git a/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml b/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml index efed76f7d..78831aa10 100644 --- a/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml @@ -86,7 +86,7 @@ spec: fieldRef: fieldPath: metadata.name - name: SIDECAR_SCRAPING_ENABLED - value: "false" + value: {{ .Values.omsagent.sidecarscraping | quote }} volumeMounts: - mountPath: C:\ProgramData\docker\containers name: docker-windows-containers @@ -104,7 +104,11 @@ spec: command: - cmd - /c - - C:\opt\omsagentwindows\scripts\cmd\livenessProbe.cmd + - C:\opt\omsagentwindows\scripts\cmd\livenessprobe.exe + - fluent-bit.exe + - fluentdwinaks + - "C:\\etc\\omsagentwindows\\filesystemwatcher.txt" + - "C:\\etc\\omsagentwindows\\renewcertificate.txt" periodSeconds: 60 initialDelaySeconds: 180 timeoutSeconds: 15 diff --git a/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml b/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml index 7201ee6ae..8e5513f91 100644 --- a/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml @@ -132,6 +132,69 @@ spec: initialDelaySeconds: 60 periodSeconds: 60 timeoutSeconds: 15 + {{- if .Values.omsagent.sidecarscraping }} + - name: omsagent-prometheus + {{- if eq (.Values.omsagent.domain | lower) "opinsights.azure.cn" }} + image: "mcr.azk8s.cn/azuremonitor/containerinsights/ciprod:{{ 
.Values.omsagent.image.tag }}" + {{- else }} + image: {{ printf "%s:%s" .Values.omsagent.image.repo .Values.omsagent.image.tag }} + {{- end }} + imagePullPolicy: IfNotPresent + resources: +{{ toYaml .Values.omsagent.resources.daemonsetlinuxsidecar | indent 9 }} + env: + {{- if ne .Values.omsagent.env.clusterId "" }} + - name: AKS_RESOURCE_ID + value: {{ .Values.omsagent.env.clusterId | quote }} + {{- if ne .Values.omsagent.env.clusterRegion "" }} + - name: AKS_REGION + value: {{ .Values.omsagent.env.clusterRegion | quote }} + {{- end }} + {{- else if ne .Values.Azure.Cluster.ResourceId "" }} + - name: AKS_RESOURCE_ID + value: {{ .Values.Azure.Cluster.ResourceId | quote }} + {{- if ne .Values.Azure.Cluster.Region "" }} + - name: AKS_REGION + value: {{ .Values.Azure.Cluster.Region | quote }} + {{- end }} + {{- else }} + - name: ACS_RESOURCE_NAME + value: {{ .Values.omsagent.env.clusterName | quote }} + {{- end }} + - name: CONTROLLER_TYPE + value: "DaemonSet" + - name: CONTAINER_TYPE + value: "PrometheusSidecar" + - name: NODE_IP + valueFrom: + fieldRef: + fieldPath: status.hostIP + - name: ISTEST + value: {{ .Values.omsagent.ISTEST | quote }} + securityContext: + privileged: true + volumeMounts: + - mountPath: /etc/kubernetes/host + name: azure-json-path + - mountPath: /etc/omsagent-secret + name: omsagent-secret + readOnly: true + - mountPath: /etc/config/settings + name: settings-vol-config + readOnly: true + - mountPath: /etc/config/osm-settings + name: osm-settings-vol-config + readOnly: true + livenessProbe: + exec: + command: + - /bin/bash + - -c + - /opt/livenessprobe.sh + initialDelaySeconds: 60 + periodSeconds: 60 + timeoutSeconds: 15 + {{- end }} {{- with .Values.omsagent.daemonset.affinity }} affinity: {{- toYaml . | nindent 8 }} {{- end }} @@ -173,4 +236,8 @@ spec: secret: secretName: omsagent-adx-secret optional: true + - name: osm-settings-vol-config + configMap: + name: container-azm-ms-osmconfig + optional: true {{- end }} diff --git a/charts/azuremonitor-containers/templates/omsagent-deployment.yaml b/charts/azuremonitor-containers/templates/omsagent-deployment.yaml index fdc520cba..1eaf7f652 100644 --- a/charts/azuremonitor-containers/templates/omsagent-deployment.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-deployment.yaml @@ -69,14 +69,14 @@ spec: fieldPath: status.hostIP {{- if not (empty .Values.Azure.Extension.Name) }} - name: ARC_K8S_EXTENSION_NAME - value: {{ .Values.Azure.Extension.Name | quote }} - {{- end }} + value: {{ .Values.Azure.Extension.Name | quote }} + {{- end }} - name: USER_ASSIGNED_IDENTITY_CLIENT_ID value: "" - name: SIDECAR_SCRAPING_ENABLED - value: "false" + value: {{ .Values.omsagent.sidecarscraping | quote }} - name: ISTEST - value: {{ .Values.omsagent.ISTEST | quote }} + value: {{ .Values.omsagent.ISTEST | quote }} securityContext: privileged: true ports: diff --git a/charts/azuremonitor-containers/values.yaml b/charts/azuremonitor-containers/values.yaml index 9dd5317a4..0d78ed50f 100644 --- a/charts/azuremonitor-containers/values.yaml +++ b/charts/azuremonitor-containers/values.yaml @@ -17,14 +17,14 @@ Azure: httpProxy: "" httpsProxy: "" noProxy: "" - proxyCert: "" + proxyCert: "" omsagent: image: repo: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod" - tag: "ciprod04222021" - tagWindows: "win-ciprod04222021" + tag: "ciprod10112021" + tagWindows: "win-ciprod10112021" pullPolicy: IfNotPresent - dockerProviderVersion: "15.0.0-0" + dockerProviderVersion: "16.0.0-0" agentVersion: "1.10.0.1" # The priority used by the 
omsagent priority class for the daemonset pods @@ -39,7 +39,7 @@ omsagent: # chance to build pod for the node and give it to the scheduler) # Should be some number greater than default (0) priority: 10 - + # This used for running agent pods in test mode. # if set to true additional agent workflow logs will be emitted which are used for e2e and arc k8s conformance testing ISTEST: false @@ -58,10 +58,11 @@ omsagent: clusterId: clusterRegion: rbac: true + sidecarscraping: true logsettings: - logflushintervalsecs: "" - tailbufchunksizemegabytes: "" - tailbufmaxsizemegabytes: "" + logflushintervalsecs: "15" + tailbufchunksizemegabytes: "1" + tailbufmaxsizemegabytes: "1" ## Applicable for only Azure Stack Edge K8s since it has custom mount path for container logs which will have symlink to /var/log path custommountpath: "" @@ -171,10 +172,10 @@ omsagent: daemonsetlinux: requests: cpu: 75m - memory: 225Mi + memory: 325Mi limits: cpu: 150m - memory: 600Mi + memory: 750Mi daemonsetwindows: limits: cpu: 200m @@ -186,3 +187,11 @@ omsagent: limits: cpu: 1 memory: 1Gi + daemonsetlinuxsidecar: + limits: + cpu: 500m + memory: 1Gi + requests: + cpu: 75m + memory: 225Mi + From ab98c4b6eb61bda5ec6633e0ff6b3ea1121c41f2 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Wed, 13 Oct 2021 16:53:01 -0700 Subject: [PATCH 172/194] Gangams/msi mode mdsd crash fix (#677) * update mdsd version which has fix for crash in msi mode * image tag updates --- ReleaseNotes.md | 6 +++--- charts/azuremonitor-containers/values.yaml | 4 ++-- kubernetes/linux/Dockerfile | 2 +- kubernetes/linux/setup.sh | 2 +- kubernetes/omsagent.yaml | 8 ++++---- kubernetes/windows/Dockerfile | 2 +- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/ReleaseNotes.md b/ReleaseNotes.md index 3e08481ee..c8a147044 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -11,9 +11,9 @@ additional questions or comments. 
Note : The agent version(s) below has dates (ciprod), which indicate the agent build dates (not release dates) -### 10/11/2021 - -##### Version microsoft/oms:ciprod10112021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10112021 (linux) -##### Version microsoft/oms:win-ciprod10112021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod10112021 (windows) +### 10/13/2021 - +##### Version microsoft/oms:ciprod10132021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10132021 (linux) +##### Version microsoft/oms:win-ciprod10132021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod10132021 (windows) ##### Code change log - Linux Agent - MDSD Proxy support for non-AKS diff --git a/charts/azuremonitor-containers/values.yaml b/charts/azuremonitor-containers/values.yaml index 0d78ed50f..3ca313d38 100644 --- a/charts/azuremonitor-containers/values.yaml +++ b/charts/azuremonitor-containers/values.yaml @@ -21,8 +21,8 @@ Azure: omsagent: image: repo: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod" - tag: "ciprod10112021" - tagWindows: "win-ciprod10112021" + tag: "ciprod10132021" + tagWindows: "win-ciprod10132021" pullPolicy: IfNotPresent dockerProviderVersion: "16.0.0-0" agentVersion: "1.10.0.1" diff --git a/kubernetes/linux/Dockerfile b/kubernetes/linux/Dockerfile index 9b2241c7b..90acb4959 100644 --- a/kubernetes/linux/Dockerfile +++ b/kubernetes/linux/Dockerfile @@ -2,7 +2,7 @@ FROM ubuntu:18.04 MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="Azure Monitor for containers" -ARG IMAGE_TAG=ciprod10112021 +ARG IMAGE_TAG=ciprod10132021 ENV AGENT_VERSION ${IMAGE_TAG} ENV tmpdir /opt ENV APPLICATIONINSIGHTS_AUTH NzAwZGM5OGYtYTdhZC00NThkLWI5NWMtMjA3ZjM3NmM3YmRi diff --git a/kubernetes/linux/setup.sh b/kubernetes/linux/setup.sh index 7baae7954..243677dd0 100644 --- a/kubernetes/linux/setup.sh +++ b/kubernetes/linux/setup.sh @@ -10,7 +10,7 @@ sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen && \ update-locale LANG=en_US.UTF-8 #install oneagent - Official bits (10/7/2021) -wget https://github.com/microsoft/Docker-Provider/releases/download/1.14/azure-mdsd_1.14.1-build.master.283_x86_64.deb +wget https://github.com/microsoft/Docker-Provider/releases/download/1.14/azure-mdsd_1.14.2-build.master.284_x86_64.deb /usr/bin/dpkg -i $TMPDIR/azure-mdsd*.deb cp -f $TMPDIR/mdsd.xml /etc/mdsd.d diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 616dcc889..66f8c4010 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -368,7 +368,7 @@ spec: value: "3" containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10112021" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10132021" imagePullPolicy: IfNotPresent resources: limits: @@ -454,7 +454,7 @@ spec: timeoutSeconds: 15 #Only in sidecar scraping mode # - name: omsagent-prometheus - # image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod06112021" + # image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10132021" # imagePullPolicy: IfNotPresent # resources: # limits: @@ -603,7 +603,7 @@ spec: serviceAccountName: omsagent containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10112021" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10132021" imagePullPolicy: IfNotPresent resources: limits: @@ -776,7 +776,7 @@ 
spec:
       value: "3"
     containers:
     - name: omsagent-win
-      image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod10112021"
+      image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod10132021"
       imagePullPolicy: IfNotPresent
       resources:
         limits:
diff --git a/kubernetes/windows/Dockerfile b/kubernetes/windows/Dockerfile
index 5b187d91a..0ddf67ab2 100644
--- a/kubernetes/windows/Dockerfile
+++ b/kubernetes/windows/Dockerfile
@@ -3,7 +3,7 @@ MAINTAINER OMSContainers@microsoft.com
 LABEL vendor=Microsoft\ Corp \
     com.microsoft.product="Azure Monitor for containers"
-ARG IMAGE_TAG=win-ciprod10112021
+ARG IMAGE_TAG=win-ciprod10132021

 # Do not split this into multiple RUN!
 # Docker creates a layer for every RUN-Statement

From a105a00827331d15122feb0c7af7e0d90861ce52 Mon Sep 17 00:00:00 2001
From: Ganga Mahesh Siddem
Date: Tue, 19 Oct 2021 18:08:48 -0700
Subject: [PATCH 173/194] update to use extension GA api version (#679)

---
 .../arc-k8s-extension/existingClusterOnboarding.json | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/scripts/onboarding/templates/arc-k8s-extension/existingClusterOnboarding.json b/scripts/onboarding/templates/arc-k8s-extension/existingClusterOnboarding.json
index 95e7ba5d0..b2b61f4ab 100644
--- a/scripts/onboarding/templates/arc-k8s-extension/existingClusterOnboarding.json
+++ b/scripts/onboarding/templates/arc-k8s-extension/existingClusterOnboarding.json
@@ -13,7 +13,7 @@
       "metadata": {
         "description": "Location of the Azure Arc Connected Cluster Resource e.g. \"eastus\""
       }
-    },
+    },
     "workspaceResourceId": {
       "type": "string",
       "metadata": {
@@ -83,7 +83,7 @@
       "subscriptionId": "[split(parameters('clusterResourceId'),'/')[2]]",
       "resourceGroup": "[split(parameters('clusterResourceId'),'/')[4]]",
       "dependsOn": [
-        "[Concat('ContainerInsights', '-', uniqueString(parameters('workspaceResourceId')))]"
+        "[Concat('ContainerInsights', '-', uniqueString(parameters('workspaceResourceId')))]"
       ],
       "properties": {
         "mode": "Incremental",
@@ -95,7 +95,7 @@
           "resources": [
             {
               "type": "Microsoft.KubernetesConfiguration/extensions",
-              "apiVersion": "2020-07-01-preview",
+              "apiVersion": "2021-09-01",
               "name": "azuremonitor-containers",
               "location": "[parameters('clusterRegion')]",
               "identity": {"type": "systemassigned"},
@@ -107,7 +107,7 @@
               },
               "configurationProtectedSettings": {
                 "omsagent.secret.wsid": "[reference(parameters('workspaceResourceId'), '2015-03-20').customerId]",
-                "omsagent.secret.key": "[listKeys(parameters('workspaceResourceId'), '2015-03-20').primarySharedKey]"
+                "omsagent.secret.key": "[listKeys(parameters('workspaceResourceId'), '2015-03-20').primarySharedKey]"
               },
               "autoUpgradeMinorVersion": true,
               "releaseTrain": "Stable",
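Editor's note: the patch above moves the Arc extension resource to the GA `2021-09-01` api-version. For context, the CLI path that this template mirrors looks roughly like the sketch below (this assumes the `k8s-extension` CLI extension is installed; the cluster name, resource group, and workspace resource id are placeholders, not values from this repo):

```bash
# Hypothetical example: create the same azuremonitor-containers extension via CLI.
az k8s-extension create \
    --name azuremonitor-containers \
    --cluster-name <cluster-name> \
    --resource-group <resource-group> \
    --cluster-type connectedClusters \
    --extension-type Microsoft.AzureMonitor.Containers \
    --configuration-settings logAnalyticsWorkspaceResourceID=<workspace-resource-id>
```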
From 87ff2813008caea661b8529d411a69ea7c443bd0 Mon Sep 17 00:00:00 2001
From: Ganga Mahesh Siddem
Date: Tue, 19 Oct 2021 18:09:07 -0700
Subject: [PATCH 174/194] Gangams/arm template msi onboarding (#659)

* wip
* wip
* working
* working
* working
* working
* working
* working
* shorten dcr prefix to DCR- to handle default workspace name length
* use MSCI- prefix similar to MSVMI- for dcr
---
 .../existingClusterOnboarding.json | 210 ++++++++++++++++++
 .../existingClusterParam.json      |  32 +++
 2 files changed, 242 insertions(+)
 create mode 100644 scripts/onboarding/aks/onboarding-using-msi-auth/existingClusterOnboarding.json
 create mode 100644 scripts/onboarding/aks/onboarding-using-msi-auth/existingClusterParam.json

diff --git a/scripts/onboarding/aks/onboarding-using-msi-auth/existingClusterOnboarding.json b/scripts/onboarding/aks/onboarding-using-msi-auth/existingClusterOnboarding.json
new file mode 100644
index 000000000..c77e3203d
--- /dev/null
+++ b/scripts/onboarding/aks/onboarding-using-msi-auth/existingClusterOnboarding.json
@@ -0,0 +1,210 @@
+{
+  "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#",
+  "contentVersion": "1.0.0.0",
+  "parameters": {
+    "aksResourceId": {
+      "type": "string",
+      "metadata": {
+        "description": "AKS Cluster Resource ID"
+      }
+    },
+    "aksResourceLocation": {
+      "type": "string",
+      "metadata": {
+        "description": "Location of the AKS resource e.g. \"East US\""
+      }
+    },
+    "aksResourceTagValues": {
+      "type": "object",
+      "metadata": {
+        "description": "All existing tags on the AKS Cluster Resource"
+      }
+    },
+    "workspaceLocation": {
+      "type": "string",
+      "metadata": {
+        "description": "Workspace Location for data collection rule"
+      }
+    },
+    "workspaceResourceId": {
+      "type": "string",
+      "metadata": {
+        "description": "Full Resource ID of the log analytics workspace that will be used for data destination. For example /subscriptions/00000000-0000-0000-0000-0000-00000000/resourceGroups/ResourceGroupName/providers/Microsoft.operationalinsights/workspaces/ws_xyz"
+      }
+    },
+    "dcrResourceTagValues": {
+      "type": "object",
+      "metadata": {
+        "description": "Existing or new tags on the DCR resource"
+      }
+    }
+  },
+  "variables": {
+    "clusterSubscriptionId": "[split(parameters('aksResourceId'),'/')[2]]",
+    "clusterResourceGroup": "[split(parameters('aksResourceId'),'/')[4]]",
+    "clusterName": "[split(parameters('aksResourceId'),'/')[8]]",
+    "workspaceSubscriptionId": "[split(parameters('workspaceResourceId'),'/')[2]]",
+    "workspaceResourceGroup": "[split(parameters('workspaceResourceId'),'/')[4]]",
+    "dcrName": "[Concat('MSCI', '-', split(parameters('workspaceResourceId'),'/')[8])]",
+    "associationName": "ContainerInsightsExtension",
+    "dataCollectionRuleId": "[resourceId(variables('workspaceSubscriptionId'), variables('workspaceResourceGroup'), 'Microsoft.Insights/dataCollectionRules', variables('dcrName'))]"
+  },
+  "resources": [
+    {
+      "type": "Microsoft.Resources/deployments",
+      "name": "[Concat('aks-monitoring-msi-dcr', '-', uniqueString(variables('dcrName')))]",
+      "apiVersion": "2017-05-10",
+      "subscriptionId": "[variables('workspaceSubscriptionId')]",
+      "resourceGroup": "[variables('workspaceResourceGroup')]",
+      "properties": {
+        "mode": "Incremental",
+        "template": {
+          "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#",
+          "contentVersion": "1.0.0.0",
+          "parameters": {},
+          "variables": {},
+          "resources": [
+            {
+              "type": "Microsoft.Insights/dataCollectionRules",
+              "apiVersion": "2019-11-01-preview",
+              "name": "[variables('dcrName')]",
+              "location": "[parameters('workspaceLocation')]",
+              "tags": "[parameters('dcrResourceTagValues')]",
+              "kind": "Linux",
+              "properties": {
+                "dataSources": {
+                  "extensions": [
+                    {
+                      "name": "ContainerInsightsExtension",
+                      "streams": [
+                        "Microsoft-Perf",
+                        "Microsoft-ContainerInventory",
+                        "Microsoft-ContainerLog",
+                        "Microsoft-ContainerLogV2",
+                        "Microsoft-ContainerNodeInventory",
+                        "Microsoft-KubeEvents",
+                        "Microsoft-KubeHealth",
+                        "Microsoft-KubeMonAgentEvents",
+                        "Microsoft-KubeNodeInventory",
+                        "Microsoft-KubePodInventory",
+                        "Microsoft-KubePVInventory",
+                        "Microsoft-KubeServices",
+                        "Microsoft-InsightsMetrics"
+                      ],
+                      "extensionName": "ContainerInsights"
+                    }
+                  ]
+                },
+                "destinations": {
+                  "logAnalytics": [
+                    {
+                      "workspaceResourceId":
"[parameters('workspaceResourceId')]", + "name": "ciworkspace" + } + ] + }, + "dataFlows": [ + { + "streams": [ + "Microsoft-Perf", + "Microsoft-ContainerInventory", + "Microsoft-ContainerLog", + "Microsoft-ContainerLogV2", + "Microsoft-ContainerNodeInventory", + "Microsoft-KubeEvents", + "Microsoft-KubeHealth", + "Microsoft-KubeMonAgentEvents", + "Microsoft-KubeNodeInventory", + "Microsoft-KubePodInventory", + "Microsoft-KubePVInventory", + "Microsoft-KubeServices", + "Microsoft-InsightsMetrics" + ], + "destinations": [ + "ciworkspace" + ] + } + ] + } + } + ] + }, + "parameters": {} + } + }, + { + "type": "Microsoft.Resources/deployments", + "name": "[Concat('aks-monitoring-msi-dcra', '-', uniqueString(parameters('aksResourceId')))]", + "apiVersion": "2017-05-10", + "subscriptionId": "[variables('clusterSubscriptionId')]", + "resourceGroup": "[variables('clusterResourceGroup')]", + "dependsOn": [ + "[Concat('aks-monitoring-msi-dcr', '-', uniqueString(variables('dcrName')))]" + ], + "properties": { + "mode": "Incremental", + "template": { + "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "parameters": {}, + "variables": {}, + "resources": [ + { + "type": "Microsoft.ContainerService/managedClusters/providers/dataCollectionRuleAssociations", + "name": "[concat(variables('clusterName'),'/microsoft.insights/', variables('associationName'))]", + "apiVersion": "2019-11-01-preview", + "properties": { + "description": "Association of data collection rule. Deleting this association will break the data collection for this AKS Cluster.", + "dataCollectionRuleId": "[variables('dataCollectionRuleId')]" + } + } + + ] + }, + "parameters": {} + } + }, + { + "type": "Microsoft.Resources/deployments", + "name": "[Concat('aks-monitoring-msi-addon', '-', uniqueString(parameters('aksResourceId')))]", + "apiVersion": "2017-05-10", + "subscriptionId": "[variables('clusterSubscriptionId')]", + "resourceGroup": "[variables('clusterResourceGroup')]", + "dependsOn": [ + "[Concat('aks-monitoring-msi-dcra', '-', uniqueString(parameters('aksResourceId')))]" + ], + "properties": { + "mode": "Incremental", + "template": { + "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "parameters": {}, + "variables": {}, + "resources": [ + { + "name": "[variables('clusterName')]", + "type": "Microsoft.ContainerService/managedClusters", + "location": "[parameters('aksResourceLocation')]", + "tags": "[parameters('aksResourceTagValues')]", + "apiVersion": "2018-03-31", + "properties": { + "mode": "Incremental", + "id": "[parameters('aksResourceId')]", + "addonProfiles": { + "omsagent": { + "enabled": true, + "config": { + "logAnalyticsWorkspaceResourceID": "[parameters('workspaceResourceId')]", + "useAADAuth": "true" + } + } + } + } + } + ] + }, + "parameters": {} + } + } + ] +} diff --git a/scripts/onboarding/aks/onboarding-using-msi-auth/existingClusterParam.json b/scripts/onboarding/aks/onboarding-using-msi-auth/existingClusterParam.json new file mode 100644 index 000000000..31f0f9c49 --- /dev/null +++ b/scripts/onboarding/aks/onboarding-using-msi-auth/existingClusterParam.json @@ -0,0 +1,32 @@ +{ + "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentParameters.json#", + "contentVersion": "1.0.0.0", + "parameters": { + "aksResourceId": { + "value": "/subscriptions//resourcegroups//providers/Microsoft.ContainerService/managedClusters/" + }, + 
"aksResourceLocation": { + "value": "" + }, + "aksResourceTagValues": { + "value": { + "": "", + "": "", + "": "" + } + }, + "workspaceResourceId": { + "value": "/subscriptions//resourceGroups//providers/Microsoft.OperationalInsights/workspaces/" + }, + "workspaceLocation": { + "value": "" + }, + "dcrResourceTagValues": { + "value": { + "": "", + "": "", + "": "" + } + } + } + } From ac5dec34dd35f5afa083ff0004caf19e1264d965 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Thu, 21 Oct 2021 17:40:54 -0700 Subject: [PATCH 175/194] Gangams/conf test updates to handle sidecar (#681) * wip * test updates * fix pr feedback * fix pr feedback --- test/e2e/conformance.yaml | 2 +- test/e2e/src/common/constants.py | 3 +++ test/e2e/src/common/kubernetes_pod_utility.py | 6 +++--- test/e2e/src/core/Dockerfile | 2 +- test/e2e/src/tests/test_ds_workflows.py | 2 +- test/e2e/src/tests/test_resource_status.py | 4 ++++ test/e2e/src/tests/test_rs_workflows.py | 4 +--- 7 files changed, 14 insertions(+), 9 deletions(-) diff --git a/test/e2e/conformance.yaml b/test/e2e/conformance.yaml index ff790e690..71e40a6a2 100644 --- a/test/e2e/conformance.yaml +++ b/test/e2e/conformance.yaml @@ -3,7 +3,7 @@ sonobuoy-config: plugin-name: azure-arc-ci-conformance result-format: junit spec: - image: mcr.microsoft.com/azuremonitor/containerinsights/cidev:ciconftest08142021 + image: mcr.microsoft.com/azuremonitor/containerinsights/cidev:ciconftest10202021 imagePullPolicy: Always name: plugin resources: {} diff --git a/test/e2e/src/common/constants.py b/test/e2e/src/common/constants.py index 392b10554..c557a1c91 100644 --- a/test/e2e/src/common/constants.py +++ b/test/e2e/src/common/constants.py @@ -40,6 +40,9 @@ TIMEOUT = 300 +# omsagent main container name +OMSAGENT_MAIN_CONTAINER_NAME = 'omsagent' + # WAIT TIME BEFORE READING THE AGENT LOGS AGENT_WAIT_TIME_SECS = "180" # Azure Monitor for Container Extension related diff --git a/test/e2e/src/common/kubernetes_pod_utility.py b/test/e2e/src/common/kubernetes_pod_utility.py index 27345fae7..d70f443f0 100644 --- a/test/e2e/src/common/kubernetes_pod_utility.py +++ b/test/e2e/src/common/kubernetes_pod_utility.py @@ -20,12 +20,12 @@ def get_pod_list(api_instance, namespace, label_selector=""): pytest.fail("Error occurred when retrieving pod information: " + str(e)) # get the content of the log file in the container via exec -def get_log_file_content(api_instance, namespace, podName, logfilePath): +def get_log_file_content(api_instance, namespace, podName, containerName, logfilePath): try: exec_command = ['tar','cf', '-', logfilePath] - return stream(api_instance.connect_get_namespaced_pod_exec, podName, namespace, command=exec_command, stderr=True, stdin=False, stdout=True, tty=False) + return stream(api_instance.connect_get_namespaced_pod_exec, podName, namespace, command=exec_command, container=containerName, stderr=True, stdin=False, stdout=True, tty=False) except Exception as e: - pytest.fail("Error occurred when retrieving log file content: " + str(e)) + pytest.fail("Error occurred when retrieving log file content: " + str(e)) # Function that watches events corresponding to pods in the given namespace and passes the events to a callback function def watch_pod_status(api_instance, namespace, timeout, callback=None): diff --git a/test/e2e/src/core/Dockerfile b/test/e2e/src/core/Dockerfile index cd85aee40..52bcd7cf8 100644 --- a/test/e2e/src/core/Dockerfile +++ b/test/e2e/src/core/Dockerfile @@ -6,7 +6,7 @@ RUN curl 
From ac5dec34dd35f5afa083ff0004caf19e1264d965 Mon Sep 17 00:00:00 2001
From: Ganga Mahesh Siddem
Date: Thu, 21 Oct 2021 17:40:54 -0700
Subject: [PATCH 175/194] Gangams/conf test updates to handle sidecar (#681)

* wip
* test updates
* fix pr feedback
* fix pr feedback
---
 test/e2e/conformance.yaml                     | 2 +-
 test/e2e/src/common/constants.py              | 3 +++
 test/e2e/src/common/kubernetes_pod_utility.py | 6 +++---
 test/e2e/src/core/Dockerfile                  | 2 +-
 test/e2e/src/tests/test_ds_workflows.py       | 2 +-
 test/e2e/src/tests/test_resource_status.py    | 4 ++++
 test/e2e/src/tests/test_rs_workflows.py       | 4 +---
 7 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/test/e2e/conformance.yaml b/test/e2e/conformance.yaml
index ff790e690..71e40a6a2 100644
--- a/test/e2e/conformance.yaml
+++ b/test/e2e/conformance.yaml
@@ -3,7 +3,7 @@ sonobuoy-config:
   plugin-name: azure-arc-ci-conformance
   result-format: junit
 spec:
-  image: mcr.microsoft.com/azuremonitor/containerinsights/cidev:ciconftest08142021
+  image: mcr.microsoft.com/azuremonitor/containerinsights/cidev:ciconftest10202021
   imagePullPolicy: Always
   name: plugin
   resources: {}
diff --git a/test/e2e/src/common/constants.py b/test/e2e/src/common/constants.py
index 392b10554..c557a1c91 100644
--- a/test/e2e/src/common/constants.py
+++ b/test/e2e/src/common/constants.py
@@ -40,6 +40,9 @@
 TIMEOUT = 300

+# omsagent main container name
+OMSAGENT_MAIN_CONTAINER_NAME = 'omsagent'
+
 # WAIT TIME BEFORE READING THE AGENT LOGS
 AGENT_WAIT_TIME_SECS = "180"
 # Azure Monitor for Container Extension related
diff --git a/test/e2e/src/common/kubernetes_pod_utility.py b/test/e2e/src/common/kubernetes_pod_utility.py
index 27345fae7..d70f443f0 100644
--- a/test/e2e/src/common/kubernetes_pod_utility.py
+++ b/test/e2e/src/common/kubernetes_pod_utility.py
@@ -20,12 +20,12 @@ def get_pod_list(api_instance, namespace, label_selector=""):
     pytest.fail("Error occurred when retrieving pod information: " + str(e))

 # get the content of the log file in the container via exec
-def get_log_file_content(api_instance, namespace, podName, logfilePath):
+def get_log_file_content(api_instance, namespace, podName, containerName, logfilePath):
   try:
     exec_command = ['tar','cf', '-', logfilePath]
-    return stream(api_instance.connect_get_namespaced_pod_exec, podName, namespace, command=exec_command, stderr=True, stdin=False, stdout=True, tty=False)
+    return stream(api_instance.connect_get_namespaced_pod_exec, podName, namespace, command=exec_command, container=containerName, stderr=True, stdin=False, stdout=True, tty=False)
   except Exception as e:
-    pytest.fail("Error occurred when retrieving log file content: " + str(e))
+    pytest.fail("Error occurred when retrieving log file content: " + str(e))

 # Function that watches events corresponding to pods in the given namespace and passes the events to a callback function
 def watch_pod_status(api_instance, namespace, timeout, callback=None):
diff --git a/test/e2e/src/core/Dockerfile b/test/e2e/src/core/Dockerfile
index cd85aee40..52bcd7cf8 100644
--- a/test/e2e/src/core/Dockerfile
+++ b/test/e2e/src/core/Dockerfile
@@ -6,7 +6,7 @@ RUN curl https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 | bash \
     && helm version

 RUN apt-get update && apt-get -y upgrade && \
-    apt-get -f -y install curl apt-transport-https lsb-release gnupg python3-pip python-pip && \
+    apt-get -f -y install curl apt-transport-https lsb-release gnupg python3-pip && \
     curl -sL https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor > /etc/apt/trusted.gpg.d/microsoft.asc.gpg && \
     CLI_REPO=$(lsb_release -cs) && \
     echo "deb [arch=amd64] https://packages.microsoft.com/repos/azure-cli/ ${CLI_REPO} main" \
diff --git a/test/e2e/src/tests/test_ds_workflows.py b/test/e2e/src/tests/test_ds_workflows.py
index 731957788..e6d651e49 100755
--- a/test/e2e/src/tests/test_ds_workflows.py
+++ b/test/e2e/src/tests/test_ds_workflows.py
@@ -51,7 +51,7 @@ def test_ds_workflows(env_dict):
     for podItem in pod_list.items:
         podName = podItem.metadata.name
         logcontent = get_log_file_content(
-            api_instance, constants.AGENT_RESOURCES_NAMESPACE, podName, agentLogPath)
+            api_instance, constants.AGENT_RESOURCES_NAMESPACE, podName, constants.OMSAGENT_MAIN_CONTAINER_NAME, agentLogPath)
         if not logcontent:
             pytest.fail("logcontent should not be null or empty for pod: " + podName)
         loglines = logcontent.split("\n")
diff --git a/test/e2e/src/tests/test_resource_status.py b/test/e2e/src/tests/test_resource_status.py
index f2b5569e9..c240cbcf2 100755
--- a/test/e2e/src/tests/test_resource_status.py
+++ b/test/e2e/src/tests/test_resource_status.py
@@ -1,5 +1,6 @@
 import pytest
 import constants
+import time

 from kubernetes import client, config
 from results_utility import append_result_output
@@ -21,6 +22,9 @@ def test_resource_status(env_dict):
     except Exception as e:
         pytest.fail("Error loading the in-cluster config: " + str(e))

+    waitTimeSeconds = env_dict['AGENT_WAIT_TIME_SECS']
+    time.sleep(int(waitTimeSeconds))
+
     # checking the deployment status
     check_kubernetes_deployment_status(
         constants.AGENT_RESOURCES_NAMESPACE, constants.AGENT_DEPLOYMENT_NAME, env_dict['TEST_AGENT_LOG_FILE'])
diff --git a/test/e2e/src/tests/test_rs_workflows.py b/test/e2e/src/tests/test_rs_workflows.py
index 36ec05867..6a29dcc73 100755
--- a/test/e2e/src/tests/test_rs_workflows.py
+++ b/test/e2e/src/tests/test_rs_workflows.py
@@ -39,9 +39,7 @@ def test_rs_workflows(env_dict):

     waitTimeSeconds = env_dict['AGENT_WAIT_TIME_SECS']
-    print("start: waiting for seconds: {} for agent workflows to get emitted".format(waitTimeSeconds))
     time.sleep(int(waitTimeSeconds))
-    print("complete: waiting for seconds: {} for agent workflows to get emitted".format(waitTimeSeconds))

     isOMSBaseAgent = env_dict.get('USING_OMSAGENT_BASE_AGENT')
     agentLogPath = constants.AGENT_FLUENTD_LOG_PATH
@@ -49,7 +47,7 @@
         agentLogPath = constants.AGENT_OMSAGENT_LOG_PATH

     logcontent = get_log_file_content(
-        api_instance, constants.AGENT_RESOURCES_NAMESPACE, rspodName, agentLogPath)
+        api_instance, constants.AGENT_RESOURCES_NAMESPACE, rspodName, constants.OMSAGENT_MAIN_CONTAINER_NAME, agentLogPath)
     if not logcontent:
         pytest.fail("logcontent should not be null or empty for rs pod: {}".format(rspodName))
     loglines = logcontent.split("\n")

From 0bd3056e54ee82a113d6dbe65825e9665728bc26 Mon Sep 17 00:00:00 2001
From: Vishwanath
Date: Tue, 26 Oct 2021 14:47:15 -0700
Subject: [PATCH 176/194] Fix scan break due to latest trivy changes

---
 .github/workflows/pr-checker.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/pr-checker.yml b/.github/workflows/pr-checker.yml
index bae117dbe..8a7e542b3 100644
--- a/.github/workflows/pr-checker.yml
+++ b/.github/workflows/pr-checker.yml
@@ -56,7 +56,7 @@ jobs:
           format: 'table'
           severity: 'CRITICAL,HIGH'
           vuln-type: 'os,library'
-          skip-dirs: 'opt/telegraf,usr/sbin/telegraf'
+          skip-dirs: '/opt,/usr/sbin'
           exit-code: '1'
           timeout: '5m0s'
   WINDOWS-build:
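Editor's note: the `skip-dirs` fix above switches from the old relative telegraf paths to absolute `/opt` and `/usr/sbin`, which newer trivy releases require. For reproducing the scan locally, roughly the same settings map onto the trivy CLI as sketched below (the image name is a placeholder, and flag behavior may drift across trivy versions):

```bash
# Approximate local equivalent of the pr-checker trivy step.
trivy image \
    --severity CRITICAL,HIGH \
    --vuln-type os,library \
    --skip-dirs /opt \
    --skip-dirs /usr/sbin \
    --exit-code 1 \
    --timeout 5m0s \
    <linux-agent-image>
```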
From 761b6412bfdbf5e8dcbf93b47385d2d4b1811983 Mon Sep 17 00:00:00 2001
From: Anders Johansen
Date: Tue, 26 Oct 2021 16:35:52 -0700
Subject: [PATCH 177/194] Anjohans/configurable database name (#663)

* First cut at an implementation
* Reverting a change
* Moving a few lines to better align with cluster URI config
* Moving a few lines to better align with cluster URI config
* Adding an extra check that won't hurt
* Getting ADX database name from config rather than from secret
* Reverse the mangling done by editor
* Fixes to the code for reading the db name setting
* More fixes to the rb code for settings
* Tweaked and tested
* Code review
* Review follow-up
* Remove whitespace
---
 build/common/installer/scripts/tomlparser.rb | 21 ++++++++++++++++++++
 source/plugins/go/src/oms.go                 | 20 ++++++++++++++++++-
 source/plugins/go/src/utils.go               |  2 +-
 3 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/build/common/installer/scripts/tomlparser.rb b/build/common/installer/scripts/tomlparser.rb
index b173ecfe3..32ea09aa3 100644
--- a/build/common/installer/scripts/tomlparser.rb
+++ b/build/common/installer/scripts/tomlparser.rb
@@ -26,6 +26,7 @@
 @containerLogSchemaVersion = ""
 @collectAllKubeEvents = false
 @containerLogsRoute = "v2" # default for linux
+@adxDatabaseName = "containerinsights" # default for all configurations
 if !@os_type.nil? && !@os_type.empty? && @os_type.strip.casecmp("windows") == 0
   @containerLogsRoute = "v1" # default is v1 for windows until windows agent integrates windows ama
 end
@@ -175,6 +176,23 @@ def populateSettingValuesFromConfigMap(parsedConfig)
       ConfigParseErrorLogger.logError("Exception while reading config map settings for container logs route - #{errorStr}, using defaults, please check config map for errors")
     end

+    #Get ADX database name setting
+    begin
+      if !parsedConfig[:log_collection_settings][:adx_database].nil? && !parsedConfig[:log_collection_settings][:adx_database][:name].nil?
+        if !parsedConfig[:log_collection_settings][:adx_database][:name].empty?
+          @adxDatabaseName = parsedConfig[:log_collection_settings][:adx_database][:name]
+          puts "config::Using config map setting for ADX database name : #{@adxDatabaseName}"
+        else
+          puts "config::Ignoring config map settings and using default value '#{@adxDatabaseName}' since provided adx database name value is empty"
+        end
+      else
+        puts "config::No ADX database name set, using default value : #{@adxDatabaseName}"
+      end
+    rescue => errorStr
+      ConfigParseErrorLogger.logError("Exception while reading config map settings for adx database name - #{errorStr}, using default #{@adxDatabaseName}, please check config map for errors")
+    end
+
+
   end
 end
 end
@@ -218,6 +236,7 @@ def populateSettingValuesFromConfigMap(parsedConfig)
   file.write("export AZMON_CLUSTER_COLLECT_ALL_KUBE_EVENTS=#{@collectAllKubeEvents}\n")
   file.write("export AZMON_CONTAINER_LOGS_ROUTE=#{@containerLogsRoute}\n")
   file.write("export AZMON_CONTAINER_LOG_SCHEMA_VERSION=#{@containerLogSchemaVersion}\n")
+  file.write("export AZMON_ADX_DATABASE_NAME=#{@adxDatabaseName}\n")
   # Close file after writing all environment variables
   file.close
   puts "Both stdout & stderr log collection are turned off for namespaces: '#{@excludePath}' "
@@ -266,6 +285,8 @@ def get_command_windows(env_variable_name, env_variable_value)
   file.write(commands)
   commands = get_command_windows('AZMON_CONTAINER_LOG_SCHEMA_VERSION', @containerLogSchemaVersion)
   file.write(commands)
+  commands = get_command_windows('AZMON_ADX_DATABASE_NAME', @adxDatabaseName)
+  file.write(commands)

   # Close file after writing all environment variables
   file.close
diff --git a/source/plugins/go/src/oms.go b/source/plugins/go/src/oms.go
index 91a5b4b40..ee221a60b 100644
--- a/source/plugins/go/src/oms.go
+++ b/source/plugins/go/src/oms.go
@@ -117,6 +117,9 @@ const MdsdOutputStreamIdTagPrefix = "dcr-"
 //env variable to container type
 const ContainerTypeEnv = "CONTAINER_TYPE"

+//Default ADX destination database name, can be overridden through configuration
+const DefaultAdxDatabaseName = "containerinsights"
+
 var (
 	// PluginConfiguration the plugins configuration
 	PluginConfiguration map[string]string
@@ -166,6 +169,8 @@ var (
 	AdxTenantID string
 	//ADX client secret
 	AdxClientSecret string
+	//ADX destination database name, default is DefaultAdxDatabaseName, can be overridden in configuration
+	AdxDatabaseName string
 	// container log or container log v2 tag name for oneagent route
 	MdsdContainerLogTagName string
 	// kubemonagent events tag name for oneagent route
@@ -1698,6 +1703,17 @@ func InitializePlugin(pluginConfPath string, agentVersion string) {

 	ContainerLogsRouteADX = false
 	if strings.Compare(ContainerLogsRoute, ContainerLogsADXRoute) == 0 {
+		// Try to read the ADX database name from environment variables. Default to DefaultAdxDatabaseName if not set.
+		// This SHOULD be set by tomlparser.rb so it's a highly unexpected event if it isn't.
+		// It should be set by the logic in tomlparser.rb EVEN if ADX logging isn't enabled
+		// Plain assignment (not ":=") so the package-level AdxDatabaseName is set rather than a shadowing local
+		AdxDatabaseName = strings.TrimSpace(os.Getenv("AZMON_ADX_DATABASE_NAME"))
+
+		// Check the len of the provided name for database and use default if 0, just to be sure
+		if len(AdxDatabaseName) == 0 {
+			Log("Adx database name unexpectedly empty (check config AND implementation, should have been set by tomlparser.rb?) - will default to '%s'", DefaultAdxDatabaseName)
+			AdxDatabaseName = DefaultAdxDatabaseName
+		}
+
 		//check if adx clusteruri, clientid & secret are set
 		var err error
 		AdxClusterUri, err = ReadFileContents(PluginConfiguration["adx_cluster_uri_path"])
@@ -1708,6 +1724,7 @@
 			Log("Invalid AdxClusterUri %s", AdxClusterUri)
 			AdxClusterUri = ""
 		}
+
 		AdxClientID, err = ReadFileContents(PluginConfiguration["adx_client_id_path"])
 		if err != nil {
 			Log("Error when reading AdxClientID %s", err)
@@ -1723,7 +1740,8 @@
 			Log("Error when reading AdxClientSecret %s", err)
 		}

-		if len(AdxClusterUri) > 0 && len(AdxClientID) > 0 && len(AdxClientSecret) > 0 && len(AdxTenantID) > 0 {
+		// AdxDatabaseName should never get in a state where its length is 0, but it doesn't hurt to add the check
+		if len(AdxClusterUri) > 0 && len(AdxClientID) > 0 && len(AdxClientSecret) > 0 && len(AdxTenantID) > 0 && len(AdxDatabaseName) > 0 {
 			ContainerLogsRouteADX = true
 			Log("Routing container logs thru %s route...", ContainerLogsADXRoute)
 			fmt.Fprintf(os.Stdout, "Routing container logs thru %s route...\n", ContainerLogsADXRoute)
diff --git a/source/plugins/go/src/utils.go b/source/plugins/go/src/utils.go
index 6b3036f85..61c6898d7 100644
--- a/source/plugins/go/src/utils.go
+++ b/source/plugins/go/src/utils.go
@@ -192,7 +192,7 @@ func CreateADXClient() {
 		//log.Fatalf("Unable to create ADX connection %s", err.Error())
 	} else {
 		Log("Successfully created ADX Client. Creating Ingestor...")
-		ingestor, ingestorErr := ingest.New(client, "containerinsights", "ContainerLogV2")
+		ingestor, ingestorErr := ingest.New(client, AdxDatabaseName, "ContainerLogV2")
 		if ingestorErr != nil {
 			Log("Error::mdsd::Unable to create ADX ingestor %s", ingestorErr.Error())
 		} else {
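Editor's note: with this patch the ADX destination database is configurable end to end — tomlparser.rb reads `[log_collection_settings][:adx_database][:name]` from the agent configmap, exports it as `AZMON_ADX_DATABASE_NAME`, and the Go plugin consumes it in `InitializePlugin` and `CreateADXClient`. A sketch of the corresponding configmap fragment (the `log_collection_settings` table is the agent's existing schema; the database name shown is made up):

```toml
[log_collection_settings]
    [log_collection_settings.adx_database]
        # Overrides the default ADX destination database ("containerinsights").
        name = "myclusterlogs"
```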
From fc955b31c49ffd23bbba60c35405d622ad1ab4a9 Mon Sep 17 00:00:00 2001
From: Ganga Mahesh Siddem
Date: Tue, 26 Oct 2021 17:40:51 -0700
Subject: [PATCH 178/194] Gangams/troubleshooting script for arc k8s (#682)

* wip
* wip
* wip
* wip
* wip
* wip
* wip
* wip
* wip
* wip
* wip
* wip
* wip
* wip
* wip
* wip
* wip
* wip
* doc updates
* doc updates
* wip
* wip
* update repo for issues
* fix minor one
---
 scripts/troubleshoot/README.md             |  32 +-
 scripts/troubleshoot/troubleshooterrors.sh | 485 +++++++++++++++++++++
 2 files changed, 516 insertions(+), 1 deletion(-)
 create mode 100644 scripts/troubleshoot/troubleshooterrors.sh

diff --git a/scripts/troubleshoot/README.md b/scripts/troubleshoot/README.md
index 5ffa07639..650a5df6f 100644
--- a/scripts/troubleshoot/README.md
+++ b/scripts/troubleshoot/README.md
@@ -1,5 +1,14 @@
 # Troubleshoot Guide for Azure Monitor for containers

+# Azure Arc-enabled Kubernetes
+The table below summarizes known issues you may face while using Azure Monitor for containers.
+
+| Issues and Error Messages | Action |
+| ---- | --- |
+| Error Message `No data for selected filters` | It may take some time to establish monitoring data flow for newly created clusters. Please allow at least 10-15 minutes for data to appear for your cluster. |
+| Error Message `Error retrieving data` | While the Azure Arc-enabled Kubernetes cluster is being set up for health and performance monitoring, a connection is established between the cluster and an Azure Log Analytics workspace. The Log Analytics workspace is used to store all monitoring data for your cluster. This error may occur when your Log Analytics workspace has been deleted or lost. Please check whether your Log Analytics workspace is available; see [Manage access to Log Analytics workspaces](https://docs.microsoft.com/en-us/azure/log-analytics/log-analytics-manage-access) to locate it. If the workspace is missing, you will have to delete and re-create the Microsoft.AzureMonitor.Containers extension as described in https://docs.microsoft.com/en-us/azure/azure-monitor/containers/container-insights-enable-arc-enabled-clusters?toc=/azure/azure-arc/kubernetes/toc.json. |
+
+
 # Azure Kubernetes Service (AKS)
 The table below summarizes known issues you may face while using Azure Monitor for containers.

@@ -67,5 +76,26 @@
 Please send this file to [AskCoin](mailto:askcoin@microsoft.com). We will respond back to you.

 For more details on Azure Resource Manager template deployment via cli refer to [this documentation](https://docs.microsoft.com/en-us/azure/azure-resource-manager/resource-group-template-deploy-cli).

 If steps above did not help to resolve your issue, you can use either of the following methods to contact us for help:
-* File a [GitHub Issue](https://github.com/Microsoft/OMS-docker/issues)
+* File a [GitHub Issue](https://github.com/microsoft/Docker-Provider/issues)
 * Email [AskCoin](mailto:askcoin@microsoft.com) : Please attach the TroubleshootErrorDump.txt in the email generated by the troubleshooting script if you had tried running the script to solve your problem.
+
+# Azure Arc-enabled Kubernetes
+
+You can use the troubleshooting script provided [here](https://raw.githubusercontent.com/microsoft/Docker-Provider/ci_dev/scripts/troubleshoot/troubleshooterrors.sh) to diagnose the problem.
+
+Steps:
+- Before executing the troubleshooting script, please install the following prerequisites if you don't have them already
+    - Install [Azure-CLI](https://docs.microsoft.com/en-us/cli/azure/install-azure-cli)
+    - Install [kubectl](https://kubernetes.io/docs/tasks/tools/#kubectl)
+    - Install [jq](https://stedolan.github.io/jq/download/)
+- Download and execute the script
+   ``` bash
+   curl -LO https://raw.githubusercontent.com/microsoft/Docker-Provider/ci_dev/scripts/troubleshoot/troubleshooterrors.sh
+   bash troubleshooterrors.sh --resource-id --kube-context
+   ```
+- This script will generate a TroubleshootDump.log which collects detailed information about container health onboarding.
+Please send this file to [AskCoin](mailto:askcoin@microsoft.com). We will respond to you.
+
+If the steps above did not help to resolve your issue, you can use either of the following methods to contact us for help:
+* File a [GitHub Issue](https://github.com/microsoft/Docker-Provider/issues)
+* Email [AskCoin](mailto:askcoin@microsoft.com) : Please attach the TroubleshootDump.log generated by the troubleshooting script in the email if you had tried running the script to solve your problem.
\ No newline at end of file
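Editor's note: for the `--resource-id` argument the README leaves as a placeholder, the fully qualified Arc cluster resource id can be looked up with the CLI rather than assembled by hand. A sketch (cluster, resource group, and kube-context names are placeholders, and this assumes the `connectedk8s` CLI extension is installed):

```bash
# Resolve the Azure Arc cluster resource id, then run the troubleshooter against it.
CLUSTER_RESOURCE_ID=$(az connectedk8s show --name <cluster-name> \
    --resource-group <resource-group> --query id -o tsv)
bash troubleshooterrors.sh --resource-id "$CLUSTER_RESOURCE_ID" --kube-context <kube-context>
```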
diff --git a/scripts/troubleshoot/troubleshooterrors.sh b/scripts/troubleshoot/troubleshooterrors.sh
new file mode 100644
index 000000000..ac08d7afc
--- /dev/null
+++ b/scripts/troubleshoot/troubleshooterrors.sh
@@ -0,0 +1,485 @@
+#!/bin/bash
+#
+# This script troubleshoots errors related to onboarding of Azure Monitor for containers to a Kubernetes cluster hosted outside Azure and connected to Azure via Azure Arc
+# Prerequisites :
+#     Azure CLI:  https://docs.microsoft.com/en-us/cli/azure/install-azure-cli?view=azure-cli-latest

# bash troubleshooterrors.sh --resource-id --kube-context

set -e
set -o pipefail

logFile="TroubleshootDump.log"
clusterType="connectedClusters"
extensionInstanceName="azuremonitor-containers"
# resource type for azure log analytics workspace
workspaceResourceProvider="Microsoft.OperationalInsights/workspaces"
workspaceSolutionResourceProvider="Microsoft.OperationsManagement/solutions"
agentK8sNamespace="kube-system"
azureArcK8sNamespace="azure-arc"
agentK8sSecretName="omsagent-secret"
agentK8sDeploymentName="omsagent-rs"
agentK8sLinuxDaemonsetName="omsagent"
agentArcK8sIdentityCRDName="container-insights-clusteridentityrequest"
workspaceId=""
workspacePrimarySharedKey=""
contactUSMessage="Please contact us by emailing askcoin@microsoft.com if you need any help with the TroubleshootDump.log generated by this script"
dataCapHelpMessage="Please review and increase the data cap https://docs.microsoft.com/en-us/azure/azure-monitor/logs/manage-cost-storage"
workspacePrivateLinkMessage="Please review this doc https://docs.microsoft.com/en-us/azure/azure-monitor/logs/private-link-security"
azureCLIInstallLinkMessage="Please install Azure-CLI as per the instructions https://docs.microsoft.com/en-us/cli/azure/install-azure-cli and rerun the troubleshooting script"
kubectlInstallLinkMessage="Please install kubectl as per the instructions https://kubernetes.io/docs/tasks/tools/#kubectl and rerun the troubleshooting script"
jqInstallLinkMessage="Please install jq as per the instructions https://stedolan.github.io/jq/download/ and rerun the troubleshooting script"
ciExtensionReOnboarding="Please reinstall the extension as per the instructions https://docs.microsoft.com/en-us/azure/azure-monitor/containers/container-insights-enable-arc-enabled-clusters?toc=/azure/azure-arc/kubernetes/toc.json"
timesyncHelpMessage="Please check if you have any timesync issues on your cluster nodes"

log_message() {
  echo "$@"
  echo ""
  echo "$@" >> $logFile
}


login_to_azure() {
  if [ "$isUsingServicePrincipal" = true ]; then
    log_message "logging in to Azure using the provided service principal creds"
    az login --service-principal --username="$servicePrincipalClientId" --password="$servicePrincipalClientSecret" --tenant="$servicePrincipalTenantId"
  else
    log_message "logging in to Azure interactively"
    az login --use-device-code
  fi
}

set_azure_subscription() {
  local subscriptionId="$(echo ${1})"
  log_message "setting the subscription id: ${subscriptionId} as current subscription for the azure cli"
  az account set -s ${subscriptionId}
  log_message "successfully configured subscription id: ${subscriptionId} as current subscription for the azure cli"
}

usage() {
  local basename=$(basename $0)
  echo
  echo "Troubleshooting Errors related to Azure Monitor for containers:"
  echo "$basename --resource-id [--kube-context ]"
}

parse_args() {

  if [ $# -le 1 ]; then
    usage
    exit 1
  fi

  # Transform long options to short ones
  for
arg in "$@"; do + shift + case "$arg" in + "--resource-id") set -- "$@" "-r" ;; + "--kube-context") set -- "$@" "-k" ;; + "--"*) usage ;; + *) set -- "$@" "$arg" ;; + esac + done + + local OPTIND opt + + while getopts 'hk:r:' opt; do + case "$opt" in + h) + usage + ;; + + k) + kubeconfigContext="$OPTARG" + log_message "name of kube-context is $OPTARG" + ;; + + r) + clusterResourceId="$OPTARG" + log_message "clusterResourceId is $OPTARG" + ;; + + ?) + usage + exit 1 + ;; + esac + done + shift "$(($OPTIND - 1))" + + local subscriptionId="$(echo ${clusterResourceId} | cut -d'/' -f3)" + local resourceGroup="$(echo ${clusterResourceId} | cut -d'/' -f5)" + + # get resource parts and join back to get the provider name + local providerNameResourcePart1="$(echo ${clusterResourceId} | cut -d'/' -f7)" + local providerNameResourcePart2="$(echo ${clusterResourceId} | cut -d'/' -f8)" + local providerName="$(echo ${providerNameResourcePart1}/${providerNameResourcePart2})" + + local clusterName="$(echo ${clusterResourceId} | cut -d'/' -f9)" + + # convert to lowercase for validation + providerName=$(echo $providerName | tr "[:upper:]" "[:lower:]") + + log_message "cluster SubscriptionId:" $subscriptionId + log_message "cluster ResourceGroup:" $resourceGroup + log_message "cluster ProviderName:" $providerName + log_message "cluster Name:" $clusterName + + if [ -z "$subscriptionId" -o -z "$resourceGroup" -o -z "$providerName" -o -z "$clusterName" ]; then + log_message "-e invalid cluster resource id. Please try with valid fully qualified resource id of the cluster" + exit 1 + fi + + if [[ $providerName != microsoft.* ]]; then + log_message "-e invalid azure cluster resource id format." + exit 1 + fi + + # detect the resource provider from the provider name in the cluster resource id + if [ $providerName = "microsoft.kubernetes/connectedclusters" ]; then + log_message "provider cluster resource is of Azure Arc enabled Kubernetes cluster type" + isArcK8sCluster=true + resourceProvider=$arcK8sResourceProvider + else + log_message "-e not valid azure arc enabled kubernetes cluster resource id" + exit 1 + fi + + if [ -z "$kubeconfigContext" ]; then + log_message "using or getting current kube config context since --kube-context parameter not set " + fi + + if [ ! -z "$servicePrincipalClientId" -a ! -z "$servicePrincipalClientSecret" -a ! 
-z "$servicePrincipalTenantId" ]; then + log_message "using service principal creds (clientId, secret and tenantId) for azure login since provided" + isUsingServicePrincipal=true + fi +} + +command_exists() { + command -v "$@" > /dev/null 2>&1 +} + +validate_ci_extension() { + log_message "START:validate_ci_extension" + extension=$(az k8s-extension show -c ${4} -g ${3} -t $clusterType -n $extensionInstanceName) + log_message $extension + configurationSettings=$(az k8s-extension show -c ${4} -g ${3} -t $clusterType -n $extensionInstanceName --query "configurationSettings") + if [ -z "$configurationSettings" ]; then + log_message "-e error configurationSettings either null or empty" + log_message ${contactUSMessage} + exit 1 + fi + logAnalyticsWorkspaceResourceID=$(az k8s-extension show -c ${4} -g ${3} -t $clusterType -n $extensionInstanceName --query "configurationSettings.logAnalyticsWorkspaceResourceID" -o tsv | tr "[:upper:]" "[:lower:]" | tr -d "[:space:]") + log_message "Extension logAnalyticsWorkspaceResourceID: ${logAnalyticsWorkspaceResourceID}" + if [ -z "$logAnalyticsWorkspaceResourceID" ]; then + log_message "-e error logAnalyticsWorkspaceResourceID either null or empty in the config settings" + log_message ${contactUSMessage} + exit 1 + fi + + provisioningState=$(az k8s-extension show -c ${4} -g ${3} -t $clusterType -n $extensionInstanceName --query "provisioningState" -o tsv | tr "[:upper:]" "[:lower:]" | tr -d "[:space:]") + log_message "Extension provisioningState: ${provisioningState}" + if [ -z "$provisioningState" ]; then + log_message "-e error provisioningState either null or empty in the config settings" + log_message ${contactUSMessage} + exit 1 + fi + if [ "$provisioningState" != "succeeded" ]; then + log_message "-e error expected state of extension provisioningState MUST be succeeded state but actual state is ${provisioningState}" + log_message ${contactUSMessage} + exit 1 + fi + logAnalyticsWorkspaceDomain=$(az k8s-extension show -c ${4} -g ${3} -t $clusterType -n $extensionInstanceName --query 'configurationSettings."omsagent.domain"') + log_message "Extension logAnalyticsWorkspaceDomain: ${logAnalyticsWorkspaceDomain}" + if [ -z "$logAnalyticsWorkspaceDomain" ]; then + log_message "-e error logAnalyticsWorkspaceDomain either null or empty in the config settings" + log_message ${contactUSMessage} + exit 1 + fi + azureCloudName=${1} + if [ "$azureCloudName" = "azureusgovernment" ]; then + log_message "az cli configured cloud name:$azureCloudName" + if [ $logAnalyticsWorkspaceDomain = "opinsights.azure.us" ]; then + log_message "-e error expected value of logAnalyticsWorkspaceDomain MUST opinsights.azure.us but actual value is ${logAnalyticsWorkspaceDomain}" + log_message ${contactUSMessage} + exit 1 + fi + elif [ "$azureCloudName" = "azurecloud" ]; then + log_message "az cli configured cloud name:$azureCloudName" + if [ $logAnalyticsWorkspaceDomain = "opinsights.azure.com" ]; then + log_message "-e error expected value of logAnalyticsWorkspaceDomain MUST opinsights.azure.com but actual value is ${logAnalyticsWorkspaceDomain}" + log_message ${contactUSMessage} + exit 1 + fi + elif [ "$azureCloudName" = "azurechinacloud" ]; then + log_message "az cli configured cloud name:$azureCloudName" + if [ $logAnalyticsWorkspaceDomain = "opinsights.azure.cn" ]; then + log_message "-e error expected value of logAnalyticsWorkspaceDomain MUST opinsights.azure.cn but actual value is ${logAnalyticsWorkspaceDomain}" + log_message ${contactUSMessage} + exit 1 + fi + fi + + 
  workspaceSubscriptionId="$(echo ${logAnalyticsWorkspaceResourceID} | cut -d'/' -f3 | tr "[:upper:]" "[:lower:]")"
  workspaceResourceGroup="$(echo ${logAnalyticsWorkspaceResourceID} | cut -d'/' -f5)"
  workspaceName="$(echo ${logAnalyticsWorkspaceResourceID} | cut -d'/' -f9)"
  log_message "workspaceSubscriptionId:${workspaceSubscriptionId} workspaceResourceGroup:${workspaceResourceGroup} workspaceName:${workspaceName}"

  clusterSubscriptionId=${2}
  # set the azure subscription to azure cli if the workspace is in a different sub than the cluster
  if [[ "$clusterSubscriptionId" != "$workspaceSubscriptionId" ]]; then
    log_message "switching the workspace subscription id to be the active subscription for azure cli since the workspace is in a different subscription than the cluster: ${workspaceSubscriptionId}"
    isClusterAndWorkspaceInSameSubscription=false
    set_azure_subscription $workspaceSubscriptionId
  fi
  workspaceList=$(az resource list -g "$workspaceResourceGroup" -n "$workspaceName" --resource-type $workspaceResourceProvider)
  log_message "workspace info:${workspaceList}"
  if [ "$workspaceList" = "[]" ]; then
    log_message "-e error workspace:${logAnalyticsWorkspaceResourceID} does not exist"
    exit 1
  fi

  ciSolutionResourceName="ContainerInsights(${workspaceName})"
  workspaceSolutionList=$(az resource list -g $workspaceResourceGroup -n $ciSolutionResourceName --resource-type $workspaceSolutionResourceProvider)
  log_message "workspace solution info:${workspaceSolutionList}"
  if [ "$workspaceSolutionList" = "[]" ]; then
    log_message "-e error ContainerInsights solution on workspace:${logAnalyticsWorkspaceResourceID} does not exist"
    exit 1
  fi

  privateLinkScopedResources=$(az resource show --ids ${logAnalyticsWorkspaceResourceID} --query properties.privateLinkScopedResources -o tsv | tr "[:upper:]" "[:lower:]" | tr -d "[:space:]")
  log_message "workspace privateLinkScopedResources:${privateLinkScopedResources}"

  publicNetworkAccessForIngestion=$(az resource show --ids ${logAnalyticsWorkspaceResourceID} --query properties.publicNetworkAccessForIngestion -o tsv | tr "[:upper:]" "[:lower:]" | tr -d "[:space:]")
  log_message "workspace publicNetworkAccessForIngestion:${publicNetworkAccessForIngestion}"
  if [ -z "$privateLinkScopedResources" ]; then
    if [ "$publicNetworkAccessForIngestion" != "enabled" ]; then
      log_message "-e error unless private link is configured, publicNetworkAccessForIngestion MUST be enabled for data ingestion"
      log_message ${workspacePrivateLinkMessage}
      exit 1
    fi
  fi
  publicNetworkAccessForQuery=$(az resource show --ids ${logAnalyticsWorkspaceResourceID} --query properties.publicNetworkAccessForQuery -o tsv | tr "[:upper:]" "[:lower:]" | tr -d "[:space:]")
  log_message "workspace publicNetworkAccessForQuery:${publicNetworkAccessForQuery}"
  if [ -z "$privateLinkScopedResources" ]; then
    if [ "$publicNetworkAccessForQuery" != "enabled" ]; then
      log_message "-e error unless private link is configured, publicNetworkAccessForQuery MUST be enabled for data query"
      log_message ${workspacePrivateLinkMessage}
      exit 1
    fi
  fi

  workspaceCappingDailyQuotaGb=$(az resource show --ids ${logAnalyticsWorkspaceResourceID} --query properties.workspaceCapping.dailyQuotaGb -o tsv | tr -d "[:space:]")
  log_message "workspaceCapping dailyQuotaGb:${workspaceCappingDailyQuotaGb}"
  if [ "$workspaceCappingDailyQuotaGb" != "-1.0" ]; then
    log_message "-e error workspace has a daily ingestion quota configured (dailyQuotaGb:${workspaceCappingDailyQuotaGb}); verify whether ingested data is reaching over the quota"
+  if [ "$workspaceCappingDailyQuotaGb" != "-1.0" ]; then
+     log_message "-e error the workspace has a daily ingestion quota configured (${workspaceCappingDailyQuotaGb} GB); verify whether ingested data is going over the quota"
+     log_message ${dataCapHelpMessage}
+     exit 1
+  fi
+
+  workspaceId=$(az resource show --ids ${logAnalyticsWorkspaceResourceID} --query properties.customerId -o tsv | tr -d "[:space:]")
+  log_message "workspaceId: ${workspaceId}"
+
+  workspaceKey=$(az rest --method post --uri "$logAnalyticsWorkspaceResourceID/sharedKeys?api-version=2015-11-01-preview" --query primarySharedKey -o json)
+  workspacePrimarySharedKey=$(echo $workspaceKey | tr -d '"')
+
+  log_message "END:validate_ci_extension:SUCCESS"
+}
+
+validate_az_cli_installed_or_not() {
+  if command_exists az; then
+     log_message "detected azure cli installed"
+     azCLIVersion=$(az -v)
+     log_message "azure-cli version: ${azCLIVersion}"
+     azCLIExtension=$(az extension list --query "[?name=='k8s-extension'].name | [0]")
+     if [ "$azCLIExtension" = "k8s-extension" ]; then
+        azCLIExtensionVersion=$(az extension list --query "[?name=='k8s-extension'].version | [0]")
+        log_message "detected k8s-extension and current installed version: ${azCLIExtensionVersion}"
+        log_message "updating the k8s-extension version to the latest available one"
+        az extension update --name 'k8s-extension'
+     else
+        log_message "installing the k8s-extension extension since it is not installed"
+        az extension add --name 'k8s-extension'
+     fi
+     azCLIExtensionVersion=$(az extension list --query "[?name=='k8s-extension'].version | [0]")
+     log_message "current installed k8s-extension version: ${azCLIExtensionVersion}"
+  else
+     log_message "-e error azure cli is not installed"
+     log_message ${azureCLIInstallLinkMessage}
+     exit 1
+  fi
+}
+
+validate_ci_agent_pods() {
+  log_message "START:validate_ci_agent_pods"
+  # verify the id and key of the workspace match the WSID and KEY values in the secret
+  wsID=$(kubectl get secrets ${agentK8sSecretName} -n ${agentK8sNamespace} -o json | jq -r ".data.WSID")
+  wsID=$(echo $wsID | base64 -d)
+  log_message "workspaceId: ${wsID} value in the ${agentK8sSecretName}"
+
+  wsKEY=$(kubectl get secrets ${agentK8sSecretName} -n ${agentK8sNamespace} -o json | jq -r ".data.KEY")
+  wsKEY=$(echo $wsKEY | base64 -d)
+
+  if [[ "$workspaceId" != "$wsID" ]]; then
+     log_message "-e error workspaceId: ${workspaceId} of the workspace doesn't match the workspaceId: ${wsID} value in the omsagent secret"
+     log_message $ciExtensionReOnboarding
+     exit 1
+  fi
+  if [[ "$workspacePrimarySharedKey" != "$wsKEY" ]]; then
+     log_message "-e error workspacePrimarySharedKey of the workspace doesn't match the workspace key value in the omsagent secret"
+     log_message $ciExtensionReOnboarding
+     exit 1
+  fi
+
+  # verify state of agent deployment
+  readyReplicas=$(kubectl get deployments -n ${agentK8sNamespace} ${agentK8sDeploymentName} -o json | jq '.status.readyReplicas')
+  log_message "number of deployment ready replicas:${readyReplicas}"
+  if [[ "$readyReplicas" != "1" ]]; then
+     log_message "-e error number of readyReplicas of the agent deployment MUST be 1"
+     exit 1
+  fi
+  replicas=$(kubectl get deployments -n ${agentK8sNamespace} ${agentK8sDeploymentName} -o json | jq '.status.replicas')
+  log_message "number of deployment replicas:${replicas}"
+  if [[ "$replicas" != "1" ]]; then
+     log_message "-e error number of replicas of the agent deployment MUST be 1"
+     exit 1
+  fi
+
+  # verify state of agent ds
+  currentNumberScheduled=$(kubectl get ds -n ${agentK8sNamespace} ${agentK8sLinuxDaemonsetName} -o json | jq '.status.currentNumberScheduled')
+  desiredNumberScheduled=$(kubectl get ds -n ${agentK8sNamespace} ${agentK8sLinuxDaemonsetName} -o json | jq '.status.desiredNumberScheduled')
+  log_message "number of linux daemonset pods currentNumberScheduled:${currentNumberScheduled} and desiredNumberScheduled:${desiredNumberScheduled}"
+  if [[ "$currentNumberScheduled" != "$desiredNumberScheduled" ]]; then
+     log_message "-e error desiredNumberScheduled: ${desiredNumberScheduled} doesn't match currentNumberScheduled: ${currentNumberScheduled}"
+     log_message "-e error please fix the pod scheduling issues of the omsagent daemonset pods in namespace: ${agentK8sNamespace}"
+     exit 1
+  fi
+
+  numberAvailable=$(kubectl get ds -n ${agentK8sNamespace} ${agentK8sLinuxDaemonsetName} -o json | jq '.status.numberAvailable')
+  log_message "number of linux daemonset pods numberAvailable:${numberAvailable}"
+  if [[ "$numberAvailable" != "$currentNumberScheduled" ]]; then
+     log_message "-e error numberAvailable: ${numberAvailable} doesn't match currentNumberScheduled: ${currentNumberScheduled}"
+     log_message "-e error please fix the pod scheduling issues of the omsagent daemonset pods in namespace: ${agentK8sNamespace}"
+     exit 1
+  fi
+  numberReady=$(kubectl get ds -n ${agentK8sNamespace} ${agentK8sLinuxDaemonsetName} -o json | jq '.status.numberReady')
+  log_message "number of linux daemonset pods numberReady:${numberReady}"
+  if [[ "$numberAvailable" != "$numberReady" ]]; then
+     log_message "-e error numberAvailable: ${numberAvailable} doesn't match numberReady: ${numberReady}"
+     log_message "-e error please fix the pod scheduling issues of the omsagent daemonset pods in namespace: ${agentK8sNamespace}"
+     exit 1
+  fi
+  log_message "END:validate_ci_agent_pods:SUCCESS"
+}
+
+validate_ci_agent_identity_status() {
+  log_message "START:validate_ci_agent_identity_status"
+  log_message "Info of ${agentArcK8sIdentityCRDName} in namespace ${azureArcK8sNamespace}"
+  kubectl get azureclusteridentityrequests -n ${azureArcK8sNamespace} ${agentArcK8sIdentityCRDName} -o json >> $logFile
+  status=$(kubectl get azureclusteridentityrequests -n ${azureArcK8sNamespace} ${agentArcK8sIdentityCRDName} -o json | jq -r '.status')
+  if [ -z "$status" ]; then
+     log_message "-e error status field empty for the CRD ${agentArcK8sIdentityCRDName} in namespace ${azureArcK8sNamespace}"
+     log_message $timesyncHelpMessage
+     exit 1
+  fi
+  expirationTime=$(kubectl get azureclusteridentityrequests -n ${azureArcK8sNamespace} ${agentArcK8sIdentityCRDName} -o json | jq -r '.status.expirationTime')
+  if [ -z "$expirationTime" ]; then
+     log_message "-e error expirationTime field empty for the CRD ${agentArcK8sIdentityCRDName} in namespace ${azureArcK8sNamespace}"
+     log_message $timesyncHelpMessage
+     exit 1
+  fi
+  tokenReference=$(kubectl get azureclusteridentityrequests -n ${azureArcK8sNamespace} ${agentArcK8sIdentityCRDName} -o json | jq -r '.status.tokenReference')
+  if [ -z "$tokenReference" ]; then
+     log_message "-e error tokenReference field empty for the CRD ${agentArcK8sIdentityCRDName} in namespace ${azureArcK8sNamespace}"
+     log_message $timesyncHelpMessage
+     exit 1
+  fi
+  dataName=$(kubectl get azureclusteridentityrequests -n ${azureArcK8sNamespace} ${agentArcK8sIdentityCRDName} -o json | jq -r '.status.tokenReference.dataName')
+  if [ -z "$dataName" ]; then
+     log_message "-e error dataName field of tokenReference empty for the CRD ${agentArcK8sIdentityCRDName} in namespace ${azureArcK8sNamespace}"
+     log_message $timesyncHelpMessage
+     exit 1
+  fi
+  secretName=$(kubectl get azureclusteridentityrequests -n ${azureArcK8sNamespace} ${agentArcK8sIdentityCRDName} -o json | jq -r '.status.tokenReference.secretName')
+  if [ -z "$secretName" ]; then
+     log_message "-e error secretName field of tokenReference empty for the CRD ${agentArcK8sIdentityCRDName} in namespace ${azureArcK8sNamespace}"
+     log_message $timesyncHelpMessage
+     exit 1
+  fi
+  log_message "END:validate_ci_agent_identity_status:SUCCESS"
+}
+
+get_nodes_pods_crds_info() {
+  log_message "START:get_nodes_pods_crds_info"
+  log_message "nodes"
+  kubectl get nodes >> $logFile
+
+  log_message "kube-system pods"
+  kubectl get pods -n ${agentK8sNamespace} >> $logFile
+
+  log_message "azurearck8spods"
+  kubectl get pods -n ${azureArcK8sNamespace} >> $logFile
+
+  log_message "crds"
+  kubectl get crds -A >> $logFile
+
+  log_message "azureclusteridentityrequests crds"
+  kubectl get crds azureclusteridentityrequests.clusterconfig.azure.com >> $logFile
+  kubectl get azureclusteridentityrequests -n ${azureArcK8sNamespace} >> $logFile
+
+  log_message "container-insights-clusteridentityrequest crd"
+  kubectl describe azureclusteridentityrequests -n ${azureArcK8sNamespace} container-insights-clusteridentityrequest >> $logFile
+  log_message "END:get_nodes_pods_crds_info:SUCCESS"
+}
+
+datetime=$(date -u)
+log_message "*** Script Execution start @ ${datetime} ***"
+
+# verify whether azure cli is installed
+validate_az_cli_installed_or_not
+
+# parse and validate args
+parse_args "$@"
+
+# parse cluster resource id
+clusterSubscriptionId="$(echo $clusterResourceId | cut -d'/' -f3 | tr "[:upper:]" "[:lower:]")"
+clusterResourceGroup="$(echo $clusterResourceId | cut -d'/' -f5)"
+providerName="$(echo $clusterResourceId | cut -d'/' -f7)"
+clusterName="$(echo $clusterResourceId | cut -d'/' -f9)"
+
+# get the current active azure cloud of the az cli
+azureCloudName=$(az cloud show --query name -o tsv | tr "[:upper:]" "[:lower:]" | tr -d "[:space:]")
+log_message "azure cloud name: ${azureCloudName}"
+
+# login to azure interactively
+login_to_azure
+
+# set the cluster subscription id as the active sub for azure cli
+set_azure_subscription $clusterSubscriptionId
+
+# validate ci extension
+validate_ci_extension $azureCloudName $clusterSubscriptionId $clusterResourceGroup $clusterName
+
+# validate ci agent pods
+if command_exists kubectl; then
+  if command_exists jq; then
+     validate_ci_agent_pods
+  else
+     log_message "-e error jq is not installed"
+     log_message $jqInstallLinkMessage
+     exit 1
+  fi
+else
+  log_message "-e error kubectl is not installed"
+  log_message ${kubectlInstallLinkMessage}
+  exit 1
+fi
+
+# validate ci cluster identity token
+validate_ci_agent_identity_status
+
+# get nodes and pods status
+get_nodes_pods_crds_info
+
+log_message "Everything looks good according to this script."
+log_message $contactUSMessage
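For context only, and not part of the patch: assuming the script above is saved as troubleshoot_ci_extension.sh and that parse_args (defined earlier in the file, outside this excerpt) accepts the cluster resource id, an invocation might look roughly like the sketch below. The flag name is hypothetical; the real argument names are whatever parse_args defines.

# hypothetical invocation against an Arc-connected cluster; substitute real ids
bash troubleshoot_ci_extension.sh --resource-id "/subscriptions/<subId>/resourceGroups/<rg>/providers/Microsoft.Kubernetes/connectedClusters/<clusterName>"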
From 7c9cdc819eddf828b140b274cdeeb0121661a656 Mon Sep 17 00:00:00 2001 From: sarahpeiffer <46665092+sarahpeiffer@users.noreply.github.com> Date: Mon, 1 Nov 2021 17:21:17 -0700 Subject: [PATCH 179/194] Sarah/remove cdpx creds (#685) * remove download of cdpx creds --- .pipelines/get-aad-app-creds-from-kv.sh | 14 -------------- 1 file changed, 14 deletions(-)

diff --git a/.pipelines/get-aad-app-creds-from-kv.sh b/.pipelines/get-aad-app-creds-from-kv.sh index a0ba464cc..8ef56cddb 100755 --- a/.pipelines/get-aad-app-creds-from-kv.sh +++ b/.pipelines/get-aad-app-creds-from-kv.sh @@ -11,8 +11,6 @@ do KV) KV=$VALUE ;; KVSECRETNAMEAPPID) AppId=$VALUE ;; KVSECRETNAMEAPPSECRET) AppSecret=$VALUE ;; - KVSECRETNAMECDPXAPPID) CdpxAppId=$VALUE ;; - KVSECRETNAMECDPXAPPSECRET) CdpxAppSecret=$VALUE ;; *) esac done @@ -29,16 +27,4 @@ az keyvault secret download --file ~/acrappsecret --vault-name ${KV} --name ${A echo "downloaded the appsecret from KV:${KV} and KV secret:${AppSecret}" -echo "key vault secret name for cdpx appid:${KVSECRETNAMECDPXAPPID}" - -echo "key vault secret name for cdpx appsecret:${KVSECRETNAMECDPXAPPSECRET}" - -az keyvault secret download --file ~/cdpxacrappid --vault-name ${KV} --name ${CdpxAppId} - -echo "downloaded the appid from KV:${KV} and KV secret:${CdpxAppId}" - -az keyvault secret download --file ~/cdpxacrappsecret --vault-name ${KV} --name ${CdpxAppSecret} - -echo "downloaded the appsecret from KV:${KV} and KV secret:${CdpxAppSecret}" - echo "end: get app id and secret from specified key vault"

From f75eea66327776075c1094c636404e5ee3bdfa95 Mon Sep 17 00:00:00 2001 From: bragi92 Date: Fri, 5 Nov 2021 10:14:59 -0700 Subject: [PATCH 180/194] fix: subtract number instead of string + update fluentd version 1.14.2 to fix security vulnerability (#686) * fix: change default value to a number so that subtraction happens correctly * update fluentd version to 1.14.2 * extra end statement * safely set to float * big decimal precision * revert omsagent * keep telemetry --- .github/workflows/run_unit_tests.yml | 2 +- build/common/installer/scripts/tomlparser.rb | 2 - kubernetes/linux/setup.sh | 2 +- kubernetes/omsagent.yaml | 1 + kubernetes/windows/Dockerfile | 2 +- kubernetes/windows/Dockerfile-dev-base-image | 2 +- source/plugins/ruby/kubelet_utils.rb | 41 ++++++++++++-------- 7 files changed, 29 insertions(+), 23 deletions(-)

diff --git a/.github/workflows/run_unit_tests.yml b/.github/workflows/run_unit_tests.yml index 94ac4371a..435de91e8 100644 --- a/.github/workflows/run_unit_tests.yml +++ b/.github/workflows/run_unit_tests.yml @@ -26,7 +26,7 @@ jobs: uses: actions/checkout@v2 - name: install fluent run: | - sudo gem install fluentd -v "1.12.2" --no-document + sudo gem install fluentd -v "1.14.2" --no-document sudo fluentd --setup ./fluent - name: Run unit tests run: | diff --git a/build/common/installer/scripts/tomlparser.rb b/build/common/installer/scripts/tomlparser.rb index 32ea09aa3..03b470205 100644 --- a/build/common/installer/scripts/tomlparser.rb +++ b/build/common/installer/scripts/tomlparser.rb @@ -191,8 +191,6 @@ def populateSettingValuesFromConfigMap(parsedConfig) rescue => errorStr ConfigParseErrorLogger.logError("Exception while reading config map settings for adx database name - #{errorStr}, using default #{@adxDatabaseName}, please check config map for errors") end - - end end end diff --git a/kubernetes/linux/setup.sh b/kubernetes/linux/setup.sh index 243677dd0..5bddfc604 100644 --- a/kubernetes/linux/setup.sh +++
b/kubernetes/linux/setup.sh @@ -52,7 +52,7 @@ sudo echo "deb http://ppa.launchpad.net/brightbox/ruby-ng/ubuntu bionic main" >> sudo apt-get update sudo apt-get install ruby2.6 ruby2.6-dev gcc make -y # fluentd v1 gem -gem install fluentd -v "1.12.2" --no-document +gem install fluentd -v "1.14.2" --no-document fluentd --setup ./fluent gem install gyoku iso8601 --no-doc diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 66f8c4010..a1a843196 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -927,3 +927,4 @@ spec: names: plural: healthstates kind: HealthState + \ No newline at end of file diff --git a/kubernetes/windows/Dockerfile b/kubernetes/windows/Dockerfile index 0ddf67ab2..41ad7e7ba 100644 --- a/kubernetes/windows/Dockerfile +++ b/kubernetes/windows/Dockerfile @@ -20,7 +20,7 @@ RUN refreshenv \ && gem install cool.io -v 1.5.4 --platform ruby \ && gem install oj -v 3.3.10 \ && gem install json -v 2.2.0 \ -&& gem install fluentd -v 1.12.2 \ +&& gem install fluentd -v 1.14.2 \ && gem install win32-service -v 1.0.1 \ && gem install win32-ipc -v 0.7.0 \ && gem install win32-event -v 0.6.3 \ diff --git a/kubernetes/windows/Dockerfile-dev-base-image b/kubernetes/windows/Dockerfile-dev-base-image index 0081f9c53..501fead89 100644 --- a/kubernetes/windows/Dockerfile-dev-base-image +++ b/kubernetes/windows/Dockerfile-dev-base-image @@ -18,7 +18,7 @@ RUN refreshenv \ && gem install cool.io -v 1.5.4 --platform ruby \ && gem install oj -v 3.3.10 \ && gem install json -v 2.2.0 \ -&& gem install fluentd -v 1.12.2 \ +&& gem install fluentd -v 1.14.2 \ && gem install win32-service -v 1.0.1 \ && gem install win32-ipc -v 0.7.0 \ && gem install win32-event -v 0.6.3 \ diff --git a/source/plugins/ruby/kubelet_utils.rb b/source/plugins/ruby/kubelet_utils.rb index e31407b54..368ca8639 100644 --- a/source/plugins/ruby/kubelet_utils.rb +++ b/source/plugins/ruby/kubelet_utils.rb @@ -47,6 +47,9 @@ def get_node_allocatable(cpu_capacity, memory_capacity) @log.error "kubelet_utils.rb::get_node_allocatble - cpu_capacity or memory_capacity values not set. Hence we cannot calculate allocatable values" end + cpu_capacity = BigDecimal(cpu_capacity, 2).to_f + memory_capacity = BigDecimal(memory_capacity, 2).to_f + cpu_allocatable = 1.0 memory_allocatable = 1.0 @@ -56,74 +59,74 @@ def get_node_allocatable(cpu_capacity, memory_capacity) begin kubereserved_cpu = parsed_response["kubeletconfig"]["kubeReserved"]["cpu"] if kubereserved_cpu.nil? || kubereserved_cpu == "" - kubereserved_cpu = "0" + kubereserved_cpu = "0.0" end @log.info "get_node_allocatable::kubereserved_cpu #{kubereserved_cpu}" rescue => errorStr @log.error "Error in get_node_allocatable::kubereserved_cpu: #{errorStr}" - kubereserved_cpu = "0" + kubereserved_cpu = "0.0" ApplicationInsightsUtility.sendExceptionTelemetry("Error in get_node_allocatable::kubereserved_cpu: #{errorStr}") end begin kubereserved_memory = parsed_response["kubeletconfig"]["kubeReserved"]["memory"] if kubereserved_memory.nil? 
|| kubereserved_memory == "" - kubereserved_memory = "0" + kubereserved_memory = "0.0" end @log.info "get_node_allocatable::kubereserved_memory #{kubereserved_memory}" rescue => errorStr @log.error "Error in get_node_allocatable::kubereserved_memory: #{errorStr}" - kubereserved_memory = "0" - ApplicationInsightsUtility.sendExceptionTelemetry("Error in get_node_allocatable::kubereserved_cpu: #{errorStr}") + kubereserved_memory = "0.0" + ApplicationInsightsUtility.sendExceptionTelemetry("Error in get_node_allocatable::kubereserved_memory: #{errorStr}") end begin systemReserved_cpu = parsed_response["kubeletconfig"]["systemReserved"]["cpu"] if systemReserved_cpu.nil? || systemReserved_cpu == "" - systemReserved_cpu = "0" + systemReserved_cpu = "0.0" end @log.info "get_node_allocatable::systemReserved_cpu #{systemReserved_cpu}" rescue => errorStr # this will likely always reach this condition for AKS ~ only applicable for hyrid + MDM combination @log.error "Error in get_node_allocatable::systemReserved_cpu: #{errorStr}" - systemReserved_cpu = "0" - ApplicationInsightsUtility.sendExceptionTelemetry("Error in get_node_allocatable::kubereserved_cpu: #{errorStr}") + systemReserved_cpu = "0.0" + ApplicationInsightsUtility.sendExceptionTelemetry("Error in get_node_allocatable::systemReserved_cpu: #{errorStr}") end begin explicitlyReserved_cpu = parsed_response["kubeletconfig"]["reservedCPUs"] if explicitlyReserved_cpu.nil? || explicitlyReserved_cpu == "" - explicitlyReserved_cpu = "0" + explicitlyReserved_cpu = "0.0" end @log.info "get_node_allocatable::explicitlyReserved_cpu #{explicitlyReserved_cpu}" rescue => errorStr # this will likely always reach this condition for AKS ~ only applicable for hyrid + MDM combination @log.error "Error in get_node_allocatable::explicitlyReserved_cpu: #{errorStr}" - explicitlyReserved_cpu = "0" + explicitlyReserved_cpu = "0.0" ApplicationInsightsUtility.sendExceptionTelemetry("Error in get_node_allocatable::explicitlyReserved_cpu: #{errorStr}") end begin systemReserved_memory = parsed_response["kubeletconfig"]["systemReserved"]["memory"] if systemReserved_memory.nil? || systemReserved_memory == "" - systemReserved_memory = "0" + systemReserved_memory = "0.0" end @log.info "get_node_allocatable::systemReserved_memory #{systemReserved_memory}" rescue => errorStr @log.error "Error in get_node_allocatable::systemReserved_memory: #{errorStr}" - systemReserved_memory = "0" - ApplicationInsightsUtility.sendExceptionTelemetry("Error in get_node_allocatable::kubereserved_cpu: #{errorStr}") + systemReserved_memory = "0.0" + ApplicationInsightsUtility.sendExceptionTelemetry("Error in get_node_allocatable::systemReserved_memory: #{errorStr}") end begin evictionHard_memory = parsed_response["kubeletconfig"]["evictionHard"]["memory.available"] if evictionHard_memory.nil? 
|| evictionHard_memory == "" - evictionHard_memory = "0" + evictionHard_memory = "0.0" end @log.info "get_node_allocatable::evictionHard_memory #{evictionHard_memory}" rescue => errorStr @log.error "Error in get_node_allocatable::evictionHard_memory: #{errorStr}" - evictionHard_memory = "0" - ApplicationInsightsUtility.sendExceptionTelemetry("Error in get_node_allocatable::kubereserved_cpu: #{errorStr}") + evictionHard_memory = "0.0" + ApplicationInsightsUtility.sendExceptionTelemetry("Error in get_node_allocatable::evictionHard_memory: #{errorStr}") end # do calculation in nanocore since that's what KubernetesApiClient.getMetricNumericValue expects @@ -137,9 +140,13 @@ def get_node_allocatable(cpu_capacity, memory_capacity) end # convert back to units similar to what we get for capacity cpu_allocatable = cpu_allocatable / (1000.0 ** 2) - @log.info "CPU Allocatable #{cpu_allocatable}" memory_allocatable = memory_capacity - (KubernetesApiClient.getMetricNumericValue("memory", kubereserved_memory) + KubernetesApiClient.getMetricNumericValue("memory", systemReserved_memory) + KubernetesApiClient.getMetricNumericValue("memory", evictionHard_memory)) + + cpu_allocatable = BigDecimal(cpu_allocatable, 2).to_f + memory_allocatable = BigDecimal(memory_allocatable, 2).to_f + + @log.info "CPU Allocatable #{cpu_allocatable}" @log.info "Memory Allocatable #{memory_allocatable}" return [cpu_allocatable, memory_allocatable] From 15ee6c53337bec218cabf02aa3234f6b1e0ea412 Mon Sep 17 00:00:00 2001 From: David Michelman Date: Fri, 5 Nov 2021 11:28:21 -0700 Subject: [PATCH 181/194] Faster Linux builds (part 1) (#687) * moved docker image arg later on to enable docker build caching * fixing image tag (doh) --- kubernetes/linux/Dockerfile | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kubernetes/linux/Dockerfile b/kubernetes/linux/Dockerfile index 90acb4959..9164abc9c 100644 --- a/kubernetes/linux/Dockerfile +++ b/kubernetes/linux/Dockerfile @@ -2,8 +2,6 @@ FROM ubuntu:18.04 MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="Azure Monitor for containers" -ARG IMAGE_TAG=ciprod10132021 -ENV AGENT_VERSION ${IMAGE_TAG} ENV tmpdir /opt ENV APPLICATIONINSIGHTS_AUTH NzAwZGM5OGYtYTdhZC00NThkLWI5NWMtMjA3ZjM3NmM3YmRi ENV MALLOC_ARENA_MAX 2 @@ -18,6 +16,10 @@ ENV KUBE_CLIENT_BACKOFF_DURATION 0 ENV RUBY_GC_HEAP_OLDOBJECT_LIMIT_FACTOR 0.9 RUN /usr/bin/apt-get update && /usr/bin/apt-get install -y libc-bin wget openssl curl sudo python-ctypes init-system-helpers net-tools rsyslog cron vim dmidecode apt-transport-https gnupg && rm -rf /var/lib/apt/lists/* COPY setup.sh main.sh defaultpromenvvariables defaultpromenvvariables-rs defaultpromenvvariables-sidecar mdsd.xml envmdsd logrotate.conf $tmpdir/ + +ARG IMAGE_TAG=ciprod10132021 +ENV AGENT_VERSION ${IMAGE_TAG} + WORKDIR ${tmpdir} # copy docker provider shell bundle to use the agent image @@ -27,3 +29,4 @@ COPY ./Linux_ULINUX_1.0_x64_64_Release/docker-cimprov-*.*.*-*.x86_64.sh . 
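# for illustration: because ARG IMAGE_TAG is now declared only after the apt-get and
# COPY layers above, a rebuild that changes just the tag, e.g.
#   docker build --build-arg IMAGE_TAG=ciprod10132021 kubernetes/linux
# reuses the cached layers above instead of rebuilding them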
RUN chmod 775 $tmpdir/*.sh; sync; $tmpdir/setup.sh CMD [ "/opt/main.sh" ] +

From b4ca054e30a7271e28b5fd38f6cdeaaf9ebfe370 Mon Sep 17 00:00:00 2001 From: sarahpeiffer <46665092+sarahpeiffer@users.noreply.github.com> Date: Thu, 13 Jan 2022 13:25:53 -0800 Subject: [PATCH 182/194] Sarah/fluentbit windows log (#688) * upgrade fluentbit version for windows * saving progress--fluent bit log tailing working for windows * use configmap values for fluent-bit.conf where necessary and make necessary files common * revert certificategenerator * remove tomlparser-agent-config from linux folder * clean up fluent.conf * clean up fluent-bit.conf * revert image tag * fix agent tag * make fluent bit flush interval configurable * clean up unnecessary conf files * remove unnecessary parts of fluent and fluent-bit conf * log level back to info * add fbit env variables for omsagent-win * moving db files to var directory --- .../installer/conf/azm-containers-parser.conf | 0 .../scripts/td-agent-bit-conf-customizer.rb | 5 ++ .../scripts/tomlparser-agent-config.rb | 34 ++++++++++ build/common/installer/scripts/tomlparser.rb | 2 + .../installer/datafiles/base_container.data | 4 +- build/windows/installer/conf/fluent-bit.conf | 63 ++++++++++++----- .../installer/conf/fluent-cri-parser.conf | 6 -- .../installer/conf/fluent-docker-parser.conf | 5 -- build/windows/installer/conf/fluent.conf | 68 ------------------- kubernetes/linux/setup.sh | 2 +- kubernetes/omsagent.yaml | 6 ++ kubernetes/windows/Dockerfile | 4 +- kubernetes/windows/Dockerfile-dev-image | 4 +- kubernetes/windows/main.ps1 | 15 ++-- kubernetes/windows/setup.ps1 | 7 +- 15 files changed, 110 insertions(+), 115 deletions(-) rename build/{linux => common}/installer/conf/azm-containers-parser.conf (100%) rename build/{linux => common}/installer/scripts/tomlparser-agent-config.rb (87%) delete mode 100644 build/windows/installer/conf/fluent-cri-parser.conf delete mode 100644 build/windows/installer/conf/fluent-docker-parser.conf

diff --git a/build/linux/installer/conf/azm-containers-parser.conf b/build/common/installer/conf/azm-containers-parser.conf similarity index 100% rename from build/linux/installer/conf/azm-containers-parser.conf rename to build/common/installer/conf/azm-containers-parser.conf diff --git a/build/common/installer/scripts/td-agent-bit-conf-customizer.rb b/build/common/installer/scripts/td-agent-bit-conf-customizer.rb index f29c87407..995d72b87 100644 --- a/build/common/installer/scripts/td-agent-bit-conf-customizer.rb +++ b/build/common/installer/scripts/td-agent-bit-conf-customizer.rb @@ -3,6 +3,11 @@ @td_agent_bit_conf_path = "/etc/opt/microsoft/docker-cimprov/td-agent-bit.conf" +@os_type = ENV["OS_TYPE"] +if !@os_type.nil? && !@os_type.empty?
&& @os_type.strip.casecmp("windows") == 0 + @td_agent_bit_conf_path = "/etc/fluent-bit/fluent-bit.conf" +end + @default_service_interval = "15" @default_mem_buf_limit = "10" diff --git a/build/linux/installer/scripts/tomlparser-agent-config.rb b/build/common/installer/scripts/tomlparser-agent-config.rb similarity index 87% rename from build/linux/installer/scripts/tomlparser-agent-config.rb rename to build/common/installer/scripts/tomlparser-agent-config.rb index 4daaf6a0c..052bb5a5d 100644 --- a/build/linux/installer/scripts/tomlparser-agent-config.rb +++ b/build/common/installer/scripts/tomlparser-agent-config.rb @@ -228,3 +228,37 @@ def populateSettingValuesFromConfigMap(parsedConfig) puts "Exception while opening file for writing config environment variables" puts "****************End Config Processing********************" end + +def get_command_windows(env_variable_name, env_variable_value) + return "[System.Environment]::SetEnvironmentVariable(\"#{env_variable_name}\", \"#{env_variable_value}\", \"Process\")" + "\n" + "[System.Environment]::SetEnvironmentVariable(\"#{env_variable_name}\", \"#{env_variable_value}\", \"Machine\")" + "\n" +end + +if !@os_type.nil? && !@os_type.empty? && @os_type.strip.casecmp("windows") == 0 + # Write the settings to file, so that they can be set as environment variables + file = File.open("setagentenv.ps1", "w") + + if !file.nil? + if @fbitFlushIntervalSecs > 0 + commands = get_command_windows('FBIT_SERVICE_FLUSH_INTERVAL', @fbitFlushIntervalSecs) + file.write(commands) + end + if @fbitTailBufferChunkSizeMBs > 0 + commands = get_command_windows('FBIT_TAIL_BUFFER_CHUNK_SIZE', @fbitTailBufferChunkSizeMBs) + file.write(commands) + end + if @fbitTailBufferMaxSizeMBs > 0 + commands = get_command_windows('FBIT_TAIL_BUFFER_MAX_SIZE', @fbitTailBufferMaxSizeMBs) + file.write(commands) + end + if @fbitTailMemBufLimitMBs > 0 + commands = get_command_windows('FBIT_TAIL_MEM_BUF_LIMIT', @fbitTailMemBufLimitMBs) + file.write(commands) + end + # Close file after writing all environment variables + file.close + puts "****************End Config Processing********************" + else + puts "Exception while opening file for writing config environment variables for WINDOWS LOG" + puts "****************End Config Processing********************" + end +end \ No newline at end of file diff --git a/build/common/installer/scripts/tomlparser.rb b/build/common/installer/scripts/tomlparser.rb index 03b470205..64d6d48fb 100644 --- a/build/common/installer/scripts/tomlparser.rb +++ b/build/common/installer/scripts/tomlparser.rb @@ -29,6 +29,8 @@ @adxDatabaseName = "containerinsights" # default for all configurations if !@os_type.nil? && !@os_type.empty? 
&& @os_type.strip.casecmp("windows") == 0 @containerLogsRoute = "v1" # default is v1 for windows until windows agent integrates windows ama + # This path format is necessary for fluent-bit in windows + @logTailPath = "C:\\var\\log\\containers\\*.log" end # Use parser to parse the configmap toml file to a ruby structure def parseConfigMap diff --git a/build/linux/installer/datafiles/base_container.data b/build/linux/installer/datafiles/base_container.data index d104a5084..985c73a17 100644 --- a/build/linux/installer/datafiles/base_container.data +++ b/build/linux/installer/datafiles/base_container.data @@ -34,7 +34,7 @@ MAINTAINER: 'Microsoft Corporation' /etc/opt/microsoft/docker-cimprov/td-agent-bit.conf; build/linux/installer/conf/td-agent-bit.conf; 644; root; root /etc/opt/microsoft/docker-cimprov/td-agent-bit-prom-side-car.conf; build/linux/installer/conf/td-agent-bit-prom-side-car.conf; 644; root; root /etc/opt/microsoft/docker-cimprov/td-agent-bit-rs.conf; build/linux/installer/conf/td-agent-bit-rs.conf; 644; root; root -/etc/opt/microsoft/docker-cimprov/azm-containers-parser.conf; build/linux/installer/conf/azm-containers-parser.conf; 644; root; root +/etc/opt/microsoft/docker-cimprov/azm-containers-parser.conf; build/common/installer/conf/azm-containers-parser.conf; 644; root; root /etc/opt/microsoft/docker-cimprov/out_oms.conf; build/linux/installer/conf/out_oms.conf; 644; root; root /etc/opt/microsoft/docker-cimprov/test.json; build/linux/installer/conf/test.json; 644; root; root /etc/opt/microsoft/docker-cimprov/telegraf.conf; build/linux/installer/conf/telegraf.conf; 644; root; root @@ -48,7 +48,7 @@ MAINTAINER: 'Microsoft Corporation' /opt/tomlparser-metric-collection-config.rb; build/linux/installer/scripts/tomlparser-metric-collection-config.rb; 755; root; root -/opt/tomlparser-agent-config.rb; build/linux/installer/scripts/tomlparser-agent-config.rb; 755; root; root +/opt/tomlparser-agent-config.rb; build/common/installer/scripts/tomlparser-agent-config.rb; 755; root; root /opt/tomlparser.rb; build/common/installer/scripts/tomlparser.rb; 755; root; root /opt/td-agent-bit-conf-customizer.rb; build/common/installer/scripts/td-agent-bit-conf-customizer.rb; 755; root; root /opt/ConfigParseErrorLogger.rb; build/common/installer/scripts/ConfigParseErrorLogger.rb; 755; root; root diff --git a/build/windows/installer/conf/fluent-bit.conf b/build/windows/installer/conf/fluent-bit.conf index 1eebe5fd6..243056505 100644 --- a/build/windows/installer/conf/fluent-bit.conf +++ b/build/windows/installer/conf/fluent-bit.conf @@ -1,25 +1,54 @@ [SERVICE] - Flush 15 - Daemon Off - Log_Level info - Log_File /etc/fluent-bit/fluent-bit.log + #Default service flush interval is 15 seconds + ${SERVICE_FLUSH_INTERVAL} + Daemon Off + storage.path /etc/fluent-bit/flbstore/ + storage.sync normal + storage.checksum off + storage.backlog.mem_limit 10M + Log_Level info + Parsers_File /etc/fluent-bit/azm-containers-parser.conf + Log_File /etc/fluent-bit/fluent-bit.log [INPUT] - Name forward - Listen 127.0.0.1 - Port 25230 - Mem_Buf_Limit 10m - Chunk_Size 32 - Buffer_Size 64 + Name tail + Tag oms.container.log.la.* + Path ${AZMON_LOG_TAIL_PATH} + Read_from_Head true + DB C:\\var\\log\\omsagent-fblogs.db + DB.Sync Off + Parser docker + ${TAIL_MEM_BUF_LIMIT} + ${TAIL_BUFFER_CHUNK_SIZE} + ${TAIL_BUFFER_MAX_SIZE} + Rotate_Wait 20 + Refresh_Interval 30 + Path_Key filepath + Skip_Long_Lines On + Ignore_Older 5m + Exclude_Path ${AZMON_CLUSTER_LOG_TAIL_EXCLUDE_PATH} [INPUT] - Name tcp - Tag 
oms.container.perf.telegraf.* - Listen 0.0.0.0 - Port 25229 - Chunk_Size 32 - Buffer_Size 64 - Mem_Buf_Limit 5m + Name tail + Tag oms.container.log.flbplugin.* + Path C:\\var\\log\\containers\\omsagent*.log + Read_from_Head true + DB C:\\var\\log\\omsagent-fluentbit-containers.db + DB.Sync Off + Parser docker + Mem_Buf_Limit 1m + Path_Key filepath + Skip_Long_Lines On + Ignore_Older 2m + +[INPUT] + Name tcp + Tag oms.container.perf.telegraf.* + Listen 0.0.0.0 + Port 25229 + Chunk_Size 32 + Buffer_Size 64 + Mem_Buf_Limit 5m [OUTPUT] Name oms diff --git a/build/windows/installer/conf/fluent-cri-parser.conf b/build/windows/installer/conf/fluent-cri-parser.conf deleted file mode 100644 index 86f1572ca..000000000 --- a/build/windows/installer/conf/fluent-cri-parser.conf +++ /dev/null @@ -1,6 +0,0 @@ - - @type regexp - expression ^(? diff --git a/build/windows/installer/conf/fluent-docker-parser.conf b/build/windows/installer/conf/fluent-docker-parser.conf deleted file mode 100644 index 9dc800aeb..000000000 --- a/build/windows/installer/conf/fluent-docker-parser.conf +++ /dev/null @@ -1,5 +0,0 @@ - - @type json - time_format %Y-%m-%dT%H:%M:%S.%NZ - keep_time_key true - diff --git a/build/windows/installer/conf/fluent.conf b/build/windows/installer/conf/fluent.conf index 741e5ce19..a78ac58fa 100644 --- a/build/windows/installer/conf/fluent.conf +++ b/build/windows/installer/conf/fluent.conf @@ -11,31 +11,6 @@ @log_level debug - - @type tail - path "#{ENV['AZMON_LOG_TAIL_PATH']}" - exclude_path "#{ENV['AZMON_CLUSTER_LOG_TAIL_EXCLUDE_PATH']}" - pos_file /var/opt/microsoft/fluent/fluentd-containers.log.pos - tag oms.container.log.la - @log_level trace - path_key tailed_path - limit_recently_modified 5m - # if the container runtime is non docker then this will be updated to fluent-cri-parser.conf during container startup - @include fluent-docker-parser.conf - - - - @type tail - path /var/log/containers/omsagent*.log - pos_file /opt/microsoft/fluent/omsagent-fluentd-containers.log.pos - tag oms.container.log.flbplugin - @log_level trace - path_key tailed_path - read_from_head true - # if the container runtime is non docker then this will be updated to fluent-cri-parser.conf during container startup - @include fluent-docker-parser.conf - - #custom_metrics_mdm filter plugin @type cadvisor2mdm @@ -44,23 +19,6 @@ @log_level info - - @type grep - - key stream - pattern "#{ENV['AZMON_LOG_EXCLUSION_REGEX_PATTERN']}" - - - - - @type record_transformer - # fluent-plugin-record-modifier more light-weight but needs to be installed (dependency worth it?) 
- remove_keys tailed_path - - filepath ${record["tailed_path"]} - - - @type mdm @log_level debug @@ -77,29 +35,3 @@ retry_mdm_post_wait_minutes 30 - - - @type forward - send_timeout 60s - recover_wait 10s - hard_timeout 60s - heartbeat_type none - ignore_network_errors_at_startup true - - name logaggregationserver - host 127.0.0.1 - port 25230 - weight 60 - - - - overflow_action throw_exception - chunk_limit_size 32k - queued_chunks_limit_size 256 - flush_interval 1 - flush_thread_interval 0.5 - flush_thread_burst_interval 0.01 - flush_thread_count 4 - retry_forever true - - diff --git a/kubernetes/linux/setup.sh b/kubernetes/linux/setup.sh index 5bddfc604..80a1b5b1d 100644 --- a/kubernetes/linux/setup.sh +++ b/kubernetes/linux/setup.sh @@ -44,7 +44,7 @@ chmod 777 /opt/telegraf wget -qO - https://packages.fluentbit.io/fluentbit.key | sudo apt-key add - sudo echo "deb https://packages.fluentbit.io/ubuntu/xenial xenial main" >> /etc/apt/sources.list sudo apt-get update -sudo apt-get install td-agent-bit=1.6.8 -y +sudo apt-get install td-agent-bit=1.7.8 -y # install ruby2.6 sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys F5DA5F09C3173AA6 diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index a1a843196..152f2313b 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -783,6 +783,12 @@ spec: cpu: 200m memory: 600Mi env: + - name: FBIT_SERVICE_FLUSH_INTERVAL + value: "15" + - name: FBIT_TAIL_BUFFER_CHUNK_SIZE + value: "1" + - name: FBIT_TAIL_BUFFER_MAX_SIZE + value: "1" # azure devops pipeline uses AKS_RESOURCE_ID and AKS_REGION hence ensure to uncomment these - name: AKS_RESOURCE_ID value: "VALUE_AKS_RESOURCE_ID_VALUE" diff --git a/kubernetes/windows/Dockerfile b/kubernetes/windows/Dockerfile index 41ad7e7ba..671a89246 100644 --- a/kubernetes/windows/Dockerfile +++ b/kubernetes/windows/Dockerfile @@ -57,10 +57,8 @@ COPY ./omsagentwindows/out_oms.so /opt/omsagentwindows/out_oms.so # copy fluent, fluent-bit and out_oms conf files COPY ./omsagentwindows/installer/conf/fluent.conf /etc/fluent/ -# copy fluent docker and cri parser conf files -COPY ./omsagentwindows/installer/conf/fluent-cri-parser.conf /etc/fluent/ -COPY ./omsagentwindows/installer/conf/fluent-docker-parser.conf /etc/fluent/ COPY ./omsagentwindows/installer/conf/fluent-bit.conf /etc/fluent-bit +COPY ./omsagentwindows/installer/conf/azm-containers-parser.conf /etc/fluent-bit/ COPY ./omsagentwindows/installer/conf/out_oms.conf /etc/omsagentwindows # copy telegraf conf file diff --git a/kubernetes/windows/Dockerfile-dev-image b/kubernetes/windows/Dockerfile-dev-image index 35aa83bd9..c38889f7b 100644 --- a/kubernetes/windows/Dockerfile-dev-image +++ b/kubernetes/windows/Dockerfile-dev-image @@ -19,10 +19,8 @@ COPY ./omsagentwindows/out_oms.so /opt/omsagentwindows/out_oms.so # copy fluent, fluent-bit and out_oms conf files COPY ./omsagentwindows/installer/conf/fluent.conf /etc/fluent/ -# copy fluent docker and cri parser conf files -COPY ./omsagentwindows/installer/conf/fluent-cri-parser.conf /etc/fluent/ -COPY ./omsagentwindows/installer/conf/fluent-docker-parser.conf /etc/fluent/ COPY ./omsagentwindows/installer/conf/fluent-bit.conf /etc/fluent-bit +COPY ./omsagentwindows/installer/conf/azm-containers-parser.conf /etc/fluent-bit/ COPY ./omsagentwindows/installer/conf/out_oms.conf /etc/omsagentwindows # copy telegraf conf file diff --git a/kubernetes/windows/main.ps1 b/kubernetes/windows/main.ps1 index 3cbc11e20..733ddb408 100644 --- a/kubernetes/windows/main.ps1 +++ 
b/kubernetes/windows/main.ps1 @@ -297,6 +297,13 @@ function Set-EnvironmentVariables { # run config parser ruby /opt/omsagentwindows/scripts/ruby/tomlparser.rb .\setenv.ps1 + + #Parse the configmap to set the right environment variables for agent config. + ruby /opt/omsagentwindows/scripts/ruby/tomlparser-agent-config.rb + .\setagentenv.ps1 + + #Replace placeholders in fluent-bit.conf + ruby /opt/omsagentwindows/scripts/ruby/td-agent-bit-conf-customizer.rb # run mdm config parser ruby /opt/omsagentwindows/scripts/ruby/tomlparser-mdm-metrics-config.rb @@ -418,18 +425,18 @@ function Get-ContainerRuntime { function Start-Fluent-Telegraf { - # Run fluent-bit service first so that we do not miss any logs being forwarded by the fluentd service and telegraf service. + $containerRuntime = Get-ContainerRuntime + + # Run fluent-bit service first so that we do not miss any logs being forwarded by the telegraf service. # Run fluent-bit as a background job. Switch this to a windows service once fluent-bit supports natively running as a windows service Start-Job -ScriptBlock { Start-Process -NoNewWindow -FilePath "C:\opt\fluent-bit\bin\fluent-bit.exe" -ArgumentList @("-c", "C:\etc\fluent-bit\fluent-bit.conf", "-e", "C:\opt\omsagentwindows\out_oms.so") } - $containerRuntime = Get-ContainerRuntime - #register fluentd as a service and start # there is a known issues with win32-service https://github.com/chef/win32-service/issues/70 if (![string]::IsNullOrEmpty($containerRuntime) -and [string]$containerRuntime.StartsWith('docker') -eq $false) { # change parser from docker to cri if the container runtime is not docker Write-Host "changing parser from Docker to CRI since container runtime : $($containerRuntime) and which is non-docker" - (Get-Content -Path C:/etc/fluent/fluent.conf -Raw) -replace 'fluent-docker-parser.conf', 'fluent-cri-parser.conf' | Set-Content C:/etc/fluent/fluent.conf + (Get-Content -Path C:/etc/fluent-bit/fluent-bit.conf -Raw) -replace 'docker', 'cri' | Set-Content C:/etc/fluent-bit/fluent-bit.conf } # Start telegraf only in sidecar scraping mode diff --git a/kubernetes/windows/setup.ps1 b/kubernetes/windows/setup.ps1 index 3e47b7eb2..8742fba8b 100644 --- a/kubernetes/windows/setup.ps1 +++ b/kubernetes/windows/setup.ps1 @@ -1,8 +1,3 @@ -# -################# Dangerous to use appveyor links - the builds are removed after 6 months -# -#ARG FLUENTBIT_URL=https://ci.appveyor.com/api/buildjobs/37lho3xf8j5i6crj/artifacts/build%2Ftd-agent-bit-1.4.0-win64.zip - Write-Host ('Creating folder structure') New-Item -Type Directory -Path /installation -ErrorAction SilentlyContinue @@ -21,7 +16,7 @@ Write-Host ('Creating folder structure') Write-Host ('Installing Fluent Bit'); try { - $fluentBitUri='https://github.com/microsoft/OMS-docker/releases/download/winakslogagent/td-agent-bit-1.4.0-win64.zip' + $fluentBitUri='https://fluentbit.io/releases/1.7/td-agent-bit-1.7.8-win64.zip' Invoke-WebRequest -Uri $fluentBitUri -OutFile /installation/td-agent-bit.zip Expand-Archive -Path /installation/td-agent-bit.zip -Destination /installation/fluent-bit Move-Item -Path /installation/fluent-bit/*/* -Destination /opt/fluent-bit/ -ErrorAction SilentlyContinue From 5b9988cab76c4f78af17dc240de4e08a489a1e97 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Fri, 21 Jan 2022 15:04:00 -0800 Subject: [PATCH 183/194] default to port 10250 & containerd for linux agent (#699) * default to port 10250 & containerd * fix pr feedback --- kubernetes/linux/main.sh | 29 +++++++++++++++++------------ 1 file changed, 17 
insertions(+), 12 deletions(-) diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index a9184ab53..980c15586 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -206,7 +206,7 @@ if [ -e "/etc/omsagent-secret/WSID" ]; then echo "export MDSD_PROXY_USERNAME=$MDSD_PROXY_USERNAME" >> ~/.bashrc export MDSD_PROXY_PASSWORD_FILE=/opt/microsoft/docker-cimprov/proxy_password echo "export MDSD_PROXY_PASSWORD_FILE=$MDSD_PROXY_PASSWORD_FILE" >> ~/.bashrc - + #TODO: Compression + proxy creates a deserialization error in ODS. This needs a fix in MDSD export MDSD_ODS_COMPRESSION_LEVEL=0 echo "export MDSD_ODS_COMPRESSION_LEVEL=$MDSD_ODS_COMPRESSION_LEVEL" >> ~/.bashrc @@ -425,19 +425,24 @@ fi #Setting environment variable for CAdvisor metrics to use port 10255/10250 based on curl request echo "Making wget request to cadvisor endpoint with port 10250" -#Defaults to use port 10255 -cAdvisorIsSecure=false -RET_CODE=`wget --server-response https://$NODE_IP:10250/stats/summary --no-check-certificate --header="Authorization: Bearer $(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" 2>&1 | awk '/^ HTTP/{print $2}'` -if [ $RET_CODE -eq 200 ]; then - cAdvisorIsSecure=true +#Defaults to use secure port: 10250 +cAdvisorIsSecure=true +RET_CODE=$(wget --server-response https://$NODE_IP:10250/stats/summary --no-check-certificate --header="Authorization: Bearer $(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" 2>&1 | awk '/^ HTTP/{print $2}') +if [ -z "$RET_CODE" ] || [ $RET_CODE -ne 200 ]; then + echo "Making wget request to cadvisor endpoint with port 10255 since failed with port 10250" + RET_CODE=$(wget --server-response http://$NODE_IP:10255/stats/summary 2>&1 | awk '/^ HTTP/{print $2}') + if [ ! -z "$RET_CODE" ] && [ $RET_CODE -eq 200 ]; then + cAdvisorIsSecure=false + fi fi -# default to docker since this is default in AKS as of now and change to containerd once this becomes default in AKS -export CONTAINER_RUNTIME="docker" +# default to containerd since this is common default in AKS and non-AKS +export CONTAINER_RUNTIME="containerd" export NODE_NAME="" + if [ "$cAdvisorIsSecure" = true ]; then - echo "Wget request using port 10250 succeeded. Using 10250" + echo "Using port 10250" export IS_SECURE_CADVISOR_PORT=true echo "export IS_SECURE_CADVISOR_PORT=true" >> ~/.bashrc export CADVISOR_METRICS_URL="https://$NODE_IP:10250/metrics" @@ -445,7 +450,7 @@ if [ "$cAdvisorIsSecure" = true ]; then echo "Making curl request to cadvisor endpoint /pods with port 10250 to get the configured container runtime on kubelet" podWithValidContainerId=$(curl -s -k -H "Authorization: Bearer $(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" https://$NODE_IP:10250/pods | jq -R 'fromjson? | [ .items[] | select( any(.status.phase; contains("Running")) ) ] | .[0]') else - echo "Wget request using port 10250 failed. Using port 10255" + echo "Using port 10255" export IS_SECURE_CADVISOR_PORT=false echo "export IS_SECURE_CADVISOR_PORT=false" >> ~/.bashrc export CADVISOR_METRICS_URL="http://$NODE_IP:10255/metrics" @@ -460,10 +465,10 @@ if [ ! 
-z "$podWithValidContainerId" ]; then # convert to lower case so that everywhere else can be used in lowercase containerRuntime=$(echo $containerRuntime | tr "[:upper:]" "[:lower:]") nodeName=$(echo $nodeName | tr "[:upper:]" "[:lower:]") - # update runtime only if its not empty, not null and not startswith docker + # use default container runtime if obtained runtime value is either empty or null if [ -z "$containerRuntime" -o "$containerRuntime" == null ]; then echo "using default container runtime as $CONTAINER_RUNTIME since got containeRuntime as empty or null" - elif [[ $containerRuntime != docker* ]]; then + else export CONTAINER_RUNTIME=$containerRuntime fi From 4c460c6509b58cb942a2f8a17e6176453dbaf7cc Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Mon, 24 Jan 2022 10:24:48 -0800 Subject: [PATCH 184/194] Updating pod annotation for latest agent version (#697) --- kubernetes/omsagent.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 152f2313b..5a77f3563 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -357,7 +357,7 @@ spec: component: oms-agent tier: node annotations: - agentVersion: "1.10.0.1" + agentVersion: "azure-mdsd-1.14.2" dockerProviderVersion: "16.0.0-0" schema-versions: "v1" spec: @@ -596,7 +596,7 @@ spec: labels: rsName: "omsagent-rs" annotations: - agentVersion: "1.10.0.1" + agentVersion: "azure-mdsd-1.14.2" dockerProviderVersion: "16.0.0-0" schema-versions: "v1" spec: @@ -765,7 +765,7 @@ spec: component: oms-agent-win tier: node-win annotations: - agentVersion: "1.10.0.1" + agentVersion: "0.0.0-0" dockerProviderVersion: "16.0.0-0" schema-versions: "v1" spec: From f2c2904b38117971030b8f3fddcb7ea3bdc10aa2 Mon Sep 17 00:00:00 2001 From: bragi92 Date: Wed, 26 Jan 2022 00:07:59 +0530 Subject: [PATCH 185/194] fix windows build failure due to msys2 version (#700) * fix windows build failure due to msys2 version * 20211130.0.0 --- kubernetes/windows/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kubernetes/windows/Dockerfile b/kubernetes/windows/Dockerfile index 671a89246..55bedf7f5 100644 --- a/kubernetes/windows/Dockerfile +++ b/kubernetes/windows/Dockerfile @@ -10,7 +10,7 @@ ARG IMAGE_TAG=win-ciprod10132021 RUN powershell -Command "Set-ExecutionPolicy Bypass -Scope Process -Force; iex ((New-Object System.Net.WebClient).DownloadString('https://chocolatey.org/install.ps1'))" # Fluentd depends on cool.io whose fat gem is only available for Ruby < 2.5, so need to specify --platform ruby when install Ruby > 2.5 and install msys2 to get dev tools RUN choco install -y ruby --version 2.6.5.1 --params "'/InstallDir:C:\ruby26'" \ -&& choco install -y msys2 --version 20210604.0.0 --params "'/NoPath /NoUpdate /InstallDir:C:\ruby26\msys64'" \ +&& choco install -y msys2 --version 20211130.0.0 --params "'/NoPath /NoUpdate /InstallDir:C:\ruby26\msys64'" \ && choco install -y vim # gangams - optional MSYS2 update via ridk failing in merged docker file so skipping that since we dont need optional update From 78440cfb0ea8674565fc6fc5fdf907da88082f89 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Thu, 27 Jan 2022 16:25:06 -0800 Subject: [PATCH 186/194] Jan agent tasks (#698) --- .github/workflows/pr-checker.yml | 2 +- build/linux/installer/conf/telegraf.conf | 2 +- charts/azuremonitor-containers/values.yaml | 2 +- kubernetes/linux/setup.sh | 7 ++++--- kubernetes/omsagent.yaml | 2 +- kubernetes/windows/setup.ps1 | 2 +- 6 files changed, 9 
insertions(+), 8 deletions(-) diff --git a/.github/workflows/pr-checker.yml b/.github/workflows/pr-checker.yml index 8a7e542b3..723f22dc7 100644 --- a/.github/workflows/pr-checker.yml +++ b/.github/workflows/pr-checker.yml @@ -56,7 +56,7 @@ jobs: format: 'table' severity: 'CRITICAL,HIGH' vuln-type: 'os,library' - skip-dirs: '/opt,/usr/sbin' + skip-dirs: '/usr/sbin' exit-code: '1' timeout: '5m0s' WINDOWS-build: diff --git a/build/linux/installer/conf/telegraf.conf b/build/linux/installer/conf/telegraf.conf index 0e4824e70..b0a8730c6 100644 --- a/build/linux/installer/conf/telegraf.conf +++ b/build/linux/installer/conf/telegraf.conf @@ -425,7 +425,7 @@ # Below due to Bug - https://github.com/influxdata/telegraf/issues/5615 # ORDER matters here!! - i.e the below should be the LAST modifier [inputs.disk.tagdrop] - path = ["/var/lib/kubelet*", "/dev/termination-log", "/var/log", "/etc/hosts", "/etc/resolv.conf", "/etc/hostname", "/etc/kubernetes/host", "/var/lib/docker/containers", "/etc/config/settings"] + path = ["/var/lib/kubelet*", "/dev/termination-log", "/var/log", "/etc/hosts", "/etc/resolv.conf", "/etc/hostname", "/etc/kubernetes/host", "/var/lib/docker/containers", "/etc/config/settings", "/run/host/containerd/io.containerd.runtime.v2.task/k8s.io/*"] # Read metrics about memory usage diff --git a/charts/azuremonitor-containers/values.yaml b/charts/azuremonitor-containers/values.yaml index 3ca313d38..d5d7ad2e1 100644 --- a/charts/azuremonitor-containers/values.yaml +++ b/charts/azuremonitor-containers/values.yaml @@ -178,7 +178,7 @@ omsagent: memory: 750Mi daemonsetwindows: limits: - cpu: 200m + cpu: 500m memory: 600Mi deployment: requests: diff --git a/kubernetes/linux/setup.sh b/kubernetes/linux/setup.sh index 80a1b5b1d..872ac99cf 100644 --- a/kubernetes/linux/setup.sh +++ b/kubernetes/linux/setup.sh @@ -30,10 +30,10 @@ sudo apt-get install jq=1.5+dfsg-2 -y #used to setcaps for ruby process to read /proc/env sudo apt-get install libcap2-bin -y -wget https://dl.influxdata.com/telegraf/releases/telegraf-1.18.0_linux_amd64.tar.gz -tar -zxvf telegraf-1.18.0_linux_amd64.tar.gz +wget https://dl.influxdata.com/telegraf/releases/telegraf-1.20.3_linux_amd64.tar.gz +tar -zxvf telegraf-1.20.3_linux_amd64.tar.gz -mv /opt/telegraf-1.18.0/usr/bin/telegraf /opt/telegraf +mv /opt/telegraf-1.20.3/usr/bin/telegraf /opt/telegraf chmod 777 /opt/telegraf @@ -61,6 +61,7 @@ rm -f $TMPDIR/docker-cimprov*.sh rm -f $TMPDIR/azure-mdsd*.deb rm -f $TMPDIR/mdsd.xml rm -f $TMPDIR/envmdsd +rm -f $TMPDIR/telegraf-*.tar.gz # remove build dependencies sudo apt-get remove ruby2.6-dev gcc make -y diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 5a77f3563..248276a08 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -780,7 +780,7 @@ spec: imagePullPolicy: IfNotPresent resources: limits: - cpu: 200m + cpu: 500m memory: 600Mi env: - name: FBIT_SERVICE_FLUSH_INTERVAL diff --git a/kubernetes/windows/setup.ps1 b/kubernetes/windows/setup.ps1 index 8742fba8b..857f9f690 100644 --- a/kubernetes/windows/setup.ps1 +++ b/kubernetes/windows/setup.ps1 @@ -31,7 +31,7 @@ Write-Host ('Finished Installing Fluentbit') Write-Host ('Installing Telegraf'); try { - $telegrafUri='https://dl.influxdata.com/telegraf/releases/telegraf-1.18.0_windows_amd64.zip' + $telegrafUri='https://dl.influxdata.com/telegraf/releases/telegraf-1.20.3_windows_amd64.zip' Invoke-WebRequest -Uri $telegrafUri -OutFile /installation/telegraf.zip Expand-Archive -Path /installation/telegraf.zip -Destination /installation/telegraf 
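# the telegraf zip expands into a versioned subfolder (e.g. telegraf-1.20.3), hence
# the /*/* glob below to move its contents up into /opt/telegraf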
Move-Item -Path /installation/telegraf/*/* -Destination /opt/telegraf/ -ErrorAction SilentlyContinue From 3dce72f090a5e866e4d688f73f50594b8dde3b9b Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Thu, 27 Jan 2022 17:02:33 -0800 Subject: [PATCH 187/194] remove v1 fallback hidden option (#705) --- source/plugins/go/src/oms.go | 277 +++++++++++++++++------------------ 1 file changed, 136 insertions(+), 141 deletions(-) diff --git a/source/plugins/go/src/oms.go b/source/plugins/go/src/oms.go index ee221a60b..8c7695346 100644 --- a/source/plugins/go/src/oms.go +++ b/source/plugins/go/src/oms.go @@ -21,9 +21,10 @@ import ( "github.com/google/uuid" "github.com/tinylib/msgp/msgp" - lumberjack "gopkg.in/natefinch/lumberjack.v2" "Docker-Provider/source/plugins/go/src/extension" + lumberjack "gopkg.in/natefinch/lumberjack.v2" + "github.com/Azure/azure-kusto-go/kusto/ingest" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" @@ -85,7 +86,6 @@ const WindowsContainerLogPluginConfFilePath = "/etc/omsagentwindows/out_oms.conf // IPName const IPName = "ContainerInsights" - const defaultContainerInventoryRefreshInterval = 60 const kubeMonAgentConfigEventFlushInterval = 60 @@ -102,9 +102,6 @@ const ContainerLogsV2Route = "v2" const ContainerLogsADXRoute = "adx" -//fallback option v1 route i.e. ODS direct if required in any case -const ContainerLogsV1Route = "v1" - //container logs schema (v2=ContainerLogsV2 table in LA, anything else ContainerLogs table in LA. This is applicable only if Container logs route is NOT ADX) const ContainerLogV2SchemaVersion = "v2" @@ -252,29 +249,29 @@ type DataItemLAv1 struct { // DataItemLAv2 == ContainerLogV2 table in LA // Please keep the names same as destination column names, to avoid transforming one to another in the pipeline type DataItemLAv2 struct { - TimeGenerated string `json:"TimeGenerated"` - Computer string `json:"Computer"` - ContainerId string `json:"ContainerId"` - ContainerName string `json:"ContainerName"` - PodName string `json:"PodName"` - PodNamespace string `json:"PodNamespace"` - LogMessage string `json:"LogMessage"` - LogSource string `json:"LogSource"` + TimeGenerated string `json:"TimeGenerated"` + Computer string `json:"Computer"` + ContainerId string `json:"ContainerId"` + ContainerName string `json:"ContainerName"` + PodName string `json:"PodName"` + PodNamespace string `json:"PodNamespace"` + LogMessage string `json:"LogMessage"` + LogSource string `json:"LogSource"` //PodLabels string `json:"PodLabels"` } // DataItemADX == ContainerLogV2 table in ADX type DataItemADX struct { - TimeGenerated string `json:"TimeGenerated"` - Computer string `json:"Computer"` - ContainerId string `json:"ContainerId"` - ContainerName string `json:"ContainerName"` - PodName string `json:"PodName"` - PodNamespace string `json:"PodNamespace"` - LogMessage string `json:"LogMessage"` - LogSource string `json:"LogSource"` + TimeGenerated string `json:"TimeGenerated"` + Computer string `json:"Computer"` + ContainerId string `json:"ContainerId"` + ContainerName string `json:"ContainerName"` + PodName string `json:"PodName"` + PodNamespace string `json:"PodNamespace"` + LogMessage string `json:"LogMessage"` + LogSource string `json:"LogSource"` //PodLabels string `json:"PodLabels"` - AzureResourceId string `json:"AzureResourceId"` + AzureResourceId string `json:"AzureResourceId"` } // telegraf metric DataItem represents the object corresponding to the json that is sent by fluentbit tail plugin @@ -299,15 +296,15 @@ type InsightsMetricsBlob 
struct { // ContainerLogBlob represents the object corresponding to the payload that is sent to the ODS end point type ContainerLogBlobLAv1 struct { - DataType string `json:"DataType"` - IPName string `json:"IPName"` + DataType string `json:"DataType"` + IPName string `json:"IPName"` DataItems []DataItemLAv1 `json:"DataItems"` } // ContainerLogBlob represents the object corresponding to the payload that is sent to the ODS end point type ContainerLogBlobLAv2 struct { - DataType string `json:"DataType"` - IPName string `json:"IPName"` + DataType string `json:"DataType"` + IPName string `json:"IPName"` DataItems []DataItemLAv2 `json:"DataItems"` } @@ -361,6 +358,7 @@ const ( // DataType to be used as enum per data type socket client creation type DataType int + const ( // DataType to be used as enum per data type socket client creation ContainerLogV2 DataType = iota @@ -628,12 +626,12 @@ func flushKubeMonAgentEventRecords() { Log(message) SendException(message) } else { - msgPackEntry := MsgPackEntry{ + msgPackEntry := MsgPackEntry{ Record: stringMap, } - msgPackEntries = append(msgPackEntries, msgPackEntry) - } - } + msgPackEntries = append(msgPackEntries, msgPackEntry) + } + } } } @@ -670,8 +668,8 @@ func flushKubeMonAgentEventRecords() { msgPackEntry := MsgPackEntry{ Record: stringMap, } - msgPackEntries = append(msgPackEntries, msgPackEntry) - } + msgPackEntries = append(msgPackEntries, msgPackEntry) + } } } } @@ -713,18 +711,18 @@ func flushKubeMonAgentEventRecords() { } else { if err := json.Unmarshal(jsonBytes, &stringMap); err != nil { message := fmt.Sprintf("Error while UnMarshalling json bytes to stringmap: %s", err.Error()) - Log(message) - SendException(message) + Log(message) + SendException(message) } else { msgPackEntry := MsgPackEntry{ Record: stringMap, - } - msgPackEntries = append(msgPackEntries, msgPackEntry) + } + msgPackEntries = append(msgPackEntries, msgPackEntry) } } } } - if (IsWindows == false && len(msgPackEntries) > 0) { //for linux, mdsd route + if IsWindows == false && len(msgPackEntries) > 0 { //for linux, mdsd route if IsAADMSIAuthMode == true && strings.HasPrefix(MdsdKubeMonAgentEventsTagName, MdsdOutputStreamIdTagPrefix) == false { Log("Info::mdsd::obtaining output stream id for data type: %s", KubeMonAgentEventDataType) MdsdKubeMonAgentEventsTagName = extension.GetInstance(FLBLogger, ContainerType).GetOutputStreamId(KubeMonAgentEventDataType) @@ -757,7 +755,7 @@ func flushKubeMonAgentEventRecords() { } else { numRecords := len(msgPackEntries) Log("FlushKubeMonAgentEventRecords::Info::Successfully flushed %d records that was %d bytes in %s", numRecords, bts, elapsed) - // Send telemetry to AppInsights resource + // Send telemetry to AppInsights resource SendEvent(KubeMonAgentEventsFlushedEvent, telemetryDimensions) } } else { @@ -788,8 +786,8 @@ func flushKubeMonAgentEventRecords() { if IsAADMSIAuthMode == true { IngestionAuthTokenUpdateMutex.Lock() - ingestionAuthToken := ODSIngestionAuthToken - IngestionAuthTokenUpdateMutex.Unlock() + ingestionAuthToken := ODSIngestionAuthToken + IngestionAuthTokenUpdateMutex.Unlock() if ingestionAuthToken == "" { Log("Error::ODS Ingestion Auth Token is empty. 
Please check error log.") } @@ -910,77 +908,77 @@ func PostTelegrafMetricsToLA(telegrafRecords []map[interface{}]interface{}) int var msgPackEntries []MsgPackEntry var i int start := time.Now() - var elapsed time.Duration + var elapsed time.Duration for i = 0; i < len(laMetrics); i++ { - var interfaceMap map[string]interface{} - stringMap := make(map[string]string) - jsonBytes, err := json.Marshal(*laMetrics[i]) - if err != nil { - message := fmt.Sprintf("PostTelegrafMetricsToLA::Error:when marshalling json %q", err) + var interfaceMap map[string]interface{} + stringMap := make(map[string]string) + jsonBytes, err := json.Marshal(*laMetrics[i]) + if err != nil { + message := fmt.Sprintf("PostTelegrafMetricsToLA::Error:when marshalling json %q", err) + Log(message) + SendException(message) + return output.FLB_OK + } else { + if err := json.Unmarshal(jsonBytes, &interfaceMap); err != nil { + message := fmt.Sprintf("Error while UnMarshalling json bytes to interfaceMap: %s", err.Error()) Log(message) SendException(message) return output.FLB_OK } else { - if err := json.Unmarshal(jsonBytes, &interfaceMap); err != nil { - message := fmt.Sprintf("Error while UnMarshalling json bytes to interfaceMap: %s", err.Error()) - Log(message) - SendException(message) - return output.FLB_OK - } else { - for key, value := range interfaceMap { - strKey := fmt.Sprintf("%v", key) - strValue := fmt.Sprintf("%v", value) - stringMap[strKey] = strValue - } - msgPackEntry := MsgPackEntry{ - Record: stringMap, - } - msgPackEntries = append(msgPackEntries, msgPackEntry) + for key, value := range interfaceMap { + strKey := fmt.Sprintf("%v", key) + strValue := fmt.Sprintf("%v", value) + stringMap[strKey] = strValue + } + msgPackEntry := MsgPackEntry{ + Record: stringMap, } + msgPackEntries = append(msgPackEntries, msgPackEntry) } + } } - if (len(msgPackEntries) > 0) { - if IsAADMSIAuthMode == true && (strings.HasPrefix(MdsdInsightsMetricsTagName, MdsdOutputStreamIdTagPrefix) == false) { - Log("Info::mdsd::obtaining output stream id for InsightsMetricsDataType since Log Analytics AAD MSI Auth Enabled") - MdsdInsightsMetricsTagName = extension.GetInstance(FLBLogger, ContainerType).GetOutputStreamId(InsightsMetricsDataType) - } - msgpBytes := convertMsgPackEntriesToMsgpBytes(MdsdInsightsMetricsTagName, msgPackEntries) + if len(msgPackEntries) > 0 { + if IsAADMSIAuthMode == true && (strings.HasPrefix(MdsdInsightsMetricsTagName, MdsdOutputStreamIdTagPrefix) == false) { + Log("Info::mdsd::obtaining output stream id for InsightsMetricsDataType since Log Analytics AAD MSI Auth Enabled") + MdsdInsightsMetricsTagName = extension.GetInstance(FLBLogger, ContainerType).GetOutputStreamId(InsightsMetricsDataType) + } + msgpBytes := convertMsgPackEntriesToMsgpBytes(MdsdInsightsMetricsTagName, msgPackEntries) + if MdsdInsightsMetricsMsgpUnixSocketClient == nil { + Log("Error::mdsd::mdsd connection does not exist. re-connecting ...") + CreateMDSDClient(InsightsMetrics, ContainerType) if MdsdInsightsMetricsMsgpUnixSocketClient == nil { - Log("Error::mdsd::mdsd connection does not exist. re-connecting ...") - CreateMDSDClient(InsightsMetrics, ContainerType) - if MdsdInsightsMetricsMsgpUnixSocketClient == nil { - Log("Error::mdsd::Unable to create mdsd client for insights metrics. 
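The marshal/unmarshal dance in PostTelegrafMetricsToLA above is a generic flattening step: any JSON-serializable value is round-tripped through a map[string]interface{} and stringified so the downstream record is a flat map[string]string. Pulled out in isolation as a runnable Go sketch:

package main

import (
	"encoding/json"
	"fmt"
)

// toStringMap mirrors the flatten pattern used above: marshal the value,
// unmarshal into an interface map, then stringify every key and value.
func toStringMap(v interface{}) (map[string]string, error) {
	jsonBytes, err := json.Marshal(v)
	if err != nil {
		return nil, err
	}
	var interfaceMap map[string]interface{}
	if err := json.Unmarshal(jsonBytes, &interfaceMap); err != nil {
		return nil, err
	}
	stringMap := make(map[string]string, len(interfaceMap))
	for key, value := range interfaceMap {
		stringMap[fmt.Sprintf("%v", key)] = fmt.Sprintf("%v", value)
	}
	return stringMap, nil
}

func main() {
	m, _ := toStringMap(struct {
		Name  string  `json:"name"`
		Value float64 `json:"value"`
	}{"cpuUsageNanoCores", 12345})
	fmt.Println(m) // map[name:cpuUsageNanoCores value:12345]
}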
Please check error log.") - ContainerLogTelemetryMutex.Lock() - defer ContainerLogTelemetryMutex.Unlock() - InsightsMetricsMDSDClientCreateErrors += 1 - return output.FLB_RETRY - } - } - - deadline := 10 * time.Second - MdsdInsightsMetricsMsgpUnixSocketClient.SetWriteDeadline(time.Now().Add(deadline)) //this is based of clock time, so cannot reuse - bts, er := MdsdInsightsMetricsMsgpUnixSocketClient.Write(msgpBytes) - - elapsed = time.Since(start) - - if er != nil { - Log("Error::mdsd::Failed to write to mdsd %d records after %s. Will retry ... error : %s", len(msgPackEntries), elapsed, er.Error()) - UpdateNumTelegrafMetricsSentTelemetry(0, 1, 0) - if MdsdInsightsMetricsMsgpUnixSocketClient != nil { - MdsdInsightsMetricsMsgpUnixSocketClient.Close() - MdsdInsightsMetricsMsgpUnixSocketClient = nil - } - + Log("Error::mdsd::Unable to create mdsd client for insights metrics. Please check error log.") ContainerLogTelemetryMutex.Lock() defer ContainerLogTelemetryMutex.Unlock() InsightsMetricsMDSDClientCreateErrors += 1 return output.FLB_RETRY - } else { - numTelegrafMetricsRecords := len(msgPackEntries) - UpdateNumTelegrafMetricsSentTelemetry(numTelegrafMetricsRecords, 0, 0) - Log("Success::mdsd::Successfully flushed %d telegraf metrics records that was %d bytes to mdsd in %s ", numTelegrafMetricsRecords, bts, elapsed) } + } + + deadline := 10 * time.Second + MdsdInsightsMetricsMsgpUnixSocketClient.SetWriteDeadline(time.Now().Add(deadline)) //this is based of clock time, so cannot reuse + bts, er := MdsdInsightsMetricsMsgpUnixSocketClient.Write(msgpBytes) + + elapsed = time.Since(start) + + if er != nil { + Log("Error::mdsd::Failed to write to mdsd %d records after %s. Will retry ... error : %s", len(msgPackEntries), elapsed, er.Error()) + UpdateNumTelegrafMetricsSentTelemetry(0, 1, 0) + if MdsdInsightsMetricsMsgpUnixSocketClient != nil { + MdsdInsightsMetricsMsgpUnixSocketClient.Close() + MdsdInsightsMetricsMsgpUnixSocketClient = nil + } + + ContainerLogTelemetryMutex.Lock() + defer ContainerLogTelemetryMutex.Unlock() + InsightsMetricsMDSDClientCreateErrors += 1 + return output.FLB_RETRY + } else { + numTelegrafMetricsRecords := len(msgPackEntries) + UpdateNumTelegrafMetricsSentTelemetry(numTelegrafMetricsRecords, 0, 0) + Log("Success::mdsd::Successfully flushed %d telegraf metrics records that was %d bytes to mdsd in %s ", numTelegrafMetricsRecords, bts, elapsed) + } } } else { // for windows, ODS direct @@ -1117,12 +1115,12 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { stringMap = make(map[string]string) //below id & name are used by latency telemetry in both v1 & v2 LA schemas id := "" - name := "" + name := "" logEntry := ToString(record["log"]) logEntryTimeStamp := ToString(record["time"]) //ADX Schema & LAv2 schema are almost the same (except resourceId) - if (ContainerLogSchemaV2 == true || ContainerLogsRouteADX == true) { + if ContainerLogSchemaV2 == true || ContainerLogsRouteADX == true { stringMap["Computer"] = Computer stringMap["ContainerId"] = containerID stringMap["ContainerName"] = containerName @@ -1171,29 +1169,29 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { stringMap["AzureResourceId"] = "" } dataItemADX = DataItemADX{ - TimeGenerated: stringMap["TimeGenerated"], - Computer: stringMap["Computer"], - ContainerId: stringMap["ContainerId"], - ContainerName: stringMap["ContainerName"], - PodName: stringMap["PodName"], - PodNamespace: stringMap["PodNamespace"], - LogMessage: stringMap["LogMessage"], - LogSource: 
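The mdsd write path above follows a reconnect-on-error discipline: set a fresh write deadline before every write (it is wall-clock based, so it cannot be reused), and on failure close and nil the client so the next flush re-dials. A stripped-down Go sketch of that discipline; the socket path here is hypothetical:

package main

import (
	"log"
	"net"
	"time"
)

// writeWithDeadline writes one payload with a 10s deadline, matching the
// pattern above; on error the connection is torn down so the caller
// reconnects on the next attempt.
func writeWithDeadline(conn net.Conn, payload []byte) error {
	conn.SetWriteDeadline(time.Now().Add(10 * time.Second)) // must be reset before each write
	if _, err := conn.Write(payload); err != nil {
		conn.Close() // drop the client; caller re-dials next flush
		return err
	}
	return nil
}

func main() {
	conn, err := net.Dial("unix", "/var/run/mdsd/default_fluent.socket") // hypothetical path
	if err != nil {
		log.Fatalf("dial failed: %v", err)
	}
	if err := writeWithDeadline(conn, []byte("ping")); err != nil {
		log.Printf("write failed, will retry with a fresh connection: %v", err)
	}
}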
stringMap["LogSource"], - AzureResourceId: stringMap["AzureResourceId"], + TimeGenerated: stringMap["TimeGenerated"], + Computer: stringMap["Computer"], + ContainerId: stringMap["ContainerId"], + ContainerName: stringMap["ContainerName"], + PodName: stringMap["PodName"], + PodNamespace: stringMap["PodNamespace"], + LogMessage: stringMap["LogMessage"], + LogSource: stringMap["LogSource"], + AzureResourceId: stringMap["AzureResourceId"], } //ADX dataItemsADX = append(dataItemsADX, dataItemADX) } else { - if (ContainerLogSchemaV2 == true) { + if ContainerLogSchemaV2 == true { dataItemLAv2 = DataItemLAv2{ - TimeGenerated: stringMap["TimeGenerated"], - Computer: stringMap["Computer"], - ContainerId: stringMap["ContainerId"], - ContainerName: stringMap["ContainerName"], - PodName: stringMap["PodName"], - PodNamespace: stringMap["PodNamespace"], - LogMessage: stringMap["LogMessage"], - LogSource: stringMap["LogSource"], + TimeGenerated: stringMap["TimeGenerated"], + Computer: stringMap["Computer"], + ContainerId: stringMap["ContainerId"], + ContainerName: stringMap["ContainerName"], + PodName: stringMap["PodName"], + PodNamespace: stringMap["PodNamespace"], + LogMessage: stringMap["LogMessage"], + LogSource: stringMap["LogSource"], } //ODS-v2 schema dataItemsLAv2 = append(dataItemsLAv2, dataItemLAv2) @@ -1211,10 +1209,10 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { Image: stringMap["Image"], Name: stringMap["Name"], } - //ODS-v1 schema - dataItemsLAv1 = append(dataItemsLAv1, dataItemLAv1) - name = stringMap["Name"] - id = stringMap["Id"] + //ODS-v1 schema + dataItemsLAv1 = append(dataItemsLAv1, dataItemLAv1) + name = stringMap["Name"] + id = stringMap["Id"] } } @@ -1364,18 +1362,18 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { numContainerLogRecords = len(dataItemsADX) Log("Success::ADX::Successfully wrote %d container log records to ADX in %s", numContainerLogRecords, elapsed) - } else if ((ContainerLogSchemaV2 == true && len(dataItemsLAv2) > 0) || len(dataItemsLAv1) > 0) { //ODS + } else if (ContainerLogSchemaV2 == true && len(dataItemsLAv2) > 0) || len(dataItemsLAv1) > 0 { //ODS var logEntry interface{} recordType := "" loglinesCount := 0 //schema v2 - if (len(dataItemsLAv2) > 0 && ContainerLogSchemaV2 == true) { + if len(dataItemsLAv2) > 0 && ContainerLogSchemaV2 == true { logEntry = ContainerLogBlobLAv2{ DataType: ContainerLogV2DataType, IPName: IPName, DataItems: dataItemsLAv2} - loglinesCount = len(dataItemsLAv2) - recordType = "ContainerLogV2" + loglinesCount = len(dataItemsLAv2) + recordType = "ContainerLogV2" } else { //schema v1 if len(dataItemsLAv1) > 0 { @@ -1383,8 +1381,8 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { DataType: ContainerLogDataType, IPName: IPName, DataItems: dataItemsLAv1} - loglinesCount = len(dataItemsLAv1) - recordType = "ContainerLog" + loglinesCount = len(dataItemsLAv1) + recordType = "ContainerLog" } } @@ -1416,7 +1414,7 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { return output.FLB_RETRY } // add authorization header to the req - req.Header.Set("Authorization", "Bearer "+ingestionAuthToken) + req.Header.Set("Authorization", "Bearer "+ingestionAuthToken) } resp, err := HTTPClient.Do(req) @@ -1444,7 +1442,7 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { numContainerLogRecords = loglinesCount Log("PostDataHelper::Info::Successfully flushed %d %s records to ODS in %s", numContainerLogRecords, recordType, 
elapsed) - } + } ContainerLogTelemetryMutex.Lock() defer ContainerLogTelemetryMutex.Unlock() @@ -1558,7 +1556,7 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { Log("Container Type %s", ContainerType) osType := os.Getenv("OS_TYPE") - IsWindows = false + IsWindows = false // Linux if strings.Compare(strings.ToLower(osType), "windows") != 0 { Log("Reading configuration for Linux from %s", pluginConfPath) @@ -1703,7 +1701,7 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { ContainerLogsRouteADX = false if strings.Compare(ContainerLogsRoute, ContainerLogsADXRoute) == 0 { - // Try to read the ADX database name from environment variables. Default to DefaultAdsDatabaseName if not set. + // Try to read the ADX database name from environment variables. Default to DefaultAdsDatabaseName if not set. // This SHOULD be set by tomlparser.rb so it's a highly unexpected event if it isn't. // It should be set by the logic in tomlparser.rb EVEN if ADX logging isn't enabled AdxDatabaseName := strings.TrimSpace(os.Getenv("AZMON_ADX_DATABASE_NAME")) @@ -1747,10 +1745,7 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { fmt.Fprintf(os.Stdout, "Routing container logs thru %s route...\n", ContainerLogsADXRoute) } } else if strings.Compare(strings.ToLower(osType), "windows") != 0 { //for linux, oneagent will be default route - ContainerLogsRouteV2 = true //default is mdsd route - if strings.Compare(ContainerLogsRoute, ContainerLogsV1Route) == 0 { - ContainerLogsRouteV2 = false //fallback option when hiddensetting set - } + ContainerLogsRouteV2 = true //default is mdsd route Log("Routing container logs thru %s route...", ContainerLogsRoute) fmt.Fprintf(os.Stdout, "Routing container logs thru %s route... \n", ContainerLogsRoute) } @@ -1768,14 +1763,14 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { Log("Creating MDSD clients for KubeMonAgentEvents & InsightsMetrics") CreateMDSDClient(KubeMonAgentEvents, ContainerType) CreateMDSDClient(InsightsMetrics, ContainerType) - } + } ContainerLogSchemaVersion := strings.TrimSpace(strings.ToLower(os.Getenv("AZMON_CONTAINER_LOG_SCHEMA_VERSION"))) Log("AZMON_CONTAINER_LOG_SCHEMA_VERSION:%s", ContainerLogSchemaVersion) - ContainerLogSchemaV2 = false //default is v1 schema + ContainerLogSchemaV2 = false //default is v1 schema - if strings.Compare(ContainerLogSchemaVersion, ContainerLogV2SchemaVersion) == 0 && ContainerLogsRouteADX != true { + if strings.Compare(ContainerLogSchemaVersion, ContainerLogV2SchemaVersion) == 0 && ContainerLogsRouteADX != true { ContainerLogSchemaV2 = true Log("Container logs schema=%s", ContainerLogV2SchemaVersion) fmt.Fprintf(os.Stdout, "Container logs schema=%s... 
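The AAD MSI auth branch above amounts to attaching the cached ingestion token as a Bearer header on the ODS POST. A self-contained Go sketch of just that step; the URL and token values are placeholders, not the agent's real endpoint:

package main

import (
	"bytes"
	"log"
	"net/http"
)

// postWithToken builds the ODS-style request and sets the Authorization
// header the way the patch does.
func postWithToken(client *http.Client, url, token string, body []byte) (*http.Response, error) {
	req, err := http.NewRequest("POST", url, bytes.NewBuffer(body))
	if err != nil {
		return nil, err
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Authorization", "Bearer "+token) // token refreshed elsewhere on a ticker
	return client.Do(req)
}

func main() {
	resp, err := postWithToken(http.DefaultClient, "https://example.ods.opinsights.azure.com/api/logs", "<token>", []byte(`{}`))
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()
	log.Println(resp.Status)
}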
\n", ContainerLogV2SchemaVersion) @@ -1801,15 +1796,15 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { if ContainerLogSchemaV2 == true { MdsdContainerLogTagName = MdsdContainerLogV2SourceName } else { - MdsdContainerLogTagName = MdsdContainerLogSourceName - } + MdsdContainerLogTagName = MdsdContainerLogSourceName + } MdsdInsightsMetricsTagName = MdsdInsightsMetricsSourceName - MdsdKubeMonAgentEventsTagName = MdsdKubeMonAgentEventsSourceName + MdsdKubeMonAgentEventsTagName = MdsdKubeMonAgentEventsSourceName Log("ContainerLogsRouteADX: %v, IsWindows: %v, IsAADMSIAuthMode = %v \n", ContainerLogsRouteADX, IsWindows, IsAADMSIAuthMode) if !ContainerLogsRouteADX && IsWindows && IsAADMSIAuthMode { Log("defaultIngestionAuthTokenRefreshIntervalSeconds = %d \n", defaultIngestionAuthTokenRefreshIntervalSeconds) - IngestionAuthTokenRefreshTicker = time.NewTicker(time.Second * time.Duration(defaultIngestionAuthTokenRefreshIntervalSeconds)) + IngestionAuthTokenRefreshTicker = time.NewTicker(time.Second * time.Duration(defaultIngestionAuthTokenRefreshIntervalSeconds)) go refreshIngestionAuthToken() } } From 2726d01655055de56ce1c25fc8ece671427e1a4b Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Thu, 27 Jan 2022 18:40:11 -0800 Subject: [PATCH 188/194] collect telemetry containerlog records with emptystamp (#703) * collect telemetry containerlog records with emptystamp * collect telemetry containerlog records with emptystamp --- source/plugins/go/src/oms.go | 4 ++++ source/plugins/go/src/telemetry.go | 8 ++++++++ 2 files changed, 12 insertions(+) diff --git a/source/plugins/go/src/oms.go b/source/plugins/go/src/oms.go index 8c7695346..fbee1dd75 100644 --- a/source/plugins/go/src/oms.go +++ b/source/plugins/go/src/oms.go @@ -1229,6 +1229,10 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { maxLatencyContainer = name + "=" + id } } + } else { + ContainerLogTelemetryMutex.Lock() + ContainerLogRecordCountWithEmptyTimeStamp += 1 + ContainerLogTelemetryMutex.Unlock() } } diff --git a/source/plugins/go/src/telemetry.go b/source/plugins/go/src/telemetry.go index 31818dbb3..b344f4ac8 100644 --- a/source/plugins/go/src/telemetry.go +++ b/source/plugins/go/src/telemetry.go @@ -50,6 +50,8 @@ var ( ContainerLogsSendErrorsToADXFromFluent float64 //Tracks the number of ADX client create errors for containerlogs (uses ContainerLogTelemetryTicker) ContainerLogsADXClientCreateErrors float64 + //Tracks the number of container log records with empty Timestamp (uses ContainerLogTelemetryTicker) + ContainerLogRecordCountWithEmptyTimeStamp float64 //Tracks the number of OSM namespaces and sent only from prometheus sidecar (uses ContainerLogTelemetryTicker) OSMNamespaceCount int //Tracks whether monitor kubernetes pods is set to true and sent only from prometheus sidecar (uses ContainerLogTelemetryTicker) @@ -82,6 +84,7 @@ const ( metricNameErrorCountKubeMonEventsMDSDClientCreateError = "KubeMonEventsMDSDClientCreateErrorsCount" metricNameErrorCountContainerLogsSendErrorsToADXFromFluent = "ContainerLogs2ADXSendErrorCount" metricNameErrorCountContainerLogsADXClientCreateError = "ContainerLogsADXClientCreateErrorCount" + metricNameContainerLogRecordCountWithEmptyTimeStamp = "ContainerLogRecordCountWithEmptyTimeStamp" defaultTelemetryPushIntervalSeconds = 300 @@ -125,6 +128,7 @@ func SendContainerLogPluginMetrics(telemetryPushIntervalProperty string) { promMonitorPodsNamespaceLength := PromMonitorPodsNamespaceLength promMonitorPodsLabelSelectorLength := 
PromMonitorPodsLabelSelectorLength promMonitorPodsFieldSelectorLength := PromMonitorPodsFieldSelectorLength + containerLogRecordCountWithEmptyTimeStamp := ContainerLogRecordCountWithEmptyTimeStamp TelegrafMetricsSentCount = 0.0 TelegrafMetricsSendErrorCount = 0.0 @@ -142,6 +146,7 @@ func SendContainerLogPluginMetrics(telemetryPushIntervalProperty string) { ContainerLogsADXClientCreateErrors = 0.0 InsightsMetricsMDSDClientCreateErrors = 0.0 KubeMonEventsMDSDClientCreateErrors = 0.0 + ContainerLogRecordCountWithEmptyTimeStamp = 0.0 ContainerLogTelemetryMutex.Unlock() if strings.Compare(strings.ToLower(os.Getenv("CONTROLLER_TYPE")), "daemonset") == 0 { @@ -222,6 +227,9 @@ func SendContainerLogPluginMetrics(telemetryPushIntervalProperty string) { if kubeMonEventsMDSDClientCreateErrors > 0.0 { TelemetryClient.Track(appinsights.NewMetricTelemetry(metricNameErrorCountKubeMonEventsMDSDClientCreateError, kubeMonEventsMDSDClientCreateErrors)) } + if ContainerLogRecordCountWithEmptyTimeStamp > 0.0 { + TelemetryClient.Track(appinsights.NewMetricTelemetry(metricNameContainerLogRecordCountWithEmptyTimeStamp, containerLogRecordCountWithEmptyTimeStamp)) + } start = time.Now() } From 28599b36376abbb69d11ace0689577280a2a923d Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Fri, 28 Jan 2022 13:59:31 -0800 Subject: [PATCH 189/194] Fixing telegraf bug for placeholder name (#706) --- kubernetes/windows/main.ps1 | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/kubernetes/windows/main.ps1 b/kubernetes/windows/main.ps1 index 733ddb408..f5fab4edd 100644 --- a/kubernetes/windows/main.ps1 +++ b/kubernetes/windows/main.ps1 @@ -52,23 +52,29 @@ function Set-EnvironmentVariables { if ($domain -eq "opinsights.azure.com") { $cloud_environment = "azurepubliccloud" $mcs_endpoint = "monitor.azure.com" - } elseif ($domain -eq "opinsights.azure.cn") { + } + elseif ($domain -eq "opinsights.azure.cn") { $cloud_environment = "azurechinacloud" $mcs_endpoint = "monitor.azure.cn" - } elseif ($domain -eq "opinsights.azure.us") { + } + elseif ($domain -eq "opinsights.azure.us") { $cloud_environment = "azureusgovernmentcloud" $mcs_endpoint = "monitor.azure.us" - } elseif ($domain -eq "opinsights.azure.eaglex.ic.gov") { + } + elseif ($domain -eq "opinsights.azure.eaglex.ic.gov") { $cloud_environment = "usnat" $mcs_endpoint = "monitor.azure.eaglex.ic.gov" - } elseif ($domain -eq "opinsights.azure.microsoft.scloud") { + } + elseif ($domain -eq "opinsights.azure.microsoft.scloud") { $cloud_environment = "ussec" $mcs_endpoint = "monitor.azure.microsoft.scloud" - } else { + } + else { Write-Host "Invalid or Unsupported domain name $($domain). EXITING....." exit 1 } - } else { + } + else { Write-Host "Domain name either null or empty. EXITING....." 
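One detail worth flagging in the telemetry.go hunk above: the new Track guard tests the shared ContainerLogRecordCountWithEmptyTimeStamp counter, which has just been zeroed under the lock, whereas the neighboring metrics test their local snapshots, so the new metric will rarely fire. The snapshot-and-reset pattern the rest of the file follows, in a runnable Go sketch:

package main

import (
	"fmt"
	"sync"
)

// Writers increment under the mutex; the periodic sender snapshots and
// zeroes the counter in one critical section, then reports the snapshot.
var (
	telemetryMutex            sync.Mutex
	recordsWithEmptyTimestamp float64
)

func recordEmptyTimestamp() {
	telemetryMutex.Lock()
	recordsWithEmptyTimestamp += 1
	telemetryMutex.Unlock()
}

func snapshotAndReset() float64 {
	telemetryMutex.Lock()
	defer telemetryMutex.Unlock()
	snapshot := recordsWithEmptyTimestamp
	recordsWithEmptyTimestamp = 0.0
	return snapshot // report this value, not the (now zeroed) global
}

func main() {
	recordEmptyTimestamp()
	recordEmptyTimestamp()
	fmt.Println(snapshotAndReset()) // 2
	fmt.Println(snapshotAndReset()) // 0
}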
exit 1 } @@ -490,6 +496,11 @@ function Start-Telegraf { Write-Host "Failed to set environment variable NODE_IP for target 'machine' since it is either null or empty" } + $hostName = [System.Environment]::GetEnvironmentVariable("HOSTNAME", "process") + Write-Host "nodename: $($hostName)" + Write-Host "replacing nodename in telegraf config" + (Get-Content "C:\etc\telegraf\telegraf.conf").replace('placeholder_hostname', $hostName) | Set-Content "C:\etc\telegraf\telegraf.conf" + Write-Host "Installing telegraf service" C:\opt\telegraf\telegraf.exe --service install --config "C:\etc\telegraf\telegraf.conf" @@ -589,14 +600,16 @@ if (![string]::IsNullOrEmpty($requiresCertBootstrap) -and ` $isAADMSIAuth = [System.Environment]::GetEnvironmentVariable("USING_AAD_MSI_AUTH") if (![string]::IsNullOrEmpty($isAADMSIAuth) -and $isAADMSIAuth.ToLower() -eq 'true') { Write-Host "skipping agent onboarding via cert since AAD MSI Auth configured" -} else { +} +else { Generate-Certificates Test-CertificatePath } + Start-Fluent-Telegraf # List all powershell processes running. This should have main.ps1 and filesystemwatcher.ps1 Get-WmiObject Win32_process | Where-Object { $_.Name -match 'powershell' } | Format-Table -Property Name, CommandLine, ProcessId #check if fluentd service is running -Get-Service fluentdwinaks +Get-Service fluentdwinaks \ No newline at end of file From 7452ee2767dabc54b7b3a787a3127b94a14fd0ae Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Fri, 28 Jan 2022 14:19:09 -0800 Subject: [PATCH 190/194] Gangams/jan 2022 release tasks 3 (#702) * add telemetry related to windows containers records * add telemetry related to windows containers records * containercount telemetry * add explicit exit code in ps scripts * node count telemetry * telemetry for win cirecord 64KB or more * metric to track wintelegraf metrics with tags 64kb * metric to track wintelegraf metrics with tags 64kb * fix pr feedback --- build/windows/Makefile.ps1 | 20 ++-- .../build-and-publish-dev-docker-image.ps1 | 8 +- .../build-and-publish-docker-image.ps1 | 8 +- .../dockerbuild/build-dev-base-image.ps1 | 6 +- .../windows/install-build-pre-requisites.ps1 | 16 ++-- .../aks/mdmonboarding/mdm_onboarding.ps1 | 22 ++--- .../mdmonboarding/mdm_onboarding_atscale.ps1 | 22 ++--- .../kubernetes/AddMonitoringWorkspaceTags.ps1 | 28 +++--- .../onboarding/managed/disable-monitoring.ps1 | 38 ++++---- .../onboarding/managed/enable-monitoring.ps1 | 40 ++++---- scripts/troubleshoot/TroubleshootError.ps1 | 86 ++++++++--------- .../TroubleshootError_AcsEngine.ps1 | 54 +++++------ .../TroubleshootError_nonAzureK8s.ps1 | 68 ++++++------- source/plugins/go/src/oms.go | 17 ++-- source/plugins/go/src/telemetry.go | 8 ++ source/plugins/ruby/constants.rb | 12 ++- source/plugins/ruby/in_cadvisor_perf.rb | 12 +-- source/plugins/ruby/in_kube_nodes.rb | 38 ++++---- source/plugins/ruby/in_kube_podinventory.rb | 96 +++++++++++++------ 19 files changed, 330 insertions(+), 269 deletions(-) diff --git a/build/windows/Makefile.ps1 b/build/windows/Makefile.ps1 index 9f3c438b0..52abbb071 100644 --- a/build/windows/Makefile.ps1 +++ b/build/windows/Makefile.ps1 @@ -13,21 +13,21 @@ Write-Host("current script dir : " + $currentdir + " ") if ($false -eq (Test-Path -Path $currentdir)) { Write-Host("Invalid current dir : " + $currentdir + " ") -ForegroundColor Red - exit + exit 1 } $builddir = Split-Path -Path $currentdir Write-Host("builddir dir : " + $builddir + " ") if ($false -eq (Test-Path -Path $builddir)) { Write-Host("Invalid build dir : " + $builddir + " 
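The telegraf fix in the patch above comes down to a literal token substitution in the rendered config before the service is installed. The same step isolated as a PowerShell sketch, with the path and placeholder token mirroring the patch:

# Read the rendered config, swap the literal placeholder for the node's
# hostname, and write it back before installing the telegraf service.
$hostName = [System.Environment]::GetEnvironmentVariable("HOSTNAME", "process")
if (![string]::IsNullOrEmpty($hostName)) {
    $confPath = "C:\etc\telegraf\telegraf.conf"
    (Get-Content $confPath).Replace('placeholder_hostname', $hostName) | Set-Content $confPath
}
else {
    Write-Host "HOSTNAME is not set; leaving telegraf.conf unchanged"
}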
") -ForegroundColor Red - exit + exit 1 } $versionFilePath = Join-Path -Path $builddir -child "version" Write-Host("versionFilePath : " + $versionFilePath + " ") if ($false -eq (Test-Path -Path $versionFilePath)) { Write-Host("Version file path incorrect or doesnt exist : " + $versionFilePath + " ") -ForegroundColor Red - exit + exit 1 } # read the version info @@ -36,7 +36,7 @@ foreach($line in Get-Content -Path $versionFilePath) { $parts = $line.split("=") if ($parts.length -lt 2 ) { Write-Host("Invalid content in version file : " + $versionFilePath + " ") -ForegroundColor Red - exit + exit 1 } switch ($parts[0]) { "CONTAINER_BUILDVERSION_MAJOR" { $BuildVersionMajor = $parts[1] } @@ -57,7 +57,7 @@ if ([string]::IsNullOrEmpty($BuildVersionMajor) -or [string]::IsNullOrEmpty($BuildVersionDate) -or [string]::IsNullOrEmpty($BuildVersionStatus)) { Write-Host("Expected version info doesnt exist in this version file : " + $versionFilePath + " ") -ForegroundColor Red - exit + exit 1 } # build version format will be [major].[minior].[patch]-[revision] $buildVersionString = $BuildVersionMajor + "." + $BuildVersionMinor + "." + $BuildVersionPatch + "-" + $BuildVersionBuildNR @@ -68,7 +68,7 @@ $certsrcdir = Join-Path -Path $builddir -ChildPath "windows\installer\certificat Write-Host("certsrc dir : " + $certsrcdir + " ") if ($false -eq (Test-Path -Path $certsrcdir)) { Write-Host("Invalid certificate generator source dir : " + $certsrcdir + " ") -ForegroundColor Red - exit + exit 1 } Write-Host("set the cerificate generator source code directory : " + $certsrcdir + " ...") Set-Location -Path $certsrcdir @@ -100,13 +100,13 @@ Write-Host("Successfully published certificate generator code binaries") -Foregr $certreleasebinpath = Join-Path -PATH $certsrcdir -ChildPath "bin\Release\$dotnetcoreframework\win10-x64\publish\*.*" if ($false -eq (Test-Path -Path $certreleasebinpath)) { Write-Host("certificate release bin path doesnt exist : " + $certreleasebinpath + " ") -ForegroundColor Red - exit + exit 1 } $rootdir = Split-Path -Path $builddir if ($false -eq (Test-Path -Path $rootdir)) { Write-Host("Invalid docker provider root source dir : " + $rootdir + " ") -ForegroundColor Red - exit + exit 1 } $publishdir = Join-Path -Path $rootdir -ChildPath "kubernetes\windows\omsagentwindows" @@ -128,7 +128,7 @@ $outomsgoplugindir = Join-Path -Path $rootdir -ChildPath "source\plugins\go\src" Write-Host("Building Out_OMS go plugin code...") if ($false -eq (Test-Path -Path $outomsgoplugindir)) { Write-Host("Invalid Out oms go plugin code dir : " + $outomsgoplugindir + " ") -ForegroundColor Red - exit + exit 1 } Set-Location -Path $outomsgoplugindir @@ -178,7 +178,7 @@ if (Test-Path -Path $livenessprobeexepath){ Write-Host("livenessprobe.exe exists which indicates cpp build step succeeded") -ForegroundColor Green } else { Write-Host("livenessprobe.exe doesnt exist which indicates cpp build step failed") -ForegroundColor Red - exit + exit 1 } $installerdir = Join-Path -Path $builddir -ChildPath "common\installer" diff --git a/kubernetes/windows/dockerbuild/build-and-publish-dev-docker-image.ps1 b/kubernetes/windows/dockerbuild/build-and-publish-dev-docker-image.ps1 index 0fde7f379..b87132218 100644 --- a/kubernetes/windows/dockerbuild/build-and-publish-dev-docker-image.ps1 +++ b/kubernetes/windows/dockerbuild/build-and-publish-dev-docker-image.ps1 @@ -15,18 +15,18 @@ Write-Host("current script dir : " + $currentdir + " ") if ($false -eq (Test-Path -Path $currentdir)) { Write-Host("Invalid current dir : " + $currentdir + " 
") -ForegroundColor Red - exit + exit 1 } if ([string]::IsNullOrEmpty($image)) { Write-Host "Image parameter shouldnt be null or empty" -ForegroundColor Red - exit + exit 1 } $imageparts = $image.split(":") if (($imageparts.Length -ne 2)){ Write-Host "Image not in valid format. Expected format should be /:" -ForegroundColor Red - exit + exit 1 } $imagetag = $imageparts[1].ToLower() @@ -48,7 +48,7 @@ $dockerFileDir = Split-Path -Path $currentdir Write-Host("builddir dir : " + $dockerFileDir + " ") if ($false -eq (Test-Path -Path $dockerFileDir)) { Write-Host("Invalid dockerFile Dir : " + $dockerFileDir + " ") -ForegroundColor Red - exit + exit 1 } Write-Host "changing directory to DockerFile dir: $dockerFileDir" diff --git a/kubernetes/windows/dockerbuild/build-and-publish-docker-image.ps1 b/kubernetes/windows/dockerbuild/build-and-publish-docker-image.ps1 index dbcfa6097..c1f655882 100644 --- a/kubernetes/windows/dockerbuild/build-and-publish-docker-image.ps1 +++ b/kubernetes/windows/dockerbuild/build-and-publish-docker-image.ps1 @@ -15,18 +15,18 @@ Write-Host("current script dir : " + $currentdir + " ") if ($false -eq (Test-Path -Path $currentdir)) { Write-Host("Invalid current dir : " + $currentdir + " ") -ForegroundColor Red - exit + exit 1 } if ([string]::IsNullOrEmpty($image)) { Write-Host "Image parameter shouldnt be null or empty" -ForegroundColor Red - exit + exit 1 } $imageparts = $image.split(":") if (($imageparts.Length -ne 2)){ Write-Host "Image not in valid format. Expected format should be /:" -ForegroundColor Red - exit + exit 1 } $imagetag = $imageparts[1].ToLower() @@ -48,7 +48,7 @@ $dockerFileDir = Split-Path -Path $currentdir Write-Host("builddir dir : " + $dockerFileDir + " ") if ($false -eq (Test-Path -Path $dockerFileDir)) { Write-Host("Invalid dockerFile Dir : " + $dockerFileDir + " ") -ForegroundColor Red - exit + exit 1 } Write-Host "changing directory to DockerFile dir: $dockerFileDir" diff --git a/kubernetes/windows/dockerbuild/build-dev-base-image.ps1 b/kubernetes/windows/dockerbuild/build-dev-base-image.ps1 index 142e20c3f..4b17239d2 100644 --- a/kubernetes/windows/dockerbuild/build-dev-base-image.ps1 +++ b/kubernetes/windows/dockerbuild/build-dev-base-image.ps1 @@ -1,6 +1,6 @@ <# .DESCRIPTION - Builds the Docker Image locally for the server core ltsc base and installs dependencies + Builds the Docker Image locally for the server core ltsc base and installs dependencies #> @@ -9,7 +9,7 @@ Write-Host("current script dir : " + $currentdir + " ") if ($false -eq (Test-Path -Path $currentdir)) { Write-Host("Invalid current dir : " + $currentdir + " ") -ForegroundColor Red - exit + exit 1 } Write-Host "start:Building the cert generator and out oms code via Makefile.ps1" @@ -20,7 +20,7 @@ $dockerFileDir = Split-Path -Path $currentdir Write-Host("builddir dir : " + $dockerFileDir + " ") if ($false -eq (Test-Path -Path $dockerFileDir)) { Write-Host("Invalid dockerFile Dir : " + $dockerFileDir + " ") -ForegroundColor Red - exit + exit 1 } Write-Host "changing directory to DockerFile dir: $dockerFileDir" diff --git a/scripts/build/windows/install-build-pre-requisites.ps1 b/scripts/build/windows/install-build-pre-requisites.ps1 index 7f1c9b54f..1ea316798 100755 --- a/scripts/build/windows/install-build-pre-requisites.ps1 +++ b/scripts/build/windows/install-build-pre-requisites.ps1 @@ -2,7 +2,7 @@ function Install-Go { $tempDir = $env:TEMP if ($false -eq (Test-Path -Path $tempDir)) { Write-Host("Invalid TEMP dir PATH : " + $tempDir + " ") -ForegroundColor Red - exit + exit 
1 } $tempGo = Join-Path -Path $tempDir -ChildPath "gotemp" @@ -10,7 +10,7 @@ function Install-Go { New-Item -Path $tempGo -ItemType "directory" -Force -ErrorAction Stop if ($false -eq (Test-Path -Path $tempGo)) { Write-Host("Invalid tempGo : " + $tempGo + " ") -ForegroundColor Red - exit + exit 1 } $url = "https://dl.google.com/go/go1.15.14.windows-amd64.msi" @@ -35,7 +35,7 @@ function Build-Dependencies { $tempDir = $env:TEMP if ($false -eq (Test-Path -Path $tempDir)) { Write-Host("Invalid TEMP dir PATH : " + $tempDir + " ") -ForegroundColor Red - exit + exit 1 } $tempDependencies = Join-Path -Path $tempDir -ChildPath "gcctemp" @@ -43,7 +43,7 @@ function Build-Dependencies { New-Item -Path $tempDependencies -ItemType "directory" -Force -ErrorAction Stop if ($false -eq (Test-Path -Path $tempDependencies)) { Write-Host("Invalid temp Dir : " + $tempDependencies + " ") -ForegroundColor Red - exit + exit 1 } @@ -82,7 +82,7 @@ function Install-DotNetCoreSDK() { $tempDir = $env:TEMP if ($false -eq (Test-Path -Path $tempDir)) { Write-Host("Invalid TEMP dir : " + $tempDir + " ") -ForegroundColor Red - exit + exit 1 } $dotNetSdkTemp = Join-Path -Path $tempDir -ChildPath "dotNetSdk" @@ -90,7 +90,7 @@ function Install-DotNetCoreSDK() { New-Item -Path $dotNetSdkTemp -ItemType "directory" -Force -ErrorAction Stop if ($false -eq (Test-Path -Path $dotNetSdkTemp)) { Write-Host("Invalid dotNetSdkTemp : " + $tempDir + " ") -ForegroundColor Red - exit + exit 1 } $url = "https://download.visualstudio.microsoft.com/download/pr/4e88f517-196e-4b17-a40c-2692c689661d/eed3f5fca28262f764d8b650585a7278/dotnet-sdk-3.1.301-win-x64.exe" @@ -110,7 +110,7 @@ function Install-Docker() { $tempDir = $env:TEMP if ($false -eq (Test-Path -Path $tempDir)) { Write-Host("Invalid TEMP dir PATH : " + $tempDir + " ") -ForegroundColor Red - exit + exit 1 } $dockerTemp = Join-Path -Path $tempDir -ChildPath "docker" @@ -118,7 +118,7 @@ function Install-Docker() { New-Item -Path $dockerTemp -ItemType "directory" -Force -ErrorAction Stop if ($false -eq (Test-Path -Path $dockerTemp)) { Write-Host("Invalid dockerTemp : " + $tempDir + " ") -ForegroundColor Red - exit + exit 1 } $url = "https://download.docker.com/win/stable/Docker%20Desktop%20Installer.exe" diff --git a/scripts/onboarding/aks/mdmonboarding/mdm_onboarding.ps1 b/scripts/onboarding/aks/mdmonboarding/mdm_onboarding.ps1 index dcf73f098..a5d95c31e 100644 --- a/scripts/onboarding/aks/mdmonboarding/mdm_onboarding.ps1 +++ b/scripts/onboarding/aks/mdmonboarding/mdm_onboarding.ps1 @@ -39,7 +39,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azAksModule) -or ($null -eq $az else { Write-Host("Please run the script as an administrator") -ForegroundColor Red Stop-Transcript - exit + exit 1 } @@ -66,7 +66,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azAksModule) -or ($null -eq $az } catch { Write-Host("Close other powershell logins and try installing the latest modules forAz.Accounts in a new powershell window: eg. 'Install-Module Az.Accounts -Repository PSGallery -Force'") -ForegroundColor Red - exit + exit 1 } } @@ -77,7 +77,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azAksModule) -or ($null -eq $az } catch { Write-Host("Close other powershell logins and try installing the latest modules forAz.Accounts in a new powershell window: eg. 
'Install-Module Az.Accounts -Repository PSGallery -Force'") -ForegroundColor Red - exit + exit 1 } } @@ -88,7 +88,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azAksModule) -or ($null -eq $az } catch { Write-Host("Close other powershell logins and try installing the latest modules for Az.Aks in a new powershell window: eg. 'Install-Module Az.Aks -Repository PSGallery -Force'") -ForegroundColor Red - exit + exit 1 } } @@ -103,7 +103,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azAksModule) -or ($null -eq $az Write-Host("Could not import Az.Resources...") -ForegroundColor Red Write-Host("Close other powershell logins and try installing the latest modules for Az.Resources in a new powershell window: eg. 'Install-Module Az.Resources -Repository PSGallery -Force'") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } if ($null -eq $azAccountModule) { @@ -114,7 +114,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azAksModule) -or ($null -eq $az Write-Host("Could not import Az.Accounts...") -ForegroundColor Red Write-Host("Close other powershell logins and try installing the latest modules for Az.Accounts in a new powershell window: eg. 'Install-Module Az.Accounts -Repository PSGallery -Force'") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } if ($null -eq $azAksModule) { @@ -124,7 +124,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azAksModule) -or ($null -eq $az catch { Write-Host("Could not import Az.Aks... Please reinstall this Module") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } @@ -132,7 +132,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azAksModule) -or ($null -eq $az 2 { Write-Host("") Stop-Transcript - exit + exit 1 } } } @@ -161,7 +161,7 @@ if ($account.Account -eq $null) { Write-Host("Could not select subscription with ID : " + $SubscriptionId + ". Please make sure the ID you entered is correct and you have access to the cluster" ) -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } } else { @@ -181,7 +181,7 @@ else { Write-Host("Could not select subscription with ID : " + $SubscriptionId + ". Please make sure the ID you entered is correct and you have access to the cluster" ) -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } } } @@ -196,7 +196,7 @@ if ($notPresent) { Write-Host("Could not find Aks cluster. Please make sure that specified cluster exists: '" + $clusterName + "'is correct and you have access to the cluster") -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } Write-Host("Successfully checked specified cluster exists details...") -ForegroundColor Green diff --git a/scripts/onboarding/aks/mdmonboarding/mdm_onboarding_atscale.ps1 b/scripts/onboarding/aks/mdmonboarding/mdm_onboarding_atscale.ps1 index a791bb18e..32311ca61 100644 --- a/scripts/onboarding/aks/mdmonboarding/mdm_onboarding_atscale.ps1 +++ b/scripts/onboarding/aks/mdmonboarding/mdm_onboarding_atscale.ps1 @@ -30,7 +30,7 @@ if (($null -eq $azAccountModule) -or ( $null -eq $azAksModule ) -or ($null -eq $ else { Write-Host("Please run the script as an administrator") -ForegroundColor Red Stop-Transcript - exit + exit 1 } @@ -57,7 +57,7 @@ if (($null -eq $azAccountModule) -or ( $null -eq $azAksModule ) -or ($null -eq $ } catch { Write-Host("Close other powershell logins and try installing the latest modules forAz.Accounts in a new powershell window: eg. 
'Install-Module Az.Accounts -Repository PSGallery -Force'") -ForegroundColor Red - exit + exit 1 } } @@ -68,7 +68,7 @@ if (($null -eq $azAccountModule) -or ( $null -eq $azAksModule ) -or ($null -eq $ } catch { Write-Host("Close other powershell logins and try installing the latest modules forAz.Accounts in a new powershell window: eg. 'Install-Module Az.Accounts -Repository PSGallery -Force'") -ForegroundColor Red - exit + exit 1 } } @@ -79,7 +79,7 @@ if (($null -eq $azAccountModule) -or ( $null -eq $azAksModule ) -or ($null -eq $ } catch { Write-Host("Close other powershell logins and try installing the latest modules for Az.Aks in a new powershell window: eg. 'Install-Module Az.Aks -Repository PSGallery -Force'") -ForegroundColor Red - exit + exit 1 } } @@ -94,7 +94,7 @@ if (($null -eq $azAccountModule) -or ( $null -eq $azAksModule ) -or ($null -eq $ Write-Host("Could not import Az.Resources...") -ForegroundColor Red Write-Host("Close other powershell logins and try installing the latest modules for Az.Resources in a new powershell window: eg. 'Install-Module Az.Resources -Repository PSGallery -Force'") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } if ($null -eq $azAccountModule) { @@ -105,7 +105,7 @@ if (($null -eq $azAccountModule) -or ( $null -eq $azAksModule ) -or ($null -eq $ Write-Host("Could not import Az.Accounts...") -ForegroundColor Red Write-Host("Close other powershell logins and try installing the latest modules for Az.Accounts in a new powershell window: eg. 'Install-Module Az.Accounts -Repository PSGallery -Force'") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } if ($null -eq $azAksModule) { @@ -115,7 +115,7 @@ if (($null -eq $azAccountModule) -or ( $null -eq $azAksModule ) -or ($null -eq $ catch { Write-Host("Could not import Az.Aks... Please reinstall this Module") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } @@ -123,7 +123,7 @@ if (($null -eq $azAccountModule) -or ( $null -eq $azAksModule ) -or ($null -eq $ 2 { Write-Host("") Stop-Transcript - exit + exit 1 } } } @@ -152,7 +152,7 @@ if ($account.Account -eq $null) { Write-Host("Could not select subscription with ID : " + $SubscriptionId + ". Please make sure the ID you entered is correct and you have access to the cluster" ) -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } } else { @@ -172,7 +172,7 @@ else { Write-Host("Could not select subscription with ID : " + $SubscriptionId + ". Please make sure the ID you entered is correct and you have access to the cluster" ) -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } } } @@ -187,7 +187,7 @@ if ($notPresent) { Write-Host("Failed to get Aks clusters in specified subscription. 
Please make sure that you have access to the existing clusters") -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } Write-Host("Successfully got all aks clusters ...") -ForegroundColor Green diff --git a/scripts/onboarding/aksengine/kubernetes/AddMonitoringWorkspaceTags.ps1 b/scripts/onboarding/aksengine/kubernetes/AddMonitoringWorkspaceTags.ps1 index 29f629878..a0965f960 100644 --- a/scripts/onboarding/aksengine/kubernetes/AddMonitoringWorkspaceTags.ps1 +++ b/scripts/onboarding/aksengine/kubernetes/AddMonitoringWorkspaceTags.ps1 @@ -64,7 +64,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule)) { else { Write-Host("Please run the script as an administrator") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } @@ -89,7 +89,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule)) { } catch { Write-Host("Close other powershell logins and try installing the latest modules for Az.Resources in a new powershell window: eg. 'Install-Module Az.Resources -Repository PSGallery -Force'") -ForegroundColor Red - exit + exit 1 } try { Write-Host("Installing Az.Accounts...") @@ -97,7 +97,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule)) { } catch { Write-Host("Close other powershell logins and try installing the latest modules for Az.Accounts in a new powershell window: eg. 'Install-Module Az.Accounts -Repository PSGallery -Force'") -ForegroundColor Red - exit + exit 1 } } @@ -109,7 +109,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule)) { Write-Host("Could not import Az.Resources ...") -ForegroundColor Red Write-Host("Close other powershell logins and try installing the latest modules for Az.Resources in a new powershell window: eg. 'Install-Module Az.Resources -Repository PSGallery -Force'") -ForegroundColor Red Stop-Transcript - exit + exit 1 } try { Import-Module Az.Accounts @@ -117,14 +117,14 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule)) { catch { Write-Host("Could not import Az.Accounts... Please reinstall this Module") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } 2 { Write-Host("") Stop-Transcript - exit + exit 1 } } } @@ -138,7 +138,7 @@ if ($NameoftheCloud -like "AzureCloud" -or } else { Write-Host("Error: Monitoring not supported in this cloud: $NameoftheCloud") -ForegroundColor Red - exit + exit 1 } # @@ -151,7 +151,7 @@ if ($notPresent) { Write-Host("Could not find RG. 
Please make sure that the resource group name: '" + $ResourceGroupName + "'is correct and you have access to the aks-engine cluster") -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } Write-Host("Successfully checked resource groups details...") -ForegroundColor Green @@ -179,20 +179,20 @@ foreach ($k8MasterVM in $k8sMasterVMsOrVMSSes) { } else { Write-Host("Resource group name: '" + $ResourceGroupName + "'is doesnt have the aks-engine resources") -ForegroundColor Red - exit + exit 1 } } if ($isKubernetesCluster -eq $false) { Write-Host("Resource group name: '" + $ResourceGroupName + "' doesnt have the aks-engine or acs-engine resources") -ForegroundColor Red - exit + exit 1 } # validate specified logAnalytics workspace exists or not $workspaceResource = Get-AzResource -ResourceId $LogAnalyticsWorkspaceResourceId if ($null -eq $workspaceResource) { Write-Host("Specified Log Analytics workspace ResourceId: '" + $LogAnalyticsWorkspaceResourceId + "' doesnt exist or don't have access to it") -ForegroundColor Red - exit + exit 1 } # @@ -202,11 +202,11 @@ foreach ($k8MasterVM in $k8sMasterVMsOrVMSSes) { $r = Get-AzResource -ResourceGroupName $ResourceGroupName -ResourceName $k8MasterVM.Name if ($null -eq $r) { Write-Host("Get-AzResource for Resource Group: " + $ResourceGroupName + "Resource Name :" + $k8MasterVM.Name + " failed" ) -ForegroundColor Red - exit + exit 1 } if ($null -eq $r.Tags) { Write-Host("K8s master VM should have the tags" ) -ForegroundColor Red - exit + exit 1 } if ($r.Tags.ContainsKey("logAnalyticsWorkspaceResourceId")) { $existingLogAnalyticsWorkspaceResourceId = $r.Tags["logAnalyticsWorkspaceResourceId"] @@ -225,7 +225,7 @@ foreach ($k8MasterVM in $k8sMasterVMsOrVMSSes) { $existingclusterName = $r.Tags["clusterName"] if ($existingclusterName -eq $ClusterName) { Write-Host("Ignoring attaching clusterName tag to K8s master VM :" + $k8MasterVM.Name + " since it has already with same tag value" ) -ForegroundColor Yellow - exit + exit 1 } Write-Host("K8s master VM :" + $k8MasterVM.Name + " has the existing tag for clusterName with different from specified one" ) -ForegroundColor Green $r.Tags.Remove("clusterName") diff --git a/scripts/onboarding/managed/disable-monitoring.ps1 b/scripts/onboarding/managed/disable-monitoring.ps1 index bcd135dba..8be60c50d 100644 --- a/scripts/onboarding/managed/disable-monitoring.ps1 +++ b/scripts/onboarding/managed/disable-monitoring.ps1 @@ -61,7 +61,7 @@ if ([string]::IsNullOrEmpty($azureCloudName) -eq $true) { } else { Write-Host("Specified Azure Cloud name is : $azureCloudName") Write-Host("Only supported Azure clouds are : AzureCloud and AzureUSGovernment") - exit + exit 1 } } @@ -89,7 +89,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule) -or ($null - else { Write-Host("Please re-launch the script with elevated administrator") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } @@ -116,7 +116,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule) -or ($null - } catch { Write-Host("Close other powershell logins and try installing the latest modules forAz.Accounts in a new powershell window: eg. 'Install-Module Az.Accounts -Repository PSGallery -Force'") -ForegroundColor Red - exit + exit 1 } } @@ -127,7 +127,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule) -or ($null - } catch { Write-Host("Close other powershell logins and try installing the latest modules forAz.Accounts in a new powershell window: eg. 
'Install-Module Az.Accounts -Repository PSGallery -Force'") -ForegroundColor Red - exit + exit 1 } } @@ -139,7 +139,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule) -or ($null - } catch { Write-Host("Close other powershell logins and try installing the latest modules for Az.OperationalInsights in a new powershell window: eg. 'Install-Module Az.OperationalInsights -Repository PSGallery -Force'") -ForegroundColor Red - exit + exit 1 } } @@ -154,7 +154,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule) -or ($null - Write-Host("Could not import Az.Resources...") -ForegroundColor Red Write-Host("Close other powershell logins and try installing the latest modules for Az.Resources in a new powershell window: eg. 'Install-Module Az.Resources -Repository PSGallery -Force'") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } if ($null -eq $azAccountModule) { @@ -165,7 +165,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule) -or ($null - Write-Host("Could not import Az.Accounts...") -ForegroundColor Red Write-Host("Close other powershell logins and try installing the latest modules for Az.Accounts in a new powershell window: eg. 'Install-Module Az.Accounts -Repository PSGallery -Force'") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } @@ -176,7 +176,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule) -or ($null - catch { Write-Host("Could not import Az.OperationalInsights... Please reinstall this Module") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } @@ -184,14 +184,14 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule) -or ($null - 2 { Write-Host("") Stop-Transcript - exit + exit 1 } } } if ([string]::IsNullOrEmpty($clusterResourceId)) { Write-Host("Specified Azure ClusterResourceId should not be NULL or empty") -ForegroundColor Red - exit + exit 1 } if ([string]::IsNullOrEmpty($kubeContext)) { @@ -211,7 +211,7 @@ if ($clusterResourceId.StartsWith("/") -eq $false) { if ($clusterResourceId.Split("/").Length -ne 9){ Write-Host("Provided Cluster Resource Id is not in expected format") -ForegroundColor Red - exit + exit 1 } if (($clusterResourceId.ToLower().Contains("microsoft.kubernetes/connectedclusters") -ne $true) -and @@ -219,7 +219,7 @@ if (($clusterResourceId.ToLower().Contains("microsoft.kubernetes/connectedcluste ($clusterResourceId.ToLower().Contains("microsoft.containerservice/managedclusters") -ne $true) ) { Write-Host("Provided cluster ResourceId is not supported cluster type: $clusterResourceId") -ForegroundColor Red - exit + exit 1 } if ($clusterResourceId.ToLower().Contains("microsoft.kubernetes/connectedclusters") -eq $true) { @@ -284,7 +284,7 @@ if ($null -eq $account.Account) { Write-Host("Could not select subscription with ID : " + $clusterSubscriptionId + ". Please make sure the ID you entered is correct and you have access to the cluster" ) -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } } else { @@ -304,7 +304,7 @@ else { Write-Host("Could not select subscription with ID : " + $clusterSubscriptionId + ". 
Please make sure the ID you entered is correct and you have access to the cluster" ) -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } } } @@ -314,7 +314,7 @@ Write-Host("Checking specified Azure Managed cluster resource exists and got acc $clusterResource = Get-AzResource -ResourceId $clusterResourceId if ($null -eq $clusterResource) { Write-Host("specified Azure Managed cluster resource id either you dont have access or doesnt exist") -ForegroundColor Red - exit + exit 1 } $clusterRegion = $clusterResource.Location.ToLower() @@ -323,7 +323,7 @@ if ($isArcK8sCluster -eq $true) { $clusterIdentity = $clusterResource.identity.type.ToString().ToLower() if ($clusterIdentity.Contains("systemassigned") -eq $false) { Write-Host("Identity of Azure Arc enabled Kubernetes cluster should be systemassigned but it has identity: $clusterIdentity") -ForegroundColor Red - exit + exit 1 } } @@ -345,7 +345,7 @@ try { $releases = helm list --filter $helmChartReleaseName if ($releases.Count -lt 2) { Write-Host("There is no existing release with name : $helmChartReleaseName") -ForegroundColor Yellow - exit + exit 1 } for($index =0 ; $index -lt $releases.Count ; $index ++ ) { @@ -360,7 +360,7 @@ try { $releases = helm list --filter $helmChartReleaseName --kube-context $kubeContext if ($releases.Count -lt 2) { Write-Host("There is no existing release with name : $helmChartReleaseName") -ForegroundColor Yellow - exit + exit 1 } for($index =0 ; $index -lt $releases.Count ; $index ++ ) { @@ -374,7 +374,7 @@ try { } catch { Write-Host ("Failed to delete Azure Monitor for containers HELM chart : '" + $Error[0] + "' ") -ForegroundColor Red - exit + exit 1 } Write-Host("Successfully disabled Azure Monitor for containers for cluster: $clusteResourceId") -ForegroundColor Green diff --git a/scripts/onboarding/managed/enable-monitoring.ps1 b/scripts/onboarding/managed/enable-monitoring.ps1 index e79ef2138..27bc2fd62 100644 --- a/scripts/onboarding/managed/enable-monitoring.ps1 +++ b/scripts/onboarding/managed/enable-monitoring.ps1 @@ -81,7 +81,7 @@ if ([string]::IsNullOrEmpty($azureCloudName) -eq $true) { } else { Write-Host("Specified Azure Cloud name is : $azureCloudName") Write-Host("Only supported azure clouds are : AzureCloud and AzureUSGovernment") - exit + exit 1 } } @@ -109,7 +109,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule) -or ($null - else { Write-Host("Please re-launch the script with elevated administrator") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } @@ -136,7 +136,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule) -or ($null - } catch { Write-Host("Close other powershell logins and try installing the latest modules forAz.Accounts in a new powershell window: eg. 'Install-Module Az.Accounts -Repository PSGallery -Force'") -ForegroundColor Red - exit + exit 1 } } @@ -147,7 +147,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule) -or ($null - } catch { Write-Host("Close other powershell logins and try installing the latest modules forAz.Accounts in a new powershell window: eg. 'Install-Module Az.Accounts -Repository PSGallery -Force'") -ForegroundColor Red - exit + exit 1 } } @@ -159,7 +159,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule) -or ($null - } catch { Write-Host("Close other powershell logins and try installing the latest modules for Az.OperationalInsights in a new powershell window: eg. 
'Install-Module Az.OperationalInsights -Repository PSGallery -Force'") -ForegroundColor Red - exit + exit 1 } } @@ -174,7 +174,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule) -or ($null - Write-Host("Could not import Az.Resources...") -ForegroundColor Red Write-Host("Close other powershell logins and try installing the latest modules for Az.Resources in a new powershell window: eg. 'Install-Module Az.Resources -Repository PSGallery -Force'") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } if ($null -eq $azAccountModule) { @@ -185,7 +185,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule) -or ($null - Write-Host("Could not import Az.Accounts...") -ForegroundColor Red Write-Host("Close other powershell logins and try installing the latest modules for Az.Accounts in a new powershell window: eg. 'Install-Module Az.Accounts -Repository PSGallery -Force'") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } @@ -196,7 +196,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule) -or ($null - catch { Write-Host("Could not import Az.OperationalInsights... Please reinstall this Module") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } @@ -204,14 +204,14 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule) -or ($null - 2 { Write-Host("") Stop-Transcript - exit + exit 1 } } } if ([string]::IsNullOrEmpty($clusterResourceId)) { Write-Host("Specified Azure Arc enabled Kubernetes ClusterResourceId should not be NULL or empty") -ForegroundColor Red - exit + exit 1 } if ([string]::IsNullOrEmpty($kubeContext)) { @@ -232,7 +232,7 @@ if ($clusterResourceId.StartsWith("/") -eq $false) { if ($clusterResourceId.Split("/").Length -ne 9) { Write-Host("Provided Cluster Resource Id is not in expected format") -ForegroundColor Red - exit + exit 1 } if (($clusterResourceId.ToLower().Contains("microsoft.kubernetes/connectedclusters") -ne $true) -and @@ -240,7 +240,7 @@ if (($clusterResourceId.ToLower().Contains("microsoft.kubernetes/connectedcluste ($clusterResourceId.ToLower().Contains("microsoft.containerservice/managedclusters") -ne $true) ) { Write-Host("Provided cluster ResourceId is not supported cluster type: $clusterResourceId") -ForegroundColor Red - exit + exit 1 } if (([string]::IsNullOrEmpty($servicePrincipalClientId) -eq $false) -and @@ -305,7 +305,7 @@ if ($null -eq $account.Account) { Write-Host("Could not select subscription with ID : " + $clusterSubscriptionId + ". Please make sure the ID you entered is correct and you have access to the cluster" ) -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } } else { @@ -325,7 +325,7 @@ else { Write-Host("Could not select subscription with ID : " + $clusterSubscriptionId + ". 
Please make sure the ID you entered is correct and you have access to the cluster" ) -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } } } @@ -335,7 +335,7 @@ Write-Host("Checking specified Azure Managed cluster resource exists and got acc $clusterResource = Get-AzResource -ResourceId $clusterResourceId if ($null -eq $clusterResource) { Write-Host("specified Azure Managed cluster resource id either you dont have access or doesnt exist") -ForegroundColor Red - exit + exit 1 } $clusterRegion = $clusterResource.Location.ToLower() @@ -344,7 +344,7 @@ if ($isArcK8sCluster -eq $true) { $clusterIdentity = $clusterResource.identity.type.ToString().ToLower() if ($clusterIdentity.contains("systemassigned") -eq $false) { Write-Host("Identity of Azure Arc enabled Kubernetes cluster should be systemassigned but it has identity: $clusterIdentity") -ForegroundColor Red - exit + exit 1 } } @@ -450,7 +450,7 @@ else { Write-Host("using specified Log Analytics Workspace ResourceId: '" + $workspaceResourceId + "' ") if ([string]::IsNullOrEmpty($workspaceResourceId)) { Write-Host("Specified workspaceResourceId should not be NULL or empty") -ForegroundColor Red - exit + exit 1 } $workspaceResourceId = $workspaceResourceId.Trim() if ($workspaceResourceId.EndsWith("/")) { @@ -465,7 +465,7 @@ else { if (($workspaceResourceId.ToLower().Contains("microsoft.operationalinsights/workspaces") -ne $true) -or ($workspaceResourceId.Split("/").Length -ne 9)) { Write-Host("Provided workspace resource id should be in this format /subscriptions//resourceGroups//providers/Microsoft.OperationalInsights/workspaces/") -ForegroundColor Red - exit + exit 1 } $workspaceResourceParts = $workspaceResourceId.Split("/") @@ -482,7 +482,7 @@ else { $WorkspaceInformation = Get-AzOperationalInsightsWorkspace -ResourceGroupName $workspaceResourceGroup -Name $workspaceName -ErrorAction SilentlyContinue if ($null -eq $WorkspaceInformation) { Write-Host("Specified Log Analytics Workspace: '" + $workspaceName + "' in Resource Group: '" + $workspaceResourceGroup + "' in Subscription: '" + $workspaceSubscriptionId + "' does not exist") -ForegroundColor Red - exit + exit 1 } } @@ -520,7 +520,7 @@ try { } catch { Write-Host ("Failed to workspace details. 
Please validate whether you have Log Analytics Contributor role on the workspace error: '" + $Error[0] + "' ") -ForegroundColor Red - exit + exit 1 } diff --git a/scripts/troubleshoot/TroubleshootError.ps1 b/scripts/troubleshoot/TroubleshootError.ps1 index 4c2d95ac6..6d97c53d5 100644 --- a/scripts/troubleshoot/TroubleshootError.ps1 +++ b/scripts/troubleshoot/TroubleshootError.ps1 @@ -35,7 +35,7 @@ if (($null -eq $ClusterResourceId) -or ($ClusterResourceId.Split("/").Length -ne Write-Host("Resource Id Format for AKS cluster is : /subscriptions//resourceGroups//providers/Microsoft.ContainerService/managedClusters/") -ForegroundColor Red Write-Host("Resource Id Format for ARO cluster is : /subscriptions//resourceGroups//providers/Microsoft.ContainerService/openShiftManagedClusters/") -ForegroundColor Red Stop-Transcript - exit + exit 1 } $isClusterAndWorkspaceInDifferentSubs = $false @@ -70,7 +70,7 @@ if (($null -eq $azAksModule) -or ($null -eq $azARGModule) -or ($null -eq $azAcco else { Write-Host("Please re-launch the script with elevated administrator") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } @@ -97,7 +97,7 @@ if (($null -eq $azAksModule) -or ($null -eq $azARGModule) -or ($null -eq $azAcco catch { Write-Host("Close other powershell logins and try installing the latest modules for Az.ResourceGraph in a new powershell window: eg. 'Install-Module Az.ResourceGraph -Force'") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } if ($null -eq $azAksModule) { @@ -108,7 +108,7 @@ if (($null -eq $azAksModule) -or ($null -eq $azARGModule) -or ($null -eq $azAcco catch { Write-Host("Close other powershell logins and try installing the latest modules for Az.Aks in a new powershell window: eg. 'Install-Module Az.Aks -Force'") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } @@ -120,7 +120,7 @@ if (($null -eq $azAksModule) -or ($null -eq $azARGModule) -or ($null -eq $azAcco catch { Write-Host("Close other powershell logins and try installing the latest modules forAz.Accounts in a new powershell window: eg. 'Install-Module Az.Accounts -Repository PSGallery -Force'") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } @@ -132,7 +132,7 @@ if (($null -eq $azAksModule) -or ($null -eq $azARGModule) -or ($null -eq $azAcco catch { Write-Host("Close other powershell logins and try installing the latest modules forAz.Accounts in a new powershell window: eg. 'Install-Module Az.Accounts -Repository PSGallery -Force'") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } @@ -145,7 +145,7 @@ if (($null -eq $azAksModule) -or ($null -eq $azARGModule) -or ($null -eq $azAcco catch { Write-Host("Close other powershell logins and try installing the latest modules for Az.OperationalInsights in a new powershell window: eg. 'Install-Module Az.OperationalInsights -Repository PSGallery -Force'") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } @@ -159,7 +159,7 @@ if (($null -eq $azAksModule) -or ($null -eq $azARGModule) -or ($null -eq $azAcco Write-Host("Could not Import Az.ResourceGraph...") -ForegroundColor Red Write-Host("Close other powershell logins and try installing the latest modules for Az.ResourceGraph in a new powershell window: eg. 
'Install-Module Az.ResourceGraph -Force'") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } @@ -171,7 +171,7 @@ if (($null -eq $azAksModule) -or ($null -eq $azARGModule) -or ($null -eq $azAcco Write-Host("Could not Import Az.Aks...") -ForegroundColor Red Write-Host("Close other powershell logins and try installing the latest modules for Az.Aks in a new powershell window: eg. 'Install-Module Az.Aks -Force'") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } @@ -183,7 +183,7 @@ if (($null -eq $azAksModule) -or ($null -eq $azARGModule) -or ($null -eq $azAcco Write-Host("Could not import Az.Resources...") -ForegroundColor Red Write-Host("Close other powershell logins and try installing the latest modules for Az.Resources in a new powershell window: eg. 'Install-Module Az.Resources -Repository PSGallery -Force'") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } if ($null -eq $azAccountModule) { @@ -194,7 +194,7 @@ if (($null -eq $azAksModule) -or ($null -eq $azARGModule) -or ($null -eq $azAcco Write-Host("Could not import Az.Accounts...") -ForegroundColor Red Write-Host("Close other powershell logins and try installing the latest modules for Az.Accounts in a new powershell window: eg. 'Install-Module Az.Accounts -Repository PSGallery -Force'") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } @@ -205,7 +205,7 @@ if (($null -eq $azAksModule) -or ($null -eq $azARGModule) -or ($null -eq $azAcco catch { Write-Host("Could not import Az.OperationalInsights... Please reinstall this Module") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } @@ -213,7 +213,7 @@ if (($null -eq $azAksModule) -or ($null -eq $azARGModule) -or ($null -eq $azAcco 2 { Write-Host("") Stop-Transcript - exit + exit 1 } } } @@ -277,7 +277,7 @@ if ($null -eq $account.Account) { Write-Host("Could not select subscription with ID : " + $ClusterSubscriptionId + ". Please make sure the SubscriptionId you entered is correct and you have access to the Subscription" ) -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } } else { @@ -297,7 +297,7 @@ else { Write-Host("Could not select subscription with ID : " + $ClusterSubscriptionId + ". Please make sure the SubscriptionId you entered is correct and you have access to the Subscription" ) -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } } } @@ -313,7 +313,7 @@ if ($notPresent) { Write-Host("Could not find RG. 
Please make sure that the resource group name: '" + $ResourceGroupName + "'is correct and you have access to the Resource Group") -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } Write-Host("Successfully checked resource groups details...") -ForegroundColor Green @@ -327,7 +327,7 @@ try { Write-Host("Could not fetch cluster details: Please make sure that the '" + $ClusterType + "' Cluster name: '" + $ClusterName + "' is correct and you have access to the cluster") -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } else { Write-Host("Successfully checked '" + $ClusterType + "' Cluster details...") -ForegroundColor Green @@ -342,7 +342,7 @@ try { Write-Host($AksOptInLink) -ForegroundColor Red; Write-Host(""); Stop-Transcript - exit + exit 1 } $omsagentconfig = $props.addonprofiles.omsagent.config; @@ -364,7 +364,7 @@ try { Write-Host("Could not fetch cluster details: Please make sure that the '" + $ClusterType + "' Cluster name: '" + $ClusterName + "' is correct and you have access to the cluster") -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } $monitorProfile = $ResourceDetail.aroproperties.monitorprofile @@ -373,7 +373,7 @@ try { Write-Host($AksOptInLink) -ForegroundColor Red; Write-Host(""); Stop-Transcript - exit + exit 1 } $LogAnalyticsWorkspaceResourceID = $monitorProfile.workspaceresourceid @@ -385,7 +385,7 @@ catch { Write-Host("Could not fetch cluster details: Please make sure that the '" + $ClusterType + "' Cluster name: '" + $ClusterName + "' is correct and you have access to the cluster") -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } @@ -511,7 +511,7 @@ if ($null -eq $LogAnalyticsWorkspaceResourceID) { } Write-Host("") Stop-Transcript - exit + exit 1 } else { @@ -532,7 +532,7 @@ else { Write-Host("Could not change to Workspace subscriptionId : '" + $workspaceSubscriptionId + "'." ) -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } @@ -557,7 +557,7 @@ else { } Write-Host("") Stop-Transcript - exit + exit 1 } Write-Host("Successfully fetched workspace subcription details...") -ForegroundColor Green Write-Host("") @@ -581,7 +581,7 @@ else { Write-Host("Opt-in - " + $AksOptInLink) -ForegroundColor Red } Stop-Transcript - exit + exit 1 } Write-Host("Successfully fetched workspace resource group...") -ForegroundColor Green Write-Host("") @@ -610,7 +610,7 @@ else { } Write-Host("") Stop-Transcript - exit + exit 1 } $WorkspaceLocation = $WorkspaceInformation.Location @@ -619,7 +619,7 @@ else { Write-Host("Cannot fetch workspace location. Please try again...") -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } $WorkspacePricingTier = $WorkspaceInformation.sku @@ -635,7 +635,7 @@ else { Write-Host("Failed to get the list of solutions onboarded to the workspace. Please make sure that it hasn't been deleted and you have access to it.") -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } try { @@ -647,7 +647,7 @@ else { Write-Host("Failed to get ContainerInsights solution details from the workspace") -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } $isSolutionOnboarded = $WorkspaceIPDetails.Enabled[$ContainerInsightsIndex] @@ -711,7 +711,7 @@ try { if ($WorkspaceUsage.CurrentValue -ge $WorkspaceUsage.Limit) { Write-Host("Workspace usage has reached or over the configured daily cap. 
Please increase the daily cap limits or wait for next reset interval") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } Write-Host("Workspace doesnt have daily cap configured") -ForegroundColor Green @@ -720,7 +720,7 @@ catch { Write-Host("Failed to get usage details of the workspace") -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } @@ -757,7 +757,7 @@ if ("AKS" -eq $ClusterType ) { Write-Host($AksOptInLink) -ForegroundColor Red Write-Host($contactUSMessage) Stop-Transcript - exit + exit 1 } $rsPodStatus = $rsPod.status @@ -778,7 +778,7 @@ if ("AKS" -eq $ClusterType ) { Write-Host($AksOptInLink) -ForegroundColor Red Write-Host($contactUSMessage) Stop-Transcript - exit + exit 1 } Write-Host( "omsagent replicaset pod running OK.") -ForegroundColor Green @@ -786,7 +786,7 @@ if ("AKS" -eq $ClusterType ) { catch { Write-Host ("Failed to get omsagent replicatset pod info using kubectl get rs : '" + $Error[0] + "' ") -ForegroundColor Red Stop-Transcript - exit + exit 1 } Write-Host("Checking whether the omsagent daemonset pod running correctly ...") @@ -795,7 +795,7 @@ if ("AKS" -eq $ClusterType ) { if (($null -eq $ds) -or ($null -eq $ds.Items) -or ($ds.Items.Length -ne 1)) { Write-Host( "omsagent replicaset pod not scheduled or failed to schedule." + $contactUSMessage) Stop-Transcript - exit + exit 1 } $dsStatus = $ds.Items[0].status @@ -809,7 +809,7 @@ if ("AKS" -eq $ClusterType ) { Write-Host($dsStatus) Write-Host($contactUSMessage) Stop-Transcript - exit + exit 1 } Write-Host( "omsagent daemonset pod running OK.") -ForegroundColor Green @@ -817,7 +817,7 @@ if ("AKS" -eq $ClusterType ) { catch { Write-Host ("Failed to execute the script : '" + $Error[0] + "' ") -ForegroundColor Red Stop-Transcript - exit + exit 1 } Write-Host("Checking whether the omsagent heatlhservice running correctly ...") @@ -826,7 +826,7 @@ if ("AKS" -eq $ClusterType ) { if ($healthservice.Items.Length -ne 1) { Write-Host( "omsagent healthservice not scheduled or failed to schedule." + $contactUSMessage) Stop-Transcript - exit + exit 1 } Write-Host( "omsagent healthservice running OK.") -ForegroundColor Green @@ -834,7 +834,7 @@ if ("AKS" -eq $ClusterType ) { catch { Write-Host ("Failed to execute kubectl get services command : '" + $Error[0] + "' ") -ForegroundColor Red Stop-Transcript - exit + exit 1 } if ($isClusterAndWorkspaceInDifferentSubs) { @@ -851,7 +851,7 @@ if ("AKS" -eq $ClusterType ) { catch { Write-Host ("Failed to get workspace details. Please validate whether you have Log Analytics Contributor role on the workspace error: '" + $Error[0] + "' ") -ForegroundColor Red Stop-Transcript - exit + exit 1 } Write-Host("Checking whether the WorkspaceGuid and key matching with configured log analytics workspace ...") @@ -862,7 +862,7 @@ if ("AKS" -eq $ClusterType ) { if ((($workspaceGuidConfiguredOnAgent -eq $workspaceGUID) -and ($workspaceKeyConfiguredOnAgent -eq $workspacePrimarySharedKey)) -eq $false) { Write-Host ("Error - Log Analytics Workspace Guid and key configured on the agent not matching with details of the Workspace. 
Please verify and fix with the correct workspace Guid and Key") -ForegroundColor Red Stop-Transcript - exit + exit 1 } Write-Host("Workspace Guid and Key on the agent matching with the Workspace") -ForegroundColor Green @@ -870,7 +870,7 @@ if ("AKS" -eq $ClusterType ) { catch { Write-Host ("Failed to execute the script : '" + $Error[0] + "' ") -ForegroundColor Red Stop-Transcript - exit + exit 1 } Write-Host("Checking agent version...") @@ -885,7 +885,7 @@ if ("AKS" -eq $ClusterType ) { } catch { Write-Host ("Failed to execute the script : '" + $Error[0] + "' ") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } diff --git a/scripts/troubleshoot/TroubleshootError_AcsEngine.ps1 b/scripts/troubleshoot/TroubleshootError_AcsEngine.ps1 index 1f1e1ba5d..5662d3f79 100644 --- a/scripts/troubleshoot/TroubleshootError_AcsEngine.ps1 +++ b/scripts/troubleshoot/TroubleshootError_AcsEngine.ps1 @@ -45,7 +45,7 @@ if (($null -eq $azureRmProfileModule) -or ($null -eq $azureRmResourcesModule) -o else { Write-Host("Please run the script as an administrator") -ForegroundColor Red Stop-Transcript - exit + exit 1 } $message = "This script will try to install the latest versions of the following Modules : ` @@ -69,7 +69,7 @@ if (($null -eq $azureRmProfileModule) -or ($null -eq $azureRmResourcesModule) -o } catch { Write-Host("Close other powershell logins and try installing the latest modules for AzureRM.profile in a new powershell window: eg. 'Install-Module AzureRM.profile -Repository PSGallery -Force'") -ForegroundColor Red - exit + exit 1 } try { Write-Host("Installing AzureRM.Resources...") @@ -77,7 +77,7 @@ if (($null -eq $azureRmProfileModule) -or ($null -eq $azureRmResourcesModule) -o } catch { Write-Host("Close other powershell logins and try installing the latest modules for AzureRM.Resoureces in a new powershell window: eg. 'Install-Module AzureRM.Resoureces -Repository PSGallery -Force'") -ForegroundColor Red - exit + exit 1 } try { @@ -86,7 +86,7 @@ if (($null -eq $azureRmProfileModule) -or ($null -eq $azureRmResourcesModule) -o } catch { Write-Host("Close other powershell logins and try installing the latest modules for AzureRM.OperationalInsights in a new powershell window: eg. 'Install-Module AzureRM.OperationalInsights -Repository PSGallery -Force'") -ForegroundColor Red - exit + exit 1 } } 1 { @@ -97,7 +97,7 @@ if (($null -eq $azureRmProfileModule) -or ($null -eq $azureRmResourcesModule) -o Write-Host("Could not import AzureRM.profile...") -ForegroundColor Red Write-Host("Close other powershell logins and try installing the latest modules for AzureRM.profile in a new powershell window: eg. 'Install-Module AzureRM.profile -Repository PSGallery -Force'") -ForegroundColor Red Stop-Transcript - exit + exit 1 } try { Import-Module AzureRM.Resources @@ -105,7 +105,7 @@ if (($null -eq $azureRmProfileModule) -or ($null -eq $azureRmResourcesModule) -o catch { Write-Host("Could not import AzureRM.Resources... Please reinstall this Module") -ForegroundColor Red Stop-Transcript - exit + exit 1 } try { Import-Module AzureRM.OperationalInsights @@ -113,7 +113,7 @@ if (($null -eq $azureRmProfileModule) -or ($null -eq $azureRmResourcesModule) -o catch { Write-Host("Could not import AzureRM.OperationalInsights... Please reinstall this Module") -ForegroundColor Red Stop-Transcript - exit + exit 1 } Write-Host("Running troubleshooting script... 
Please reinstall this Module") Write-Host("") @@ -121,7 +121,7 @@ if (($null -eq $azureRmProfileModule) -or ($null -eq $azureRmResourcesModule) -o 2 { Write-Host("") Stop-Transcript - exit + exit 1 } } } @@ -151,7 +151,7 @@ if ($null -eq $account.Account) { Write-Host("Could not select subscription with ID : " + $SubscriptionId + ". Please make sure the SubscriptionId you entered is correct and you have access to the Subscription" ) -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } } else { @@ -171,7 +171,7 @@ else { Write-Host("Could not select subscription with ID : " + $SubscriptionId + ". Please make sure the SubscriptionId you entered is correct and you have access to the Subscription" ) -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } } } @@ -187,7 +187,7 @@ if ($notPresent) { Write-Host("Could not find RG. Please make sure that the resource group name: '" + $ResourceGroupName + "'is correct and you have access to the Resource Group") -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } Write-Host("Successfully checked resource groups details...") -ForegroundColor Green @@ -197,13 +197,13 @@ Write-Host("Successfully checked resource groups details...") -ForegroundColor G if ([string]::IsNullOrEmpty($KubeConfig)) { Write-Host("KubeConfig should not be NULL or empty") -ForegroundColor Red Stop-Transcript - exit + exit 1 } if ((Test-Path $KubeConfig -PathType Leaf) -ne $true) { Write-Host("provided KubeConfig path : '" + $KubeConfig + "' doesnt exist or you dont have read access") -ForegroundColor Red Stop-Transcript - exit + exit 1 } # @@ -249,13 +249,13 @@ foreach ($k8MasterVM in $k8sMasterVMsOrVMSSes) { } else { Write-Host("This Resource group : '" + $ResourceGroupName + "'does not have the AKS-engine or ACS-Engine Kubernetes resources") -ForegroundColor Red - exit + exit 1 } } if ($isKubernetesCluster -eq $false) { Write-Host("Monitoring only supported for AKS-Engine or ACS-Engine with Kubernetes") -ForegroundColor Red - exit + exit 1 } Write-Host("Successfully checked the AKS-Engine or ACS-Engine Kuberentes cluster resources in specified resource group") -ForegroundColor Green @@ -270,7 +270,7 @@ foreach ($k8MasterVM in $k8sMasterVMsOrVMSSes) { if ($null -eq $r) { Write-Host("Get-AzureRmResource for Resource Group: " + $ResourceGroupName + "Resource Name :" + $k8MasterVM.Name + " failed" ) -ForegroundColor Red - exit + exit 1 } if ($null -eq $r.Tags) { @@ -279,7 +279,7 @@ foreach ($k8MasterVM in $k8sMasterVMsOrVMSSes) { Write-Host("Please try to opt out of monitoring and opt-in using the following links:") -ForegroundColor Red Write-Host("Opt-out - " + $OptOutLink) -ForegroundColor Red Write-Host("Opt-in - " + $OptInLink) -ForegroundColor Red - exit + exit 1 } if ($r.Tags.ContainsKey("logAnalyticsWorkspaceResourceId")) { @@ -300,7 +300,7 @@ if ($null -eq $LogAnalyticsWorkspaceResourceID) { Write-Host("There is no existing logAnalyticsWorkspaceResourceId tag on AKS-Engine k8 master nodes or VMSSes so this indicates this cluster not enabled monitoring or tags have been removed" ) -ForegroundColor Red Write-Host("Please try to opt-in for monitoring using the following links:") -ForegroundColor Red Write-Host("Opt-in - " + $OptInLink) -ForegroundColor Red - exit + exit 1 } else { @@ -309,7 +309,7 @@ else { Write-Host("Please add the clusterName tag with the value of clusterName used during the omsagent agent onboarding. 
Refer below link for details:") -ForegroundColor Red Write-Host("Opt-in - " + $OptInLink) -ForegroundColor Red - exit + exit 1 } Write-Host("Configured LogAnalyticsWorkspaceResourceId: : '" + $LogAnalyticsWorkspaceResourceID + "' ") @@ -328,7 +328,7 @@ else { Write-Host("Could not change to Workspace subscriptionId : '" + $workspaceSubscriptionId + "'." ) -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } @@ -347,7 +347,7 @@ else { Write-Host("Opt-in - " + $OptInLink) -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } Write-Host("Successfully fetched workspace subcription details...") -ForegroundColor Green Write-Host("") @@ -364,7 +364,7 @@ else { Write-Host("Opt-out - " + $OptOutLink) -ForegroundColor Red Write-Host("Opt-in - " + $OptInLink) -ForegroundColor Red Stop-Transcript - exit + exit 1 } Write-Host("Successfully fetched workspace resource group...") -ForegroundColor Green Write-Host("") @@ -386,7 +386,7 @@ else { Write-Host("Opt-in - " + $OptInLink) -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } $WorkspaceLocation = $WorkspaceInformation.Location @@ -396,7 +396,7 @@ else { Write-Host("Cannot fetch workspace location. Please try again...") -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } $WorkspacePricingTier = $WorkspaceInformation.sku @@ -413,7 +413,7 @@ else { Write-Host("Failed to get the list of solutions onboarded to the workspace. Please make sure that it hasn't been deleted and you have access to it.") -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } try { @@ -425,7 +425,7 @@ else { Write-Host("Failed to get ContainerInsights solution details from the workspace") -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } $isSolutionOnboarded = $WorkspaceIPDetails.Enabled[$ContainerInsightsIndex] @@ -498,7 +498,7 @@ try { } catch { Write-Host ("Failed to execute the script : '" + $Error[0] + "' ") -ForegroundColor Red Stop-Transcript - exit + exit 1 } Write-Host("") diff --git a/scripts/troubleshoot/TroubleshootError_nonAzureK8s.ps1 b/scripts/troubleshoot/TroubleshootError_nonAzureK8s.ps1 index 14b080b23..76bbad16c 100644 --- a/scripts/troubleshoot/TroubleshootError_nonAzureK8s.ps1 +++ b/scripts/troubleshoot/TroubleshootError_nonAzureK8s.ps1 @@ -47,25 +47,25 @@ Write-Host("LogAnalyticsWorkspaceResourceId: : '" + $azureLogAnalyticsWorkspaceR if (($azureLogAnalyticsWorkspaceResourceId.ToLower().Contains("microsoft.operationalinsights/workspaces") -ne $true) -or ($azureLogAnalyticsWorkspaceResourceId.Split("/").Length -ne 9)) { Write-Host("Provided Azure Log Analytics resource id should be in this format /subscriptions//resourceGroups//providers/Microsoft.OperationalInsights/workspaces/") -ForegroundColor Red Stop-Transcript - exit + exit 1 } if ([string]::IsNullOrEmpty($kubeConfig)) { Write-Host("kubeConfig should not be NULL or empty") -ForegroundColor Red Stop-Transcript - exit + exit 1 } if ((Test-Path $kubeConfig -PathType Leaf) -ne $true) { Write-Host("provided kubeConfig path : '" + $kubeConfig + "' doesnt exist or you dont have read access") -ForegroundColor Red Stop-Transcript - exit + exit 1 } if ([string]::IsNullOrEmpty($clusterContextInKubeconfig)) { Write-Host("provide clusterContext should be valid context in the provided kubeconfig") -ForegroundColor Red Stop-Transcript - exit + exit 1 } # checks the all required Powershell modules exist and if not exists, request the user permission to install @@ -92,7 +92,7 @@ if (($null -eq $azAccountModule) -or 
($null -eq $azResourcesModule) -or ($null - else { Write-Host("Please re-launch the script with elevated administrator") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } @@ -120,7 +120,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule) -or ($null - catch { Write-Host("Close other powershell logins and try installing the latest modules forAz.Accounts in a new powershell window: eg. 'Install-Module Az.Accounts -Repository PSGallery -Force'") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } @@ -132,7 +132,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule) -or ($null - catch { Write-Host("Close other powershell logins and try installing the latest modules forAz.Accounts in a new powershell window: eg. 'Install-Module Az.Accounts -Repository PSGallery -Force'") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } @@ -145,7 +145,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule) -or ($null - catch { Write-Host("Close other powershell logins and try installing the latest modules for Az.OperationalInsights in a new powershell window: eg. 'Install-Module Az.OperationalInsights -Repository PSGallery -Force'") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } @@ -160,7 +160,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule) -or ($null - Write-Host("Could not import Az.Resources...") -ForegroundColor Red Write-Host("Close other powershell logins and try installing the latest modules for Az.Resources in a new powershell window: eg. 'Install-Module Az.Resources -Repository PSGallery -Force'") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } if ($null -eq $azAccountModule) { @@ -171,7 +171,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule) -or ($null - Write-Host("Could not import Az.Accounts...") -ForegroundColor Red Write-Host("Close other powershell logins and try installing the latest modules for Az.Accounts in a new powershell window: eg. 'Install-Module Az.Accounts -Repository PSGallery -Force'") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } @@ -182,7 +182,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule) -or ($null - catch { Write-Host("Could not import Az.OperationalInsights... Please reinstall this Module") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } @@ -190,7 +190,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule) -or ($null - 2 { Write-Host("") Stop-Transcript - exit + exit 1 } } } @@ -222,7 +222,7 @@ if ($null -eq $account.Account) { Write-Host("Could not select subscription with ID : " + $workspaceSubscriptionId + ". Please make sure the SubscriptionId you entered is correct and you have access to the cluster" ) -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } } else { @@ -242,7 +242,7 @@ else { Write-Host("Could not select subscription with ID : " + $workspaceSubscriptionId + ". Please make sure the ID you entered is correct and you have access to the cluster" ) -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } } } @@ -253,7 +253,7 @@ $workspaceResource = Get-AzResource -ResourceId $azureLogAnalyticsWorkspaceResou if ($null -eq $workspaceResource) { Write-Host("specified Azure Log Analytics resource id: " + $azureLogAnalyticsWorkspaceResourceId + ". 
either you dont have access or doesnt exist") -ForegroundColor Red Stop-Transcript - exit + exit 1 } # @@ -272,7 +272,7 @@ catch { Write-Host("Opt-in - " + $OptInLink) -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } $WorkspaceLocation = $WorkspaceInformation.Location @@ -281,7 +281,7 @@ if ($null -eq $WorkspaceLocation) { Write-Host("Cannot fetch workspace location. Please try again...") -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } $WorkspacePricingTier = $WorkspaceInformation.sku @@ -297,7 +297,7 @@ catch { Write-Host("Failed to get the list of solutions onboarded to the workspace. Please make sure that it hasn't been deleted and you have access to it.") -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } try { @@ -309,7 +309,7 @@ catch { Write-Host("Failed to get ContainerInsights solution details from the workspace") -ForegroundColor Red Write-Host("") Stop-Transcript - exit + exit 1 } $isSolutionOnboarded = $WorkspaceIPDetails.Enabled[$ContainerInsightsIndex] @@ -317,7 +317,7 @@ if ($isSolutionOnboarded) { if ($WorkspacePricingTier -eq "Free") { Write-Host("Pricing tier of the configured LogAnalytics workspace is Free so you may need to upgrade to pricing tier to non-Free") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } else { @@ -356,13 +356,13 @@ else { Write-Host ("Template deployment failed with an error: '" + $Error[0] + "' ") -ForegroundColor Red Write-Host($contactUSMessage) -ForegroundColor Red Stop-Transcript - exit + exit 1 } } else { Write-Host("The container health solution isn't onboarded to your cluster. This required for the monitoring to work.") -ForegroundColor Red Stop-Transcript - exit + exit 1 } } @@ -382,7 +382,7 @@ try { if ($null -eq $rsPod) { Write-Host( "omsagent replicaset pod not scheduled or failed to scheduled." + $contactUSMessage) -ForegroundColor Red Stop-Transcript - exit + exit 1 } $rsPodStatus = $rsPod.status if ((($rsPodStatus.availableReplicas -eq 1) -and @@ -393,7 +393,7 @@ try { Write-Host($rsPodStatus) Write-Host($contactUSMessage) Stop-Transcript - exit + exit 1 } Write-Host( "omsagent replicaset pod running OK.") -ForegroundColor Green @@ -401,7 +401,7 @@ try { catch { Write-Host ("Failed to get omsagent replicatset pod info using kubectl get rs : '" + $Error[0] + "' ") -ForegroundColor Red Stop-Transcript - exit + exit 1 } Write-Host("Checking whether the omsagent daemonset pod running correctly ...") @@ -410,7 +410,7 @@ try { if ($ds.Items.Length -ne 1) { Write-Host( "omsagent replicaset pod not scheduled or failed to schedule." + $contactUSMessage) -ForegroundColor Red Stop-Transcript - exit + exit 1 } $dsStatus = $ds.Items[0].status @@ -424,7 +424,7 @@ try { Write-Host($rsPodStatus) Write-Host($contactUSMessage) Stop-Transcript - exit + exit 1 } Write-Host( "omsagent daemonset pod running OK.") -ForegroundColor Green @@ -432,7 +432,7 @@ try { catch { Write-Host ("Failed to execute the script : '" + $Error[0] + "' ") -ForegroundColor Red Stop-Transcript - exit + exit 1 } Write-Host("Checking whether the omsagent heatlhservice running correctly ...") @@ -441,7 +441,7 @@ try { if ($healthservice.Items.Length -ne 1) { Write-Host( "omsagent healthservice not scheduled or failed to schedule." 
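A note on the `exit` → `exit 1` change repeated throughout these troubleshooting and onboarding scripts: a bare `exit` in PowerShell returns status 0, so CI jobs or wrapper tooling invoking the script cannot distinguish a diagnosed failure from a clean run. A minimal sketch of why the nonzero code matters, in Go (the language of this repo's plugin code); the `pwsh` invocation and argument wiring are illustrative assumptions, not part of this patch:

```go
package main

import (
	"errors"
	"fmt"
	"os"
	"os/exec"
)

func main() {
	// Hypothetical automation around the script: with a bare `exit`,
	// PowerShell reports status 0 and this wrapper would declare success
	// even when the script printed a fatal error; `exit 1` surfaces it.
	cmd := exec.Command("pwsh", "-File", "TroubleshootError.ps1",
		"-ClusterResourceId", os.Args[1])
	cmd.Stdout, cmd.Stderr = os.Stdout, os.Stderr

	if err := cmd.Run(); err != nil {
		var exitErr *exec.ExitError
		if errors.As(err, &exitErr) {
			fmt.Fprintf(os.Stderr, "script failed with exit code %d\n", exitErr.ExitCode())
			os.Exit(exitErr.ExitCode())
		}
		fmt.Fprintf(os.Stderr, "could not start script: %v\n", err)
		os.Exit(1)
	}
	fmt.Println("script reported success")
}
```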
+ $contactUSMessage) Stop-Transcript - exit + exit 1 } Write-Host( "omsagent healthservice pod running OK.") -ForegroundColor Green @@ -449,7 +449,7 @@ try { catch { Write-Host ("Failed to execute kubectl get services command : '" + $Error[0] + "' ") -ForegroundColor Red Stop-Transcript - exit + exit 1 } Write-Host("Retrieving WorkspaceGUID and WorkspacePrimaryKey of the workspace : " + $WorkspaceInformation.Name) @@ -462,7 +462,7 @@ try { catch { Write-Host ("Failed to workspace details. Please validate whether you have Log Analytics Contributor role on the workspace error: '" + $Error[0] + "' ") -ForegroundColor Red Stop-Transcript - exit + exit 1 } Write-Host("Checking whether the WorkspaceGuid and key matching with configured log analytics workspace ...") @@ -473,7 +473,7 @@ try { if ((($workspaceGuidConfiguredOnAgent -eq $workspaceGUID) -and ($workspaceKeyConfiguredOnAgent -eq $workspacePrimarySharedKey)) -eq $false) { Write-Host ("Error - Log Analytics Workspace Guid and key configured on the agent not matching with details of the Workspace. Please verify and fix with the correct workspace Guid and Key") -ForegroundColor Red Stop-Transcript - exit + exit 1 } Write-Host("Workspace Guid and Key on the agent matching with the Workspace") -ForegroundColor Green @@ -481,7 +481,7 @@ try { catch { Write-Host ("Failed to execute the script : '" + $Error[0] + "' ") -ForegroundColor Red Stop-Transcript - exit + exit 1 } Write-Host("Checking agent version...") @@ -497,7 +497,7 @@ try { catch { Write-Host ("Failed to execute the script : '" + $Error[0] + "' ") -ForegroundColor Red Stop-Transcript - exit + exit 1 } Write-Host("resetting cluster context back, what it was before") diff --git a/source/plugins/go/src/oms.go b/source/plugins/go/src/oms.go index fbee1dd75..407ab3611 100644 --- a/source/plugins/go/src/oms.go +++ b/source/plugins/go/src/oms.go @@ -964,7 +964,7 @@ func PostTelegrafMetricsToLA(telegrafRecords []map[interface{}]interface{}) int if er != nil { Log("Error::mdsd::Failed to write to mdsd %d records after %s. Will retry ... error : %s", len(msgPackEntries), elapsed, er.Error()) - UpdateNumTelegrafMetricsSentTelemetry(0, 1, 0) + UpdateNumTelegrafMetricsSentTelemetry(0, 1, 0, 0) if MdsdInsightsMetricsMsgpUnixSocketClient != nil { MdsdInsightsMetricsMsgpUnixSocketClient.Close() MdsdInsightsMetricsMsgpUnixSocketClient = nil @@ -976,7 +976,7 @@ func PostTelegrafMetricsToLA(telegrafRecords []map[interface{}]interface{}) int return output.FLB_RETRY } else { numTelegrafMetricsRecords := len(msgPackEntries) - UpdateNumTelegrafMetricsSentTelemetry(numTelegrafMetricsRecords, 0, 0) + UpdateNumTelegrafMetricsSentTelemetry(numTelegrafMetricsRecords, 0, 0, 0) Log("Success::mdsd::Successfully flushed %d telegraf metrics records that was %d bytes to mdsd in %s ", numTelegrafMetricsRecords, bts, elapsed) } } @@ -985,9 +985,13 @@ func PostTelegrafMetricsToLA(telegrafRecords []map[interface{}]interface{}) int var metrics []laTelegrafMetric var i int + numWinMetricsWithTagsSize64KBorMore := 0 for i = 0; i < len(laMetrics); i++ { metrics = append(metrics, *laMetrics[i]) + if len(*&laMetrics[i].Tags) >= (64 * 1024) { + numWinMetricsWithTagsSize64KBorMore += 1 + } } laTelegrafMetrics := InsightsMetricsBlob{ @@ -1039,7 +1043,7 @@ func PostTelegrafMetricsToLA(telegrafRecords []map[interface{}]interface{}) int if err != nil { message := fmt.Sprintf("PostTelegrafMetricsToLA::Error:(retriable) when sending %v metrics. 
duration:%v err:%q \n", len(laMetrics), elapsed, err.Error()) Log(message) - UpdateNumTelegrafMetricsSentTelemetry(0, 1, 0) + UpdateNumTelegrafMetricsSentTelemetry(0, 1, 0, 0) return output.FLB_RETRY } @@ -1048,7 +1052,7 @@ func PostTelegrafMetricsToLA(telegrafRecords []map[interface{}]interface{}) int Log("PostTelegrafMetricsToLA::Error:(retriable) RequestID %s Response Status %v Status Code %v", reqID, resp.Status, resp.StatusCode) } if resp != nil && resp.StatusCode == 429 { - UpdateNumTelegrafMetricsSentTelemetry(0, 1, 1) + UpdateNumTelegrafMetricsSentTelemetry(0, 1, 1, 0) } return output.FLB_RETRY } @@ -1056,18 +1060,19 @@ func PostTelegrafMetricsToLA(telegrafRecords []map[interface{}]interface{}) int defer resp.Body.Close() numMetrics := len(laMetrics) - UpdateNumTelegrafMetricsSentTelemetry(numMetrics, 0, 0) + UpdateNumTelegrafMetricsSentTelemetry(numMetrics, 0, 0, numWinMetricsWithTagsSize64KBorMore) Log("PostTelegrafMetricsToLA::Info:Successfully flushed %v records in %v", numMetrics, elapsed) } return output.FLB_OK } -func UpdateNumTelegrafMetricsSentTelemetry(numMetricsSent int, numSendErrors int, numSend429Errors int) { +func UpdateNumTelegrafMetricsSentTelemetry(numMetricsSent int, numSendErrors int, numSend429Errors int, numWinMetricswith64KBorMoreSize int) { ContainerLogTelemetryMutex.Lock() TelegrafMetricsSentCount += float64(numMetricsSent) TelegrafMetricsSendErrorCount += float64(numSendErrors) TelegrafMetricsSend429ErrorCount += float64(numSend429Errors) + WinTelegrafMetricsCountWithTagsSize64KBorMore += float64(numWinMetricswith64KBorMoreSize) ContainerLogTelemetryMutex.Unlock() } diff --git a/source/plugins/go/src/telemetry.go b/source/plugins/go/src/telemetry.go index b344f4ac8..b4f8ab89d 100644 --- a/source/plugins/go/src/telemetry.go +++ b/source/plugins/go/src/telemetry.go @@ -32,6 +32,8 @@ var ( TelemetryClient appinsights.TelemetryClient // ContainerLogTelemetryTicker sends telemetry periodically ContainerLogTelemetryTicker *time.Ticker + //Tracks the number of windows telegraf metrics count with Tags size 64KB or more between telemetry ticker periods (uses ContainerLogTelemetryTicker) + WinTelegrafMetricsCountWithTagsSize64KBorMore float64 //Tracks the number of telegraf metrics sent successfully between telemetry ticker periods (uses ContainerLogTelemetryTicker) TelegrafMetricsSentCount float64 //Tracks the number of send errors between telemetry ticker periods (uses ContainerLogTelemetryTicker) @@ -78,6 +80,7 @@ const ( metricNameNumberofTelegrafMetricsSentSuccessfully = "TelegrafMetricsSentCount" metricNameNumberofSendErrorsTelegrafMetrics = "TelegrafMetricsSendErrorCount" metricNameNumberofSend429ErrorsTelegrafMetrics = "TelegrafMetricsSend429ErrorCount" + metricNameNumberofWinTelegrafMetricsWithTagsSize64KBorMore = "WinTelegrafMetricsCountWithTagsSize64KBorMore" metricNameErrorCountContainerLogsSendErrorsToMDSDFromFluent = "ContainerLogs2MdsdSendErrorCount" metricNameErrorCountContainerLogsMDSDClientCreateError = "ContainerLogsMdsdClientCreateErrorCount" metricNameErrorCountInsightsMetricsMDSDClientCreateError = "InsightsMetricsMDSDClientCreateErrorsCount" @@ -117,6 +120,7 @@ func SendContainerLogPluginMetrics(telemetryPushIntervalProperty string) { telegrafMetricsSentCount := TelegrafMetricsSentCount telegrafMetricsSendErrorCount := TelegrafMetricsSendErrorCount telegrafMetricsSend429ErrorCount := TelegrafMetricsSend429ErrorCount + winTelegrafMetricsCountWithTagsSize64KBorMore := WinTelegrafMetricsCountWithTagsSize64KBorMore 
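The `UpdateNumTelegrafMetricsSentTelemetry` signature change above threads a fourth counter (Windows metrics whose tags payload is 64 KB or larger) through the same mutex-guarded accumulation the existing counters use, and the telemetry ticker snapshots and resets all of them together. A self-contained sketch of that pattern with simplified, hypothetical names — writers increment under the lock; the report loop snapshots, resets, and emits:

```go
package main

import (
	"fmt"
	"sync"
	"time"
)

var (
	mu                      sync.Mutex
	metricsSent             float64
	winMetricsWithLargeTags float64 // tags payload >= 64 KB
)

// recordSend mirrors the role of UpdateNumTelegrafMetricsSentTelemetry:
// every counter update happens inside one critical section.
func recordSend(sent, largeTag int) {
	mu.Lock()
	metricsSent += float64(sent)
	winMetricsWithLargeTags += float64(largeTag)
	mu.Unlock()
}

func reportLoop(interval time.Duration) {
	for range time.Tick(interval) {
		mu.Lock()
		sent, large := metricsSent, winMetricsWithLargeTags
		metricsSent, winMetricsWithLargeTags = 0, 0 // reset for the next window
		mu.Unlock()

		fmt.Printf("TelegrafMetricsSentCount=%v\n", sent)
		if large > 0 { // like the patch, emit the 64KB metric only when non-zero
			fmt.Printf("WinTelegrafMetricsCountWithTagsSize64KBorMore=%v\n", large)
		}
	}
}

func main() {
	go reportLoop(time.Second)
	recordSend(10, 1)
	time.Sleep(2 * time.Second)
}
```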
containerLogsSendErrorsToMDSDFromFluent := ContainerLogsSendErrorsToMDSDFromFluent containerLogsMDSDClientCreateErrors := ContainerLogsMDSDClientCreateErrors containerLogsSendErrorsToADXFromFluent := ContainerLogsSendErrorsToADXFromFluent @@ -133,6 +137,7 @@ func SendContainerLogPluginMetrics(telemetryPushIntervalProperty string) { TelegrafMetricsSentCount = 0.0 TelegrafMetricsSendErrorCount = 0.0 TelegrafMetricsSend429ErrorCount = 0.0 + WinTelegrafMetricsCountWithTagsSize64KBorMore = 0.0 FlushedRecordsCount = 0.0 FlushedRecordsSize = 0.0 FlushedRecordsTimeTaken = 0.0 @@ -227,6 +232,9 @@ func SendContainerLogPluginMetrics(telemetryPushIntervalProperty string) { if kubeMonEventsMDSDClientCreateErrors > 0.0 { TelemetryClient.Track(appinsights.NewMetricTelemetry(metricNameErrorCountKubeMonEventsMDSDClientCreateError, kubeMonEventsMDSDClientCreateErrors)) } + if winTelegrafMetricsCountWithTagsSize64KBorMore > 0.0 { + TelemetryClient.Track(appinsights.NewMetricTelemetry(metricNameNumberofWinTelegrafMetricsWithTagsSize64KBorMore, winTelegrafMetricsCountWithTagsSize64KBorMore)) + } if ContainerLogRecordCountWithEmptyTimeStamp > 0.0 { TelemetryClient.Track(appinsights.NewMetricTelemetry(metricNameContainerLogRecordCountWithEmptyTimeStamp, containerLogRecordCountWithEmptyTimeStamp)) } diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index 69da56488..b9516c2ce 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -110,7 +110,7 @@ class Constants CONTAINER_INVENTORY_DATA_TYPE = "CONTAINER_INVENTORY_BLOB" CONTAINER_NODE_INVENTORY_DATA_TYPE = "CONTAINER_NODE_INVENTORY_BLOB" PERF_DATA_TYPE = "LINUX_PERF_BLOB" - INSIGHTS_METRICS_DATA_TYPE = "INSIGHTS_METRICS_BLOB" + INSIGHTS_METRICS_DATA_TYPE = "INSIGHTS_METRICS_BLOB" KUBE_SERVICES_DATA_TYPE = "KUBE_SERVICES_BLOB" KUBE_POD_INVENTORY_DATA_TYPE = "KUBE_POD_INVENTORY_BLOB" KUBE_NODE_INVENTORY_DATA_TYPE = "KUBE_NODE_INVENTORY_BLOB" @@ -119,17 +119,21 @@ class Constants KUBE_MON_AGENT_EVENTS_DATA_TYPE = "KUBE_MON_AGENT_EVENTS_BLOB" KUBE_HEALTH_DATA_TYPE = "KUBE_HEALTH_BLOB" CONTAINERLOGV2_DATA_TYPE = "CONTAINERINSIGHTS_CONTAINERLOGV2" - CONTAINERLOG_DATA_TYPE = "CONTAINER_LOG_BLOB" + CONTAINERLOG_DATA_TYPE = "CONTAINER_LOG_BLOB" #ContainerInsights Extension (AMCS) CI_EXTENSION_NAME = "ContainerInsights" - CI_EXTENSION_VERSION = "1" + CI_EXTENSION_VERSION = "1" #Current CI extension config size is ~5KB and going with 20KB to handle any future scenarios CI_EXTENSION_CONFIG_MAX_BYTES = 20480 - ONEAGENT_FLUENT_SOCKET_NAME = "/var/run/mdsd/default_fluent.socket" + ONEAGENT_FLUENT_SOCKET_NAME = "/var/run/mdsd/default_fluent.socket" #Tag prefix for output stream EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX = "dcr-" LINUX_LOG_PATH = $in_unit_test.nil? ? "/var/opt/microsoft/docker-cimprov/log/" : "./" WINDOWS_LOG_PATH = $in_unit_test.nil? ? 
"/etc/omsagentwindows/" : "./" + + #This is for telemetry to track if any of the windows customer has any of the field size >= 64KB + #To evaluate switching to Windows AMA 64KB impacts any existing customers + MAX_RECORD_OR_FIELD_SIZE_FOR_TELEMETRY = 65536 end diff --git a/source/plugins/ruby/in_cadvisor_perf.rb b/source/plugins/ruby/in_cadvisor_perf.rb index 862e88e44..aba24ecc2 100644 --- a/source/plugins/ruby/in_cadvisor_perf.rb +++ b/source/plugins/ruby/in_cadvisor_perf.rb @@ -64,12 +64,12 @@ def enumerate() begin eventStream = Fluent::MultiEventStream.new insightsMetricsEventStream = Fluent::MultiEventStream.new - metricData = CAdvisorMetricsAPIClient.getMetrics(winNode: nil, metricTime: batchTime ) + metricData = CAdvisorMetricsAPIClient.getMetrics(winNode: nil, metricTime: batchTime) metricData.each do |record| eventStream.add(time, record) if record end - if ExtensionUtils.isAADMSIAuthMode() + if ExtensionUtils.isAADMSIAuthMode() && !@@isWindows.nil? && @@isWindows == false $log.info("in_cadvisor_perf::enumerate: AAD AUTH MSI MODE") if @tag.nil? || !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) @tag = ExtensionUtils.getOutputStreamId(Constants::PERF_DATA_TYPE) @@ -77,7 +77,7 @@ def enumerate() if @insightsmetricstag.nil? || !@insightsmetricstag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) @insightsmetricstag = ExtensionUtils.getOutputStreamId(Constants::INSIGHTS_METRICS_DATA_TYPE) end - $log.info("in_cadvisor_perf::enumerate: using perf tag -#{@tag} @ #{Time.now.utc.iso8601}") + $log.info("in_cadvisor_perf::enumerate: using perf tag -#{@tag} @ #{Time.now.utc.iso8601}") $log.info("in_cadvisor_perf::enumerate: using insightsmetrics tag -#{@insightsmetricstag} @ #{Time.now.utc.iso8601}") end router.emit_stream(@tag, eventStream) if eventStream @@ -95,9 +95,9 @@ def enumerate() containerGPUusageInsightsMetricsDataItems = [] containerGPUusageInsightsMetricsDataItems.concat(CAdvisorMetricsAPIClient.getInsightsMetrics(winNode: nil, metricTime: batchTime)) - containerGPUusageInsightsMetricsDataItems.each do |insightsMetricsRecord| - insightsMetricsEventStream.add(time, insightsMetricsRecord) if insightsMetricsRecord - end + containerGPUusageInsightsMetricsDataItems.each do |insightsMetricsRecord| + insightsMetricsEventStream.add(time, insightsMetricsRecord) if insightsMetricsRecord + end router.emit_stream(@insightsmetricstag, insightsMetricsEventStream) if insightsMetricsEventStream router.emit_stream(@mdmtag, insightsMetricsEventStream) if insightsMetricsEventStream diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index a32a32769..abbfe94a1 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -1,17 +1,17 @@ #!/usr/local/bin/ruby # frozen_string_literal: true -require 'fluent/plugin/input' +require "fluent/plugin/input" module Fluent::Plugin class Kube_nodeInventory_Input < Input Fluent::Plugin.register_input("kube_nodes", self) - def initialize (kubernetesApiClient=nil, - applicationInsightsUtility=nil, - extensionUtils=nil, - env=nil, - telemetry_flush_interval=nil) + def initialize(kubernetesApiClient = nil, + applicationInsightsUtility = nil, + extensionUtils = nil, + env = nil, + telemetry_flush_interval = nil) super() require "yaml" @@ -36,8 +36,7 @@ def initialize (kubernetesApiClient=nil, @@promConfigMountPath = "/etc/config/settings/prometheus-data-collection-settings" @@osmConfigMountPath = "/etc/config/osm-settings/osm-metric-collection-configuration" 
@@AzStackCloudFileName = "/etc/kubernetes/host/azurestackcloud.json" - - + @@rsPromInterval = @env["TELEMETRY_RS_PROM_INTERVAL"] @@rsPromFieldPassCount = @env["TELEMETRY_RS_PROM_FIELDPASS_LENGTH"] @@rsPromFieldDropCount = @env["TELEMETRY_RS_PROM_FIELDDROP_LENGTH"] @@ -119,6 +118,7 @@ def enumerate nodeInventory = nil currentTime = Time.now batchTime = currentTime.utc.iso8601 + nodeCount = 0 @nodesAPIE2ELatencyMs = 0 @nodeInventoryE2EProcessingLatencyMs = 0 @@ -138,7 +138,7 @@ def enumerate if @tag.nil? || !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) @tag = @extensionUtils.getOutputStreamId(Constants::KUBE_NODE_INVENTORY_DATA_TYPE) end - $log.info("in_kube_nodes::enumerate: using perf tag -#{@kubeperfTag} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_nodes::enumerate: using perf tag -#{@kubeperfTag} @ #{Time.now.utc.iso8601}") $log.info("in_kube_nodes::enumerate: using insightsmetrics tag -#{@insightsMetricsTag} @ #{Time.now.utc.iso8601}") $log.info("in_kube_nodes::enumerate: using containernodeinventory tag -#{@ContainerNodeInventoryTag} @ #{Time.now.utc.iso8601}") $log.info("in_kube_nodes::enumerate: using kubenodeinventory tag -#{@tag} @ #{Time.now.utc.iso8601}") @@ -155,6 +155,7 @@ def enumerate nodesAPIChunkEndTime = (Time.now.to_f * 1000).to_i @nodesAPIE2ELatencyMs = (nodesAPIChunkEndTime - nodesAPIChunkStartTime) if (!nodeInventory.nil? && !nodeInventory.empty? && nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) + nodeCount += nodeInventory["items"].length $log.info("in_kube_nodes::enumerate : number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") parse_and_emit_records(nodeInventory, batchTime) else @@ -168,6 +169,7 @@ def enumerate nodesAPIChunkEndTime = (Time.now.to_f * 1000).to_i @nodesAPIE2ELatencyMs = @nodesAPIE2ELatencyMs + (nodesAPIChunkEndTime - nodesAPIChunkStartTime) if (!nodeInventory.nil? && !nodeInventory.empty? && nodeInventory.key?("items") && !nodeInventory["items"].nil? && !nodeInventory["items"].empty?) 
+ nodeCount += nodeInventory["items"].length $log.info("in_kube_nodes::enumerate : number of node items :#{nodeInventory["items"].length} from Kube API @ #{Time.now.utc.iso8601}") parse_and_emit_records(nodeInventory, batchTime) else @@ -181,6 +183,7 @@ def enumerate if (timeDifferenceInMinutes >= @TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) @applicationInsightsUtility.sendMetricTelemetry("NodeInventoryE2EProcessingLatencyMs", @nodeInventoryE2EProcessingLatencyMs, {}) @applicationInsightsUtility.sendMetricTelemetry("NodesAPIE2ELatencyMs", @nodesAPIE2ELatencyMs, {}) + @applicationInsightsUtility.sendMetricTelemetry("NodeCount", nodeCount, {}) @@nodeInventoryLatencyTelemetryTimeTracker = DateTime.now.to_time.to_i end # Setting this to nil so that we dont hold memory until GC kicks in @@ -208,9 +211,9 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) nodeInventoryRecord = getNodeInventoryRecord(item, batchTime) eventStream.add(emitTime, nodeInventoryRecord) if nodeInventoryRecord if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && eventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE - $log.info("in_kube_node::parse_and_emit_records: number of node inventory records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_node::parse_and_emit_records: number of node inventory records emitted #{eventStream.count} @ #{Time.now.utc.iso8601}") router.emit_stream(@tag, eventStream) if eventStream - $log.info("in_kube_node::parse_and_emit_records: number of mdm node inventory records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_node::parse_and_emit_records: number of mdm node inventory records emitted #{eventStream.count} @ #{Time.now.utc.iso8601}") router.emit_stream(@MDMKubeNodeInventoryTag, eventStream) if eventStream if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) $log.info("kubeNodeInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") @@ -223,7 +226,7 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) containerNodeInventoryEventStream.add(emitTime, containerNodeInventoryRecord) if containerNodeInventoryRecord if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && containerNodeInventoryEventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE - $log.info("in_kube_node::parse_and_emit_records: number of container node inventory records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_node::parse_and_emit_records: number of container node inventory records emitted #{containerNodeInventoryEventStream.count} @ #{Time.now.utc.iso8601}") router.emit_stream(@ContainerNodeInventoryTag, containerNodeInventoryEventStream) if containerNodeInventoryEventStream containerNodeInventoryEventStream = Fluent::MultiEventStream.new if (!@@istestvar.nil? && !@@istestvar.empty? 
&& @@istestvar.casecmp("true") == 0) @@ -272,7 +275,7 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) kubePerfEventStream.add(emitTime, metricRecord) if metricRecord end if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && kubePerfEventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE - $log.info("in_kube_nodes::parse_and_emit_records: number of node perf metric records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_nodes::parse_and_emit_records: number of node perf metric records emitted #{kubePerfEventStream.count} @ #{Time.now.utc.iso8601}") router.emit_stream(@kubeperfTag, kubePerfEventStream) if kubePerfEventStream kubePerfEventStream = Fluent::MultiEventStream.new if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) @@ -302,7 +305,7 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) insightsMetricsEventStream.add(emitTime, insightsMetricsRecord) if insightsMetricsRecord end if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && insightsMetricsEventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE - $log.info("in_kube_nodes::parse_and_emit_records: number of GPU node perf metric records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_nodes::parse_and_emit_records: number of GPU node perf metric records emitted #{insightsMetricsEventStream.count} @ #{Time.now.utc.iso8601}") router.emit_stream(@insightsMetricsTag, insightsMetricsEventStream) if insightsMetricsEventStream insightsMetricsEventStream = Fluent::MultiEventStream.new if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) @@ -572,12 +575,12 @@ def getNodeTelemetryProps(item) return properties end end # Kube_Node_Input + class NodeStatsCache # inner class for caching implementation (CPU and memory caching is handled the exact same way, so logic to do so is moved to a private inner class) # (to reduce code duplication) class NodeCache - - @@RECORD_TIME_TO_LIVE = 60*20 # units are seconds, so clear the cache every 20 minutes. + @@RECORD_TIME_TO_LIVE = 60 * 20 # units are seconds, so clear the cache every 20 minutes. 
def initialize @cacheHash = {} @@ -622,7 +625,7 @@ def clean_cache() end end - nodes_to_remove.each {|node_name| + nodes_to_remove.each { |node_name| @cacheHash.delete(node_name) @timeAdded.delete(node_name) } @@ -630,7 +633,6 @@ def clean_cache() end end # NodeCache - @@cpuCache = NodeCache.new @@memCache = NodeCache.new diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 3f5f4f1cc..f979ef7c5 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -1,7 +1,7 @@ #!/usr/local/bin/ruby # frozen_string_literal: true -require 'fluent/plugin/input' +require "fluent/plugin/input" module Fluent::Plugin require_relative "podinventory_to_mdm" @@ -12,7 +12,6 @@ class Kube_PodInventory_Input < Input @@MDMKubePodInventoryTag = "mdm.kubepodinventory" @@hostName = (OMS::Common.get_hostname) - def initialize super require "yaml" @@ -35,9 +34,16 @@ def initialize @PODS_EMIT_STREAM_BATCH_SIZE = 0 @podCount = 0 + @containerCount = 0 @serviceCount = 0 @controllerSet = Set.new [] @winContainerCount = 0 + @windowsNodeCount = 0 + @winContainerInventoryTotalSizeBytes = 0 + @winContainerCountWithInventoryRecordSize64KBOrMore = 0 + @winContainerCountWithEnvVarSize64KBOrMore = 0 + @winContainerCountWithPortsSize64KBOrMore = 0 + @winContainerCountWithCommandSize64KBOrMore = 0 @controllerData = {} @podInventoryE2EProcessingLatencyMs = 0 @podsAPIE2ELatencyMs = 0 @@ -100,9 +106,16 @@ def enumerate(podList = nil) podInventory = podList telemetryFlush = false @podCount = 0 + @containerCount = 0 @serviceCount = 0 @controllerSet = Set.new [] @winContainerCount = 0 + @winContainerInventoryTotalSizeBytes = 0 + @winContainerCountWithInventoryRecordSize64KBOrMore = 0 + @winContainerCountWithEnvVarSize64KBOrMore = 0 + @winContainerCountWithPortsSize64KBOrMore = 0 + @winContainerCountWithCommandSize64KBOrMore = 0 + @windowsNodeCount = 0 @controllerData = {} currentTime = Time.now batchTime = currentTime.utc.iso8601 @@ -110,27 +123,27 @@ def enumerate(podList = nil) @podInventoryE2EProcessingLatencyMs = 0 podInventoryStartTime = (Time.now.to_f * 1000).to_i if ExtensionUtils.isAADMSIAuthMode() - $log.info("in_kube_podinventory::enumerate: AAD AUTH MSI MODE") - if @kubeperfTag.nil? || !@kubeperfTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) - @kubeperfTag = ExtensionUtils.getOutputStreamId(Constants::PERF_DATA_TYPE) - end - if @kubeservicesTag.nil? || !@kubeservicesTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) - @kubeservicesTag = ExtensionUtils.getOutputStreamId(Constants::KUBE_SERVICES_DATA_TYPE) - end - if @containerInventoryTag.nil? || !@containerInventoryTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) - @containerInventoryTag = ExtensionUtils.getOutputStreamId(Constants::CONTAINER_INVENTORY_DATA_TYPE) - end - if @insightsMetricsTag.nil? || !@insightsMetricsTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) - @insightsMetricsTag = ExtensionUtils.getOutputStreamId(Constants::INSIGHTS_METRICS_DATA_TYPE) - end - if @tag.nil? 
|| !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) - @tag = ExtensionUtils.getOutputStreamId(Constants::KUBE_POD_INVENTORY_DATA_TYPE) - end - $log.info("in_kube_podinventory::enumerate: using perf tag -#{@kubeperfTag} @ #{Time.now.utc.iso8601}") - $log.info("in_kube_podinventory::enumerate: using kubeservices tag -#{@kubeservicesTag} @ #{Time.now.utc.iso8601}") - $log.info("in_kube_podinventory::enumerate: using containerinventory tag -#{@containerInventoryTag} @ #{Time.now.utc.iso8601}") - $log.info("in_kube_podinventory::enumerate: using insightsmetrics tag -#{@insightsMetricsTag} @ #{Time.now.utc.iso8601}") - $log.info("in_kube_podinventory::enumerate: using kubepodinventory tag -#{@tag} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::enumerate: AAD AUTH MSI MODE") + if @kubeperfTag.nil? || !@kubeperfTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @kubeperfTag = ExtensionUtils.getOutputStreamId(Constants::PERF_DATA_TYPE) + end + if @kubeservicesTag.nil? || !@kubeservicesTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @kubeservicesTag = ExtensionUtils.getOutputStreamId(Constants::KUBE_SERVICES_DATA_TYPE) + end + if @containerInventoryTag.nil? || !@containerInventoryTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @containerInventoryTag = ExtensionUtils.getOutputStreamId(Constants::CONTAINER_INVENTORY_DATA_TYPE) + end + if @insightsMetricsTag.nil? || !@insightsMetricsTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @insightsMetricsTag = ExtensionUtils.getOutputStreamId(Constants::INSIGHTS_METRICS_DATA_TYPE) + end + if @tag.nil? || !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @tag = ExtensionUtils.getOutputStreamId(Constants::KUBE_POD_INVENTORY_DATA_TYPE) + end + $log.info("in_kube_podinventory::enumerate: using perf tag -#{@kubeperfTag} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::enumerate: using kubeservices tag -#{@kubeservicesTag} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::enumerate: using containerinventory tag -#{@containerInventoryTag} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::enumerate: using insightsmetrics tag -#{@insightsMetricsTag} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::enumerate: using kubepodinventory tag -#{@tag} @ #{Time.now.utc.iso8601}") end # Get services first so that we dont need to make a call for very chunk @@ -202,11 +215,24 @@ def enumerate(podList = nil) telemetryProperties["PODS_EMIT_STREAM_BATCH_SIZE"] = @PODS_EMIT_STREAM_BATCH_SIZE ApplicationInsightsUtility.sendCustomEvent("KubePodInventoryHeartBeatEvent", telemetryProperties) ApplicationInsightsUtility.sendMetricTelemetry("PodCount", @podCount, {}) + ApplicationInsightsUtility.sendMetricTelemetry("ContainerCount", @containerCount, {}) ApplicationInsightsUtility.sendMetricTelemetry("ServiceCount", @serviceCount, {}) telemetryProperties["ControllerData"] = @controllerData.to_json ApplicationInsightsUtility.sendMetricTelemetry("ControllerCount", @controllerSet.length, telemetryProperties) if @winContainerCount > 0 telemetryProperties["ClusterWideWindowsContainersCount"] = @winContainerCount + telemetryProperties["WindowsNodeCount"] = @windowsNodeCount + telemetryProperties["ClusterWideWindowsContainerInventoryTotalSizeKB"] = @winContainerInventoryTotalSizeBytes / 1024 + telemetryProperties["WindowsContainerCountWithInventoryRecordSize64KBorMore"] = 
@winContainerCountWithInventoryRecordSize64KBOrMore + if @winContainerCountWithEnvVarSize64KBOrMore > 0 + telemetryProperties["WinContainerCountWithEnvVarSize64KBOrMore"] = @winContainerCountWithEnvVarSize64KBOrMore + end + if @winContainerCountWithPortsSize64KBOrMore > 0 + telemetryProperties["WinContainerCountWithPortsSize64KBOrMore"] = @winContainerCountWithPortsSize64KBOrMore + end + if @winContainerCountWithCommandSize64KBOrMore > 0 + telemetryProperties["WinContainerCountWithCommandSize64KBOrMore"] = @winContainerCountWithCommandSize64KBOrMore + end ApplicationInsightsUtility.sendCustomEvent("WindowsContainerInventoryEvent", telemetryProperties) end ApplicationInsightsUtility.sendMetricTelemetry("PodInventoryE2EProcessingLatencyMs", @podInventoryE2EProcessingLatencyMs, telemetryProperties) @@ -236,6 +262,7 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc podInventory["items"].each do |item| #podInventory block start # pod inventory records podInventoryRecords = getPodInventoryRecords(item, serviceRecords, batchTime) + @containerCount += podInventoryRecords.length podInventoryRecords.each do |record| if !record.nil? eventStream.add(emitTime, record) if record @@ -249,6 +276,7 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc if !item["spec"]["nodeName"].nil? nodeName = item["spec"]["nodeName"] end + @windowsNodeCount = winNodes.length if (!nodeName.empty? && (winNodes.include? nodeName)) clusterCollectEnvironmentVar = ENV["AZMON_CLUSTER_COLLECT_ENV_VAR"] #Generate ContainerInventory records for windows nodes so that we can get image and image tag in property panel @@ -258,13 +286,27 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc containerInventoryRecords.each do |cirecord| if !cirecord.nil? containerInventoryStream.add(emitTime, cirecord) if cirecord + ciRecordSize = cirecord.to_s.length + @winContainerInventoryTotalSizeBytes += ciRecordSize + if ciRecordSize >= Constants::MAX_RECORD_OR_FIELD_SIZE_FOR_TELEMETRY + @winContainerCountWithInventoryRecordSize64KBOrMore += 1 + end + if !cirecord["EnvironmentVar"].nil? && !cirecord["EnvironmentVar"].empty? && cirecord["EnvironmentVar"].length >= Constants::MAX_RECORD_OR_FIELD_SIZE_FOR_TELEMETRY + @winContainerCountWithEnvVarSize64KBOrMore += 1 + end + if !cirecord["Ports"].nil? && !cirecord["Ports"].empty? && cirecord["Ports"].length >= Constants::MAX_RECORD_OR_FIELD_SIZE_FOR_TELEMETRY + @winContainerCountWithPortsSize64KBOrMore += 1 + end + if !cirecord["Command"].nil? && !cirecord["Command"].empty? && cirecord["Command"].length >= Constants::MAX_RECORD_OR_FIELD_SIZE_FOR_TELEMETRY + @winContainerCountWithCommandSize64KBOrMore += 1 + end end end end end if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && eventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE - $log.info("in_kube_podinventory::parse_and_emit_records: number of pod inventory records emitted #{@PODS_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::parse_and_emit_records: number of pod inventory records emitted #{eventStream.count} @ #{Time.now.utc.iso8601}") if (!@@istestvar.nil? && !@@istestvar.empty? 
@@ -284,7 +326,7 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc end if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && kubePerfEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE - $log.info("in_kube_podinventory::parse_and_emit_records: number of container perf records emitted #{@PODS_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::parse_and_emit_records: number of container perf records emitted #{kubePerfEventStream.count} @ #{Time.now.utc.iso8601}") router.emit_stream(@kubeperfTag, kubePerfEventStream) if kubePerfEventStream if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) $log.info("kubeContainerPerfEventEmitStreamSuccess @ #{Time.now.utc.iso8601}") @@ -303,7 +345,7 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc end if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && insightsMetricsEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE - $log.info("in_kube_podinventory::parse_and_emit_records: number of GPU insights metrics records emitted #{@PODS_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::parse_and_emit_records: number of GPU insights metrics records emitted #{insightsMetricsEventStream.count} @ #{Time.now.utc.iso8601}") if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) $log.info("kubePodInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") end @@ -368,7 +410,7 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc kubeServiceRecord["ClusterName"] = KubernetesApiClient.getClusterName kubeServicesEventStream.add(emitTime, kubeServiceRecord) if kubeServiceRecord if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && kubeServicesEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE - $log.info("in_kube_podinventory::parse_and_emit_records: number of service records emitted #{@PODS_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::parse_and_emit_records: number of service records emitted #{kubeServicesEventStream.count} @ #{Time.now.utc.iso8601}") router.emit_stream(@kubeservicesTag, kubeServicesEventStream) if kubeServicesEventStream kubeServicesEventStream = Fluent::MultiEventStream.new if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0)
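The four log-message fixes in this patch correct the same slip: the message reported the configured batch size, @PODS_EMIT_STREAM_BATCH_SIZE, instead of the number of records actually sitting in the stream when it is flushed. A minimal sketch of that flush-on-batch-size pattern; FakeStream is a hypothetical stand-in for Fluent::MultiEventStream, and the batch size, records, and remainder flush are assumed values, not a claim about the plugin's exact flow:

```ruby
# Sketch of the batch-flush pattern behind the corrected log messages.
# FakeStream stands in for Fluent::MultiEventStream; the batch size,
# records, and remainder flush are assumptions for illustration.
class FakeStream
  attr_reader :count

  def initialize
    @count = 0
  end

  def add(_time, _record)
    @count += 1
  end
end

batch_size = 3 # plays the role of @PODS_EMIT_STREAM_BATCH_SIZE
stream = FakeStream.new
emitted_counts = []

flush = lambda do
  # Report the stream's actual size (the fix), not the configured batch size.
  emitted_counts << stream.count
  stream = FakeStream.new # start a fresh stream after emitting
end

7.times do |i|
  stream.add(Time.now, { "PodUid" => i })
  flush.call if batch_size > 0 && stream.count >= batch_size
end
flush.call if stream.count > 0 # remainder flush

p emitted_counts # => [3, 3, 1]
```

The last flush can carry fewer records than the batch size, which is why logging the actual stream count is the more truthful message.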
From bfc41a4f12a5972dc0a74dfabee22c561dadd71f Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Fri, 28 Jan 2022 17:33:56 -0800 Subject: [PATCH 191/194] Gangams/jan 2022 release tasks 2 (#701) * mdsd proc cpu and memory telemetry * write ai logs to file and telemetry for mdsd proc * write ai logs to file and telemetry for mdsd proc * write ai logs to file and telemetry for mdsd proc * fix pr feedback * use name_prefix * remove mdsd telemetry changes * remove mdsd telemetry changes * remove mdsd telemetry changes --- .../conf/telegraf-prom-side-car.conf | 53 ++++++++++++++++++- build/linux/installer/conf/telegraf-rs.conf | 52 ++++++++++++++++-- build/linux/installer/conf/telegraf.conf | 38 ++++++++++--- .../installer/datafiles/base_container.data | 4 +- .../ruby/ApplicationInsightsUtility.rb | 15 ++++-- .../channel/asynchronous_sender.rb | 13 ++--- .../channel/sender_base.rb | 31 ++++++----- .../channel/synchronous_sender.rb | 5 +- 8 files changed, 173 insertions(+), 38 deletions(-) diff --git a/build/linux/installer/conf/telegraf-prom-side-car.conf b/build/linux/installer/conf/telegraf-prom-side-car.conf index 1b6bab9f9..a94150fad 100644 --- a/build/linux/installer/conf/telegraf-prom-side-car.conf +++ b/build/linux/installer/conf/telegraf-prom-side-car.conf @@ -111,6 +111,26 @@ data_format = "json" namedrop = ["agent_telemetry", "file"] +# [[outputs.application_insights]] +# ## Instrumentation key of the Application Insights resource. +# instrumentation_key = "$TELEMETRY_APPLICATIONINSIGHTS_KEY" + +# ## Timeout for closing (default: 5s). +# # timeout = "5s" + +# ## Enable additional diagnostic logging. +# enable_diagnostic_logging = false + +# ## Context Tag Sources add Application Insights context tags to a tag value.
+# ## +# ## For list of allowed context tag keys see: +# ## https://github.com/Microsoft/ApplicationInsights-Go/blob/master/appinsights/contracts/contexttagkeys.go +# # [outputs.application_insights.context_tag_sources] +# # "ai.cloud.role" = "kubernetes_container_name" +# # "ai.cloud.roleInstance" = "kubernetes_pod_name" +# namepass = ["t.azm.ms/agent_telemetry"] + #tagdrop = ["nodeName"] + ############################################################################### # PROCESSOR PLUGINS # ############################################################################### @@ -119,9 +139,23 @@ [processors.converter.fields] float = ["*"] +############################################################################### +# AGGREGATOR PLUGINS # +############################################################################### +# [[aggregators.quantile]] +# period = "30m" +# drop_original = true +# quantiles = [0.95] +# algorithm = "t-digest" +# compression = 100.0 +# namepass = ["agent_telemetry"] + +############################################################################### +# INPUT PLUGINS # +############################################################################### # Dummy plugin to test out toml parsing happens properly [[inputs.file]] - interval = "24h" + interval = "24h" files = ["test.json"] data_format = "json" @@ -166,3 +200,20 @@ $AZMON_TELEGRAF_CUSTOM_PROM_PLUGINS_WITH_NAMESPACE_FILTER ## OSM Prometheus configuration $AZMON_TELEGRAF_OSM_PROM_PLUGINS + +# [[inputs.procstat]] +# name_prefix="t.azm.ms/" +# exe = "mdsd" +# interval = "60s" +# pid_finder = "native" +# pid_tag = true +# name_override = "agent_telemetry" +# fieldpass = ["cpu_usage", "memory_rss"] +# [inputs.procstat.tags] +# Computer = "$NODE_NAME" +# AgentVersion = "$AGENT_VERSION" +# ControllerType = "$CONTROLLER_TYPE" +# AKS_RESOURCE_ID = "$TELEMETRY_AKS_RESOURCE_ID" +# ACSResourceName = "$TELEMETRY_ACS_RESOURCE_NAME" +# Region = "$TELEMETRY_AKS_REGION" +# ContainerType = "$CONTAINER_TYPE" diff --git a/build/linux/installer/conf/telegraf-rs.conf b/build/linux/installer/conf/telegraf-rs.conf index 5de35d82c..72fc25451 100644 --- a/build/linux/installer/conf/telegraf-rs.conf +++ b/build/linux/installer/conf/telegraf-rs.conf @@ -124,6 +124,26 @@ namedrop = ["agent_telemetry", "file"] #tagdrop = ["AgentVersion","AKS_RESOURCE_ID", "ACS_RESOURCE_NAME", "Region","ClusterName","ClusterType", "Computer", "ControllerType"] +# [[outputs.application_insights]] +# ## Instrumentation key of the Application Insights resource. +# instrumentation_key = "$TELEMETRY_APPLICATIONINSIGHTS_KEY" + +# ## Timeout for closing (default: 5s). +# # timeout = "5s" + +# ## Enable additional diagnostic logging. +# enable_diagnostic_logging = false + +# ## Context Tag Sources add Application Insights context tags to a tag value. 
+# ## +# ## For list of allowed context tag keys see: +# ## https://github.com/Microsoft/ApplicationInsights-Go/blob/master/appinsights/contracts/contexttagkeys.go +# # [outputs.application_insights.context_tag_sources] +# # "ai.cloud.role" = "kubernetes_container_name" +# # "ai.cloud.roleInstance" = "kubernetes_pod_name" +# namepass = ["t.azm.ms/agent_telemetry"] + #tagdrop = ["nodeName"] + ############################################################################### # PROCESSOR PLUGINS # ############################################################################### @@ -293,6 +313,13 @@ ############################################################################### # AGGREGATOR PLUGINS # ############################################################################### +# [[aggregators.quantile]] +# period = "30m" +# drop_original = true +# quantiles = [0.95] +# algorithm = "t-digest" +# compression = 100.0 +# namepass = ["agent_telemetry"] # # Keep the aggregate basicstats of each metric passing through. # [[aggregators.basicstats]] @@ -369,7 +396,7 @@ # report_active = true # fieldpass = ["usage_active","cluster","node","host","device"] # taginclude = ["cluster","cpu","node"] - + # Read metrics about disk usage by mount point @@ -377,7 +404,7 @@ ## By default stats will be gathered for all mount points. ## Set mount_points will restrict the stats to only the specified mount points. # mount_points = ["/"] - + ## Ignore mount points by filesystem type. # ignore_fs = ["tmpfs", "devtmpfs", "devfs", "overlay", "aufs", "squashfs"] # fieldpass = ["free", "used", "used_percent"] @@ -520,7 +547,7 @@ # Dummy plugin to test out toml parsing happens properly [[inputs.file]] - interval = "24h" + interval = "24h" files = ["test.json"] data_format = "json" @@ -530,10 +557,10 @@ ## An array of urls to scrape metrics from. urls = $AZMON_TELEGRAF_CUSTOM_PROM_URLS - + ## An array of Kubernetes services to scrape metrics from. kubernetes_services = $AZMON_TELEGRAF_CUSTOM_PROM_K8S_SERVICES - + ## Scrape Kubernetes pods for the following prometheus annotations: ## - prometheus.io/scrape: Enable scraping for this pod ## - prometheus.io/scheme: If the metrics endpoint is secured then you will need to @@ -648,3 +675,18 @@ $AZMON_TELEGRAF_OSM_PROM_PLUGINS #[inputs.prometheus.tagpass] # operation_type = ["create_container", "remove_container", "pull_image"] +# [[inputs.procstat]] +# name_prefix="t.azm.ms/" +# exe = "mdsd" +# interval = "60s" +# pid_finder = "native" +# pid_tag = true +# name_override = "agent_telemetry" +# fieldpass = ["cpu_usage", "memory_rss"] +# [inputs.procstat.tags] +# Computer = "$NODE_NAME" +# AgentVersion = "$AGENT_VERSION" +# ControllerType = "$CONTROLLER_TYPE" +# AKS_RESOURCE_ID = "$TELEMETRY_AKS_RESOURCE_ID" +# ACSResourceName = "$TELEMETRY_ACS_RESOURCE_NAME" +# Region = "$TELEMETRY_AKS_REGION" diff --git a/build/linux/installer/conf/telegraf.conf b/build/linux/installer/conf/telegraf.conf index b0a8730c6..9f213e3e8 100644 --- a/build/linux/installer/conf/telegraf.conf +++ b/build/linux/installer/conf/telegraf.conf @@ -158,6 +158,26 @@ namepass = ["container.azm.ms/disk"] #fieldpass = ["used_percent"] +# [[outputs.application_insights]] +# ## Instrumentation key of the Application Insights resource. +# instrumentation_key = "$TELEMETRY_APPLICATIONINSIGHTS_KEY" + +# ## Timeout for closing (default: 5s). +# # timeout = "5s" + +# ## Enable additional diagnostic logging. +# enable_diagnostic_logging = false + + ## Context Tag Sources add Application Insights context tags to a tag value. 
+ ## + ## For list of allowed context tag keys see: + ## https://github.com/Microsoft/ApplicationInsights-Go/blob/master/appinsights/contracts/contexttagkeys.go + # [outputs.application_insights.context_tag_sources] + # "ai.cloud.role" = "kubernetes_container_name" + # "ai.cloud.roleInstance" = "kubernetes_pod_name" + # namepass = ["agent_telemetry"] + #tagdrop = ["nodeName"] + ############################################################################### # PROCESSOR PLUGINS # ############################################################################### @@ -328,7 +348,13 @@ ############################################################################### # AGGREGATOR PLUGINS # ############################################################################### - +# [[aggregators.quantile]] +# period = "30m" +# drop_original = true +# quantiles = [0.95] +# algorithm = "t-digest" +# compression = 100.0 +# namepass = ["t.azm.ms/agent_telemetry"] # # Keep the aggregate basicstats of each metric passing through. # [[aggregators.basicstats]] # ## General Aggregator Arguments: @@ -407,7 +433,7 @@ # Dummy plugin to test out toml parsing happens properly [[inputs.file]] - interval = "24h" + interval = "24h" files = ["test.json"] data_format = "json" @@ -550,14 +576,14 @@ #fieldpass = ["numContainers", "numContainersRunning", "numContainersStopped", "numContainersPaused", "numContainerImages"] # taginclude = ["nodeName"] -#[[inputs.procstat]] -# #name_prefix="t.azm.ms/" +# [[inputs.procstat]] +# name_prefix="t.azm.ms/" # exe = "mdsd" -# interval = "10s" +# interval = "60s" # pid_finder = "native" # pid_tag = true # name_override = "agent_telemetry" -# fieldpass = ["cpu_usage", "memory_rss", "memory_swap", "memory_vms", "memory_stack"] +# fieldpass = ["cpu_usage", "memory_rss"] # [inputs.procstat.tags] # Computer = "$NODE_NAME" # AgentVersion = "$AGENT_VERSION" diff --git a/build/linux/installer/datafiles/base_container.data b/build/linux/installer/datafiles/base_container.data index 985c73a17..9fc7ce08f 100644 --- a/build/linux/installer/datafiles/base_container.data +++ b/build/linux/installer/datafiles/base_container.data @@ -286,6 +286,8 @@ chmod 666 /var/opt/microsoft/docker-cimprov/log/fluent_forward_failed.log touch /var/opt/microsoft/docker-cimprov/log/arc_k8s_cluster_identity.log chmod 666 /var/opt/microsoft/docker-cimprov/log/arc_k8s_cluster_identity.log +touch /var/opt/microsoft/docker-cimprov/log/appinsights_error.log +chmod 666 /var/opt/microsoft/docker-cimprov/log/appinsights_error.log touch /var/opt/microsoft/docker-cimprov/log/fluentd.log chmod 666 /var/opt/microsoft/docker-cimprov/log/fluentd.log @@ -309,7 +311,7 @@ if ${{PERFORMING_UPGRADE_NOT}}; then rmdir /etc/opt/microsoft/docker-cimprov/conf 2> /dev/null rmdir /etc/opt/microsoft/docker-cimprov 2> /dev/null rmdir /etc/opt/microsoft 2> /dev/null - rmdir /etc/opt 2> /dev/null + rmdir /etc/opt 2> /dev/null fi %Preinstall_0 diff --git a/source/plugins/ruby/ApplicationInsightsUtility.rb b/source/plugins/ruby/ApplicationInsightsUtility.rb index 7691304a6..eb143c4ba 100644 --- a/source/plugins/ruby/ApplicationInsightsUtility.rb +++ b/source/plugins/ruby/ApplicationInsightsUtility.rb @@ -98,6 +98,13 @@ def initializeUtility() elsif !encodedAppInsightsKey.nil? 
decodedAppInsightsKey = Base64.decode64(encodedAppInsightsKey) + if @@isWindows + logPath = "/etc/omsagentwindows/appinsights_error.log" + else + logPath = "/var/opt/microsoft/docker-cimprov/log/appinsights_error.log" + end + aiLogger = Logger.new(logPath, 1, 2 * 1024 * 1024) + #override AI endpoint if it's available, otherwise use default. if appInsightsEndpoint && !appInsightsEndpoint.nil? && !appInsightsEndpoint.empty? $log.info("AppInsightsUtility: Telemetry client uses overridden endpoint url : #{appInsightsEndpoint}") @@ -105,20 +112,20 @@ def initializeUtility() #telemetrySynchronousQueue = ApplicationInsights::Channel::SynchronousQueue.new(telemetrySynchronousSender) #telemetryChannel = ApplicationInsights::Channel::TelemetryChannel.new nil, telemetrySynchronousQueue if !isProxyConfigured - sender = ApplicationInsights::Channel::AsynchronousSender.new appInsightsEndpoint + sender = ApplicationInsights::Channel::AsynchronousSender.new appInsightsEndpoint, aiLogger else $log.info("AppInsightsUtility: Telemetry client uses provided proxy configuration since proxy configured") - sender = ApplicationInsights::Channel::AsynchronousSender.new appInsightsEndpoint, @@proxy + sender = ApplicationInsights::Channel::AsynchronousSender.new appInsightsEndpoint, aiLogger, @@proxy end queue = ApplicationInsights::Channel::AsynchronousQueue.new sender channel = ApplicationInsights::Channel::TelemetryChannel.new nil, queue @@Tc = ApplicationInsights::TelemetryClient.new decodedAppInsightsKey, channel else if !isProxyConfigured - sender = ApplicationInsights::Channel::AsynchronousSender.new + sender = ApplicationInsights::Channel::AsynchronousSender.new nil, aiLogger else $log.info("AppInsightsUtility: Telemetry client uses provided proxy configuration since proxy configured") - sender = ApplicationInsights::Channel::AsynchronousSender.new nil, @@proxy + sender = ApplicationInsights::Channel::AsynchronousSender.new nil, aiLogger, @@proxy end queue = ApplicationInsights::Channel::AsynchronousQueue.new sender channel = ApplicationInsights::Channel::TelemetryChannel.new nil, queue diff --git a/source/plugins/ruby/lib/application_insights/channel/asynchronous_sender.rb b/source/plugins/ruby/lib/application_insights/channel/asynchronous_sender.rb index 4786aa1d9..df2138b3a 100644 --- a/source/plugins/ruby/lib/application_insights/channel/asynchronous_sender.rb +++ b/source/plugins/ruby/lib/application_insights/channel/asynchronous_sender.rb @@ -1,5 +1,5 @@ -require_relative 'sender_base' -require 'thread' +require_relative "sender_base" +require "thread" module ApplicationInsights module Channel @@ -17,12 +17,13 @@ module Channel # If no queue items are found for {#send_time} seconds, the worker thread # will shut down (and {#start} will need to be called again). class AsynchronousSender < SenderBase - SERVICE_ENDPOINT_URI = 'https://dc.services.visualstudio.com/v2/track' + SERVICE_ENDPOINT_URI = "https://dc.services.visualstudio.com/v2/track" # Initializes a new instance of the class. # @param [String] service_endpoint_uri the address of the service to send + # @param [Logger] logger the logger instance used to write the logs (optional) # @param [Hash] proxy server configuration to send (optional) # telemetry data to. - def initialize(service_endpoint_uri = SERVICE_ENDPOINT_URI, proxy = {}) + def initialize(service_endpoint_uri = SERVICE_ENDPOINT_URI, logger = nil, proxy = {}) # callers that require a proxy don't need to maintain the service endpoint uri, which can potentially change if service_endpoint_uri.nil?
|| service_endpoint_uri.empty? service_endpoint_uri = SERVICE_ENDPOINT_URI @@ -33,7 +34,7 @@ def initialize(service_endpoint_uri = SERVICE_ENDPOINT_URI, proxy = {}) @lock_work_thread = Mutex.new @work_thread = nil @start_notification_processed = true - super service_endpoint_uri, proxy + super service_endpoint_uri, logger, proxy end # The time span in seconds at which the worker thread will check the @@ -130,7 +131,7 @@ def run rescue Exception => e # Make sure work_thread is set to nil when it terminates abnormally @work_thread = nil - @logger.error('application_insights') { "Asynchronous sender work thread terminated abnormally: #{e.to_s}" } + @logger.error("application_insights") { "Asynchronous sender work thread terminated abnormally: #{e.to_s}" } end end end diff --git a/source/plugins/ruby/lib/application_insights/channel/sender_base.rb b/source/plugins/ruby/lib/application_insights/channel/sender_base.rb index bedbae4ee..a6b7966db 100644 --- a/source/plugins/ruby/lib/application_insights/channel/sender_base.rb +++ b/source/plugins/ruby/lib/application_insights/channel/sender_base.rb @@ -1,9 +1,9 @@ -require 'yajl/json_gem' -require 'net/http' -require 'openssl' -require 'stringio' -require 'zlib' -require 'logger' +require "yajl/json_gem" +require "net/http" +require "openssl" +require "stringio" +require "zlib" +require "logger" module ApplicationInsights module Channel @@ -16,13 +16,18 @@ module Channel class SenderBase # Initializes a new instance of the class. # @param [String] service_endpoint_uri the address of the service to send + # @param [Logger] logger the logger instance used to write the logs + # @param [Hash] proxy server configuration to send (optional) # telemetry data to. - def initialize(service_endpoint_uri, proxy = {}) + def initialize(service_endpoint_uri, logger, proxy = {}) @service_endpoint_uri = service_endpoint_uri @queue = nil @send_buffer_size = 100 - @logger = Logger.new(STDOUT) + if !logger.nil? && !logger.empty? + @logger = logger + else + @logger = Logger.new(STDOUT) + end @proxy = proxy end @@ -53,9 +58,9 @@ def initialize(service_endpoint_uri, proxy = {}) def send(data_to_send) uri = URI(@service_endpoint_uri) headers = { - 'Accept' => 'application/json', - 'Content-Type' => 'application/json; charset=utf-8', - 'Content-Encoding' => 'gzip' + "Accept" => "application/json", + "Content-Type" => "application/json; charset=utf-8", + "Content-Encoding" => "gzip", } request = Net::HTTP::Post.new(uri.path, headers) @@ -69,7 +74,7 @@ def send(data_to_send) else http = Net::HTTP.new(uri.hostname, uri.port, @proxy[:addr], @proxy[:port], @proxy[:user], @proxy[:pass]) end - if uri.scheme.downcase == 'https' + if uri.scheme.downcase == "https" http.use_ssl = true http.verify_mode = OpenSSL::SSL::VERIFY_PEER end @@ -78,7 +83,7 @@ def send(data_to_send) http.finish if http.started? if !response.kind_of?
Net::HTTPSuccess - @logger.warn('application_insights') { "Failed to send data: #{response.message}" } + @logger.warn("application_insights") { "Failed to send data: #{response.message}" } end end diff --git a/source/plugins/ruby/lib/application_insights/channel/synchronous_sender.rb b/source/plugins/ruby/lib/application_insights/channel/synchronous_sender.rb index 597e97b9e..2bb212026 100644 --- a/source/plugins/ruby/lib/application_insights/channel/synchronous_sender.rb +++ b/source/plugins/ruby/lib/application_insights/channel/synchronous_sender.rb @@ -8,14 +8,15 @@ class SynchronousSender < SenderBase SERVICE_ENDPOINT_URI = "https://dc.services.visualstudio.com/v2/track" # Initializes a new instance of the class. # @param [String] service_endpoint_uri the address of the service to send + # @param [Logger] logger the logger instance used to write the logs (optional) # @param [Hash] proxy server configuration to send (optional) # telemetry data to. - def initialize(service_endpoint_uri = SERVICE_ENDPOINT_URI, proxy = {}) + def initialize(service_endpoint_uri = SERVICE_ENDPOINT_URI, logger = nil, proxy = {}) # callers that require a proxy don't need to maintain the service endpoint uri, which can potentially change if service_endpoint_uri.nil? || service_endpoint_uri.empty? service_endpoint_uri = SERVICE_ENDPOINT_URI end - super service_endpoint_uri, proxy + super service_endpoint_uri, logger, proxy end end end From ec2b09f72843a65e5fc08ba3f1a42e4860ac46a7 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 31 Jan 2022 13:57:35 -0800 Subject: [PATCH 192/194] release updates for ciprod01312022 & win-ciprod01312022 release (#707) * release updates for ciprod01312022 release * release updates for ciprod01312022 release * fix pr feedback --- ReleaseNotes.md | 35 ++++++++++++++++++++++ charts/azuremonitor-containers/values.yaml | 4 +-- kubernetes/linux/Dockerfile | 2 +- kubernetes/omsagent.yaml | 9 +++--- kubernetes/windows/Dockerfile | 2 +- 5 files changed, 43 insertions(+), 9 deletions(-) diff --git a/ReleaseNotes.md b/ReleaseNotes.md index c8a147044..67f144608 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -11,6 +11,41 @@ additional questions or comments.
Note : The agent version(s) below have dates (ciprod), which indicate the agent build dates (not release dates) +### 1/31/2022 - +##### Version microsoft/oms:ciprod01312022 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod01312022 (linux) +##### Version microsoft/oms:win-ciprod01312022 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod01312022 (windows) +##### Code change log +- Linux Agent + - Configurable DB name via configmap for ADX (default DB name: containerinsights) + - Default cAdvisor port to 10250 and container runtime to Containerd + - Update AgentVersion annotation in yamls (omsagent and chart) with released MDSD agent version + - Increase Windows agent CPU limits from 200m to 500m + - Ignore new disk path that comes from containerd starting with k8s version >= 1.19.x, which was adding unnecessary InsightsMetrics logs and increasing cost + - Route the AI SDK logs to a log file instead of stdout + - Telemetry to collect ContainerLog records with empty Timestamp + - FluentBit version upgrade from 1.6.8 to 1.7.8 +- Windows Agent + - Update to use FluentBit for container log collection and remove the FluentD dependency for container log collection + - Telemetry to track if any of the variable fields of Windows container inventory records have field size >= 64KB + - Add Windows OS check in in_cadvisor_perf plugin to avoid making calls to MDSD in MSI auth mode + - Bug fix for placeholder_hostname in telegraf metrics + - FluentBit version upgrade from 1.4.0 to 1.7.8 +- Common + - Upgrade FluentD gem version from 1.12.2 to 1.14.2 + - Upgrade Telegraf version from 1.18.0 to 1.20.3 + - Fix for exception in node allocatable + - Telemetry to track nodeCount & containerCount +- Other changes + - Updates to Arc K8s Extension ARM Onboarding templates with GA API version + - Added ARM Templates for MSI Based Onboarding for AKS + - Conformance test updates related to the sidecar container + - Troubleshooting script to detect issues related to Arc K8s Extension onboarding + - Remove the SP dependency for CDPX since it is configured to use MSI + - Linux Agent Image build improvements + - Update msys2 version to fix the Windows agent build + - Add explicit exit code 1 across all the PS scripts + + ### 10/13/2021 - ##### Version microsoft/oms:ciprod10132021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10132021 (linux) ##### Version microsoft/oms:win-ciprod10132021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod10132021 (windows) diff --git a/charts/azuremonitor-containers/values.yaml b/charts/azuremonitor-containers/values.yaml index d5d7ad2e1..0456eb625 100644 --- a/charts/azuremonitor-containers/values.yaml +++ b/charts/azuremonitor-containers/values.yaml @@ -21,8 +21,8 @@ Azure: omsagent: image: repo: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod" - tag: "ciprod10132021" - tagWindows: "win-ciprod10132021" + tag: "ciprod01312022" + tagWindows: "win-ciprod01312022" pullPolicy: IfNotPresent dockerProviderVersion: "16.0.0-0" agentVersion: "1.10.0.1" diff --git a/kubernetes/linux/Dockerfile b/kubernetes/linux/Dockerfile index 9164abc9c..f3a9efd7a 100644 --- a/kubernetes/linux/Dockerfile +++ b/kubernetes/linux/Dockerfile @@ -17,7 +17,7 @@ ENV RUBY_GC_HEAP_OLDOBJECT_LIMIT_FACTOR 0.9 RUN /usr/bin/apt-get update && /usr/bin/apt-get install -y libc-bin wget openssl curl sudo python-ctypes init-system-helpers net-tools rsyslog cron vim dmidecode apt-transport-https gnupg && rm -rf /var/lib/apt/lists/* COPY setup.sh
main.sh defaultpromenvvariables defaultpromenvvariables-rs defaultpromenvvariables-sidecar mdsd.xml envmdsd logrotate.conf $tmpdir/ -ARG IMAGE_TAG=ciprod10132021 +ARG IMAGE_TAG=ciprod01312022 ENV AGENT_VERSION ${IMAGE_TAG} WORKDIR ${tmpdir} diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 248276a08..28c8803c6 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -368,7 +368,7 @@ spec: value: "3" containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10132021" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod01312022" imagePullPolicy: IfNotPresent resources: limits: @@ -454,7 +454,7 @@ spec: timeoutSeconds: 15 #Only in sidecar scraping mode # - name: omsagent-prometheus - # image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10132021" + # image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod01312022" # imagePullPolicy: IfNotPresent # resources: # limits: @@ -603,7 +603,7 @@ spec: serviceAccountName: omsagent containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10132021" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod01312022" imagePullPolicy: IfNotPresent resources: limits: @@ -776,7 +776,7 @@ spec: value: "3" containers: - name: omsagent-win - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod10132021" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod01312022" imagePullPolicy: IfNotPresent resources: limits: @@ -933,4 +933,3 @@ spec: names: plural: healthstates kind: HealthState - \ No newline at end of file diff --git a/kubernetes/windows/Dockerfile b/kubernetes/windows/Dockerfile index 55bedf7f5..6a2785e69 100644 --- a/kubernetes/windows/Dockerfile +++ b/kubernetes/windows/Dockerfile @@ -3,7 +3,7 @@ MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="Azure Monitor for containers" -ARG IMAGE_TAG=win-ciprod10132021 +ARG IMAGE_TAG=win-ciprod01312022 # Do not split this into multiple RUN! 
# Docker creates a layer for every RUN-Statement From 9a292c0bee48c6378a11d4b14f93fda3188d7562 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 31 Jan 2022 15:00:37 -0800 Subject: [PATCH 193/194] fix merge issue --- source/plugins/go/src/oms.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/source/plugins/go/src/oms.go b/source/plugins/go/src/oms.go index 688c48d5e..407ab3611 100644 --- a/source/plugins/go/src/oms.go +++ b/source/plugins/go/src/oms.go @@ -23,8 +23,6 @@ import ( "Docker-Provider/source/plugins/go/src/extension" - "Docker-Provider/source/plugins/go/src/extension" - lumberjack "gopkg.in/natefinch/lumberjack.v2" "github.com/Azure/azure-kusto-go/kusto/ingest" From 04ebd944abc58ac3332e06df6140cbe1212063d3 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 31 Jan 2022 16:15:03 -0800 Subject: [PATCH 194/194] fix logger exception --- .../ruby/lib/application_insights/channel/sender_base.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/plugins/ruby/lib/application_insights/channel/sender_base.rb b/source/plugins/ruby/lib/application_insights/channel/sender_base.rb index a6b7966db..e5a4dea62 100644 --- a/source/plugins/ruby/lib/application_insights/channel/sender_base.rb +++ b/source/plugins/ruby/lib/application_insights/channel/sender_base.rb @@ -23,7 +23,7 @@ def initialize(service_endpoint_uri, logger, proxy = {}) @service_endpoint_uri = service_endpoint_uri @queue = nil @send_buffer_size = 100 - if !logger.nil? && !logger.empty? + if !logger.nil? @logger = logger else @logger = Logger.new(STDOUT)
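For context on this final fix: Ruby's Logger responds to nil? but not to empty?, so the original guard raised NoMethodError whenever a caller actually passed a logger in, which is exactly the case the logger plumbing from patch 191 introduced. A minimal sketch of the corrected fallback; the method name and file path below are illustrative, while the rotation arguments mirror the aiLogger created in ApplicationInsightsUtility above:

```ruby
require "logger"

# Corrected fallback, matching SenderBase#initialize after this patch:
# Logger instances do not implement #empty?, so the earlier
# `!logger.nil? && !logger.empty?` guard raised NoMethodError.
def pick_logger(logger = nil)
  if !logger.nil?
    logger # caller-supplied logger wins
  else
    Logger.new(STDOUT) # default, matching the pre-change behavior
  end
end

# Keep one rotated file, rolling over at 2 MB (same shape as aiLogger).
file_logger = Logger.new("/tmp/appinsights_error.log", 1, 2 * 1024 * 1024)
pick_logger(file_logger).info("goes to the file")
pick_logger.info("goes to stdout")
```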