diff --git a/.pipelines/build-linux.sh b/.pipelines/build-linux.sh index f4c92fda2..53f6a3a07 100644 --- a/.pipelines/build-linux.sh +++ b/.pipelines/build-linux.sh @@ -14,3 +14,8 @@ cd $DIR/../build/linux echo "----------- Build Docker Provider -------------------------------" make cd $DIR + +echo "------------ Bundle Shell Extension Scripts & HELM chart -------------------------" +cd $DIR/../deployment/arc-k8s-extension/ServiceGroupRoot/Scripts +tar -czvf ../artifacts.tar.gz ../../../../charts/azuremonitor-containers/ pushChartToAcr.sh + diff --git a/.pipelines/pipeline.user.linux.yml b/.pipelines/pipeline.user.linux.yml index 57273111e..565661d64 100644 --- a/.pipelines/pipeline.user.linux.yml +++ b/.pipelines/pipeline.user.linux.yml @@ -24,10 +24,15 @@ restore: build: commands: - - !!defaultcommand + - !!buildcommand name: 'Build Docker Provider Shell Bundle' command: '.pipelines/build-linux.sh' fail_on_stderr: false + artifacts: + - from: 'deployment' + to: 'build' + include: + - '**' package: commands: diff --git a/.pipelines/pull-from-cdpx-and-push-to-ci-acr-linux-image.sh b/.pipelines/pull-from-cdpx-and-push-to-ci-acr-linux-image.sh old mode 100755 new mode 100644 index 3844ea185..e7d26245f --- a/.pipelines/pull-from-cdpx-and-push-to-ci-acr-linux-image.sh +++ b/.pipelines/pull-from-cdpx-and-push-to-ci-acr-linux-image.sh @@ -35,12 +35,22 @@ echo "end: read appid and appsecret which has read access on cdpx acr" # suffix 00 primary and 01 secondary, and we only use primary # This configured via pipeline variable echo "login to cdpxlinux acr:${CDPX_ACR}" -docker login $CDPX_ACR --username $CDPX_ACR_APP_ID --password $CDPX_ACR_APP_SECRET -echo "login to cdpxlinux acr completed: ${CDPX_ACR}" +echo $CDPX_ACR_APP_SECRET | docker login $CDPX_ACR --username $CDPX_ACR_APP_ID --password-stdin +if [ $? -eq 0 ]; then + echo "login to cdpxlinux acr: ${CDPX_ACR} completed successfully." 
+else + echo "-e error login to cdpxlinux acr: ${CDPX_ACR} failed.Please see release task logs." + exit 1 +fi echo "pull agent image from cdpxlinux acr: ${CDPX_ACR}" docker pull ${CDPX_ACR}/official/${CDPX_REPO_NAME}:${CDPX_AGENT_IMAGE_TAG} -echo "pull image from cdpxlinux acr completed: ${CDPX_ACR}" +if [ $? -eq 0 ]; then + echo "pulling of agent image from cdpxlinux acr: ${CDPX_ACR} completed successfully." +else + echo "-e error pulling of agent image from cdpxlinux acr: ${CDPX_ACR} failed.Please see release task logs." + exit 1 +fi echo "CI Release name is:"$CI_RELEASE imagetag=$CI_RELEASE$CI_IMAGE_TAG_SUFFIX @@ -51,13 +61,29 @@ echo "CI AGENT REPOSITORY NAME : ${CI_AGENT_REPO}" echo "tag linux agent image" docker tag ${CDPX_ACR}/official/${CDPX_REPO_NAME}:${CDPX_AGENT_IMAGE_TAG} ${CI_ACR}/public/azuremonitor/containerinsights/${CI_AGENT_REPO}:${imagetag} +if [ $? -eq 0 ]; then + echo "tagging of linux agent image completed successfully." +else + echo "-e error tagging of linux agent image failed. Please see release task logs." + exit 1 +fi echo "login ciprod acr":$CI_ACR -docker login $CI_ACR --username $ACR_APP_ID --password $ACR_APP_SECRET -echo "login to ${CI_ACR} acr completed" +echo $ACR_APP_SECRET | docker login $CI_ACR --username $ACR_APP_ID --password-stdin +if [ $? -eq 0 ]; then + echo "login to ciprod acr: ${CI_ACR} completed successfully" +else + echo "-e error login to ciprod acr: ${CI_ACR} failed. Please see release task logs." + exit 1 +fi echo "pushing the image to ciprod acr:${CI_ACR}" docker push ${CI_ACR}/public/azuremonitor/containerinsights/${CI_AGENT_REPO}:${imagetag} -echo "pushing the image to ciprod acr completed" +if [ $? -eq 0 ]; then + echo "pushing of the image to ciprod acr completed successfully" +else + echo "-e error pushing of image to ciprod acr failed. Please see release task logs." 
+ exit 1 +fi echo "end: pull linux agent image from cdpx and push to ciprod acr" diff --git a/.pipelines/pull-from-cdpx-and-push-to-ci-acr-windows-image.sh b/.pipelines/pull-from-cdpx-and-push-to-ci-acr-windows-image.sh old mode 100755 new mode 100644 index 095a00039..19fe55722 --- a/.pipelines/pull-from-cdpx-and-push-to-ci-acr-windows-image.sh +++ b/.pipelines/pull-from-cdpx-and-push-to-ci-acr-windows-image.sh @@ -34,12 +34,22 @@ echo "end: read appid and appsecret which has read access on cdpx acr" # suffix 00 primary and 01 secondary, and we only use primary # This configured via pipeline variable echo "login to cdpxwindows acr:${CDPX_ACR}" -docker login $CDPX_ACR --username $CDPX_ACR_APP_ID --password $CDPX_ACR_APP_SECRET -echo "login to cdpxwindows acr:${CDPX_ACR} completed" +echo $CDPX_ACR_APP_SECRET | docker login $CDPX_ACR --username $CDPX_ACR_APP_ID --password-stdin +if [ $? -eq 0 ]; then + echo "login to cdpxwindows acr: ${CDPX_ACR} completed successfully." +else + echo "-e error login to cdpxwindows acr: ${CDPX_ACR} failed.Please see release task logs." + exit 1 +fi echo "pull image from cdpxwin acr: ${CDPX_ACR}" docker pull ${CDPX_ACR}/official/${CDPX_REPO_NAME}:${CDPX_AGENT_IMAGE_TAG} -echo "pull image from cdpxwin acr completed: ${CDPX_ACR}" +if [ $? -eq 0 ]; then + echo "pulling of image from cdpxwin acr: ${CDPX_ACR} completed successfully." +else + echo "pulling of image from cdpxwin acr: ${CDPX_ACR} failed. Please see release task logs." + exit 1 +fi echo "CI Release name:"$CI_RELEASE echo "CI Image Tax suffix:"$CI_IMAGE_TAG_SUFFIX @@ -49,13 +59,30 @@ echo "agentimagetag="$imagetag echo "tag windows agent image" docker tag ${CDPX_ACR}/official/${CDPX_REPO_NAME}:${CDPX_AGENT_IMAGE_TAG} ${CI_ACR}/public/azuremonitor/containerinsights/${CI_AGENT_REPO}:${imagetag} +if [ $? -eq 0 ]; then + echo "tagging of windows agent image completed successfully." +else + echo "-e error tagging of windows agent image failed. Please see release task logs." 
+ exit 1 +fi echo "login to ${CI_ACR} acr" -docker login $CI_ACR --username $ACR_APP_ID --password $ACR_APP_SECRET -echo "login to ${CI_ACR} acr completed" +echo $ACR_APP_SECRET | docker login $CI_ACR --username $ACR_APP_ID --password-stdin +if [ $? -eq 0 ]; then + echo "login to acr: ${CI_ACR} completed successfully." +else + echo "login to acr: ${CI_ACR} failed. Please see release task logs." + exit 1 +fi + echo "pushing the image to ciprod acr" docker push ${CI_ACR}/public/azuremonitor/containerinsights/${CI_AGENT_REPO}:${imagetag} -echo "pushing the image to ciprod acr completed" +if [ $? -eq 0 ]; then + echo "pushing the image to ciprod acr completed successfully." +else + echo "pushing the image to ciprod acr failed. Please see release task logs" + exit 1 +fi echo "end: pull windows agent image from cdpx and push to ciprod acr" diff --git a/Documentation/OSMPrivatePreview/Image1.jpg b/Documentation/OSMPrivatePreview/Image1.jpg new file mode 100644 index 000000000..04cd03ab1 Binary files /dev/null and b/Documentation/OSMPrivatePreview/Image1.jpg differ diff --git a/Documentation/OSMPrivatePreview/ReadMe.md b/Documentation/OSMPrivatePreview/ReadMe.md new file mode 100644 index 000000000..aa90c7413 --- /dev/null +++ b/Documentation/OSMPrivatePreview/ReadMe.md @@ -0,0 +1,71 @@ +Note - This is private preview. For any support issues, please reach out to us at [askcoin@microsoft.com](mailto:askcoin@microsoft.com). Please don't open a support ticket. + +# Azure Monitor Container Insights Open Service Mesh Monitoring + +Azure Monitor container insights now supporting preview of [Open Service Mesh(OSM)](https://docs.microsoft.com/azure/aks/servicemesh-osm-about) Monitoring. As part of this support, customer can: +1. Filter & view inventory of all the services that are part of your service mesh. +2. Visualize and monitor requests between services in your service mesh, with request latency, error rate & resource utilization by services. +3. 
Provides connection summary for OSM infrastructure running on AKS. + +## How to onboard Container Insights OSM monitoring? +OSM exposes Prometheus metrics which Container Insights can collect, for container insights agent to collect OSM metrics follow the following steps. + +1. Follow this [link](https://docs.microsoft.com/en-us/azure/aks/servicemesh-osm-about?pivots=client-operating-system-linux#register-the-aks-openservicemesh-preview-feature) as a prereq before enabling the addon. + +2. Enable AKS OSM addon on your + - [New AKS cluster](https://docs.microsoft.com/en-us/azure/aks/servicemesh-osm-about?pivots=client-operating-system-linux#install-open-service-mesh-osm-azure-kubernetes-service-aks-add-on-for-a-new-aks-cluster) + - [Existing AKS cluster](https://docs.microsoft.com/en-us/azure/aks/servicemesh-osm-about?pivots=client-operating-system-linux#enable-open-service-mesh-osm-azure-kubernetes-service-aks-add-on-for-an-existing-aks-cluster) +2. Configure OSM to allow Prometheus scraping, follow steps from [here](https://docs.microsoft.com/en-us/azure/aks/servicemesh-osm-about?pivots=client-operating-system-linux#configure-osm-to-allow-prometheus-scraping) +3. To enable namespace(s), download the osm client library [here](https://docs.microsoft.com/en-us/azure/aks/servicemesh-osm-about?pivots=client-operating-system-linux#osm-service-quotas-and-limits-preview) & then enable metrics on namespaces +```bash +# With osm +osm metrics enable --namespace test +osm metrics enable --namespace "test1, test2" + +``` +3. 
If you are using Azure Monitor Container Insights follow steps below, if not on-board [here.](https://docs.microsoft.com/azure/azure-monitor/containers/container-insights-overview) + * Download the configmap from [here](https://github.com/microsoft/Docker-Provider/blob/ci_prod/kubernetes/container-azm-ms-osmconfig.yaml) + * Add the namespaces you want to monitor in configmap `monitor_namespaces = ["namespace1", "namespace2"]` + * Run the following kubectl command: kubectl apply -f + * Example: `kubectl apply -f container-azm-ms-agentconfig.yaml` +4. The configuration change can take upto 15 mins to finish before taking effect, and all omsagent pods in the cluster will restart. The restart is a rolling restart for all omsagent pods, not all restart at the same time. + + +## Validate the metrics flow +1. Query cluster's Log Analytics workspace InsightsMetrics table to see metrics are flowing or not +``` +InsightsMetrics +| where Name contains "envoy" +| summarize count() by Name +``` + +## How to consume OSM monitoring dashboard? +1. Access your AKS cluster & Container Insights through this [link.](https://aka.ms/azmon/osmux) +2. Go to reports tab and access Open Service Mesh (OSM) workbook. +3. Select the time-range & namespace to scope your services. By default, we only show services deployed by customers and we exclude internal service communication. In case you want to view that you select Show All in the filter. Please note OSM is managed service mesh, we show all internal connections for transparency. + +![alt text](https://github.com/microsoft/Docker-Provider/blob/saarorOSMdoc/Documentation/OSMPrivatePreview/Image1.jpg) +### Requests Tab +1. This tab provides you the summary of all the http requests sent via service to service in OSM. +2. You can view all the services and all the services it is communicating to by selecting the service in grid. +3. You can view total requests, request error rate & P90 latency. +4. 
You can drill-down to destination and view trends for HTTP error/success code, success rate, Pods resource utilization, latencies at different percentiles. + +### Connections Tab +1. This tab provides you a summary of all the connections between your services in Open Service Mesh. +2. Outbound connections: Total number of connections between Source and destination services. +3. Outbound active connections: Last count of active connections between source and destination in selected time range. +4. Outbound failed connections: Total number of failed connections between source and destination service + +### Troubleshooting guidance when Outbound active connections is 0 or failed connection count is >10k. +1. Please check your connection policy in OSM configuration. +2. If connection policy is fine, please refer the OSM documentation. https://aka.ms/osm/tsg +3. From this view as well, you can drill-down to destination and view trends for HTTP error/success code, success rate, Pods resource utilization, latencies at different percentiles. + + +### Known Issues +1. The workbook has scale limits of 50 pods per namespace. If you have more than 50 pods in mesh you can have workbook loading issues. +2. When source or destination is osmcontroller we show no latency & for internal services we show no resource utilization. +3. When both prometheus scraping using pod annotations and OSM monitoring are enabled on the same set of namespaces, the default set of metrics (envoy_cluster_upstream_cx_total, envoy_cluster_upstream_cx_connect_fail, envoy_cluster_upstream_rq, envoy_cluster_upstream_rq_xx, envoy_cluster_upstream_rq_total, envoy_cluster_upstream_rq_time_bucket, envoy_cluster_upstream_cx_rx_bytes_total, envoy_cluster_upstream_cx_tx_bytes_total, envoy_cluster_upstream_cx_active) will be collected twice. 
You can follow [this](https://docs.microsoft.com/en-us/azure/azure-monitor/containers/container-insights-prometheus-integration#prometheus-scraping-settings) documentation to exclude these namespaces from pod annotation scraping using the setting monitor_kubernetes_pods_namespaces to work around this issue. + +This is private preview, the goal for us is to get feedback. Please feel free to reach out to us at [askcoin@microsoft.com](mailto:askcoin@microsoft.com) for any feedback and questions! diff --git a/ReleaseNotes.md b/ReleaseNotes.md index 04bd7c6e5..acbd579a0 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -11,6 +11,23 @@ additional questions or comments. Note : The agent version(s) below has dates (ciprod), which indicate the agent build dates (not release dates) +### 04/22/2021 - +##### Version microsoft/oms:ciprod04222021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod04222021 (linux) +##### Version microsoft/oms:win-ciprod04222021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod04222021 (windows) +##### Code change log +- Bug fixes for metrics cpuUsagePercentage and memoryWorkingSetPercentage for windows nodes +- Added metrics for threshold violation +- Made Job completion metric configurable +- Udated default buffer sizes in fluent-bit +- Updated recommended alerts +- Fixed bug where logs written before agent starts up were not collected +- Fixed bug which kept agent logs from being rotated +- Bug fix for Windows Containerd container log collection +- Bug fixes +- Doc updates +- Minor telemetry changes + + ### 03/26/2021 - ##### Version microsoft/oms:ciprod03262021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod03262021 (linux) ##### Version microsoft/oms:win-ciprod03262021 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod03262021 (windows) diff --git a/ReleaseProcess.md b/ReleaseProcess.md index c6f51bb65..8ec91546c 100644 --- a/ReleaseProcess.md +++ 
b/ReleaseProcess.md @@ -35,20 +35,21 @@ Image automatically synched to MCR CN from Public cloud MCR. - Refer to internal docs for the release process and instructions. -## ARO v3 - -This needs to be co-ordinated with Red hat and ARO-RP team for the release and Red hat team will pick up the changes for the release. - ## AKS-Engine Make PR against [AKS-Engine](https://github.com/Azure/aks-engine). Refer PR https://github.com/Azure/aks-engine/pull/2318 -## ARO v4, Azure Arc K8s and OpenShift v4 clusters - -Make sure azuremonitor-containers chart yamls updates with all changes going with the release and also make sure to bump the chart version, imagetag and docker provider version etc. Similar to agent container image, build pipeline automatically push the chart to container insights prod acr for canary and prod repos accordingly. -Both the agent and helm chart will be replicated to `mcr.microsoft.com`. +## Arc for Kubernetes -The way, customers will be onboard the monitoring to these clusters using onboarding scripts under `onboarding\managed` directory so please bump chart version for prod release. Once we move to Arc K8s Monitoring extension Public preview, these will be taken care so at that point of time no manual changes like this required. +Ev2 pipeline used to deploy the chart of the Arc K8s Container Insights Extension as per Safe Deployment Process. +Here is the high level process +``` + 1. Specify chart version of the release candidate and trigger [container-insights-arc-k8s-extension-ci_prod-release](https://github-private.visualstudio.com/microsoft/_release?_a=releases&view=all) + 2. Get the approval from one of team member for the release + 3. Once the approved, release should be triggered automatically + 4. use `cimon-arck8s-eastus2euap` for validating latest release in canary region + 5. 
TBD - Notify vendor team for the validation on all Arc K8s supported platforms +``` ## Microsoft Charts Repo release for On-prem K8s diff --git a/build/linux/installer/conf/td-agent-bit-prom-side-car.conf b/build/linux/installer/conf/td-agent-bit-prom-side-car.conf index 720f54820..8a69f7995 100644 --- a/build/linux/installer/conf/td-agent-bit-prom-side-car.conf +++ b/build/linux/installer/conf/td-agent-bit-prom-side-car.conf @@ -11,14 +11,27 @@ Parsers_File /etc/opt/microsoft/docker-cimprov/azm-containers-parser.conf Log_File /var/opt/microsoft/docker-cimprov/log/fluent-bit.log +[INPUT] + Name tail + Tag oms.container.log.flbplugin.terminationlog.* + Path /dev/write-to-traces + Read_from_Head true + DB /var/opt/microsoft/docker-cimprov/state/terminationlog-ai.db + DB.Sync Off + Parser docker + Mem_Buf_Limit 1m + Path_Key filepath + Skip_Long_Lines On + Ignore_Older 2m + [INPUT] Name tcp Tag oms.container.perf.telegraf.* Listen 0.0.0.0 Port 25229 - Chunk_Size 1m - Buffer_Size 1m - Mem_Buf_Limit 20m + Chunk_Size 10m + Buffer_Size 10m + Mem_Buf_Limit 200m [OUTPUT] Name oms diff --git a/build/linux/installer/conf/td-agent-bit-rs.conf b/build/linux/installer/conf/td-agent-bit-rs.conf index 696ac80e6..9613c270d 100644 --- a/build/linux/installer/conf/td-agent-bit-rs.conf +++ b/build/linux/installer/conf/td-agent-bit-rs.conf @@ -10,6 +10,19 @@ Parsers_File /etc/opt/microsoft/docker-cimprov/azm-containers-parser.conf Log_File /var/opt/microsoft/docker-cimprov/log/fluent-bit.log +[INPUT] + Name tail + Tag oms.container.log.flbplugin.terminationlog.* + Path /dev/write-to-traces + Read_from_Head true + DB /var/opt/microsoft/docker-cimprov/state/terminationlog-ai.db + DB.Sync Off + Parser docker + Mem_Buf_Limit 1m + Path_Key filepath + Skip_Long_Lines On + Ignore_Older 2m + [INPUT] Name tcp Tag oms.container.perf.telegraf.* diff --git a/build/linux/installer/conf/td-agent-bit.conf b/build/linux/installer/conf/td-agent-bit.conf index 484a4bbbf..045aefcaf 100644 --- 
a/build/linux/installer/conf/td-agent-bit.conf +++ b/build/linux/installer/conf/td-agent-bit.conf @@ -15,6 +15,7 @@ Name tail Tag oms.container.log.la.* Path ${AZMON_LOG_TAIL_PATH} + Read_from_Head true DB /var/log/omsagent-fblogs.db DB.Sync Off Parser docker @@ -32,6 +33,7 @@ Name tail Tag oms.container.log.flbplugin.* Path /var/log/containers/omsagent*.log + Read_from_Head true DB /var/opt/microsoft/docker-cimprov/state/omsagent-ai.db DB.Sync Off Parser docker @@ -44,6 +46,7 @@ Name tail Tag oms.container.log.flbplugin.mdsd.* Path /var/opt/microsoft/linuxmonagent/log/mdsd.err + Read_from_Head true DB /var/opt/microsoft/docker-cimprov/state/mdsd-ai.db DB.Sync Off Parser docker @@ -52,6 +55,19 @@ Skip_Long_Lines On Ignore_Older 2m +[INPUT] + Name tail + Tag oms.container.log.flbplugin.terminationlog.* + Path /dev/write-to-traces + Read_from_Head true + DB /var/opt/microsoft/docker-cimprov/state/terminationlog-ai.db + DB.Sync Off + Parser docker + Mem_Buf_Limit 1m + Path_Key filepath + Skip_Long_Lines On + Ignore_Older 2m + [INPUT] Name tcp Tag oms.container.perf.telegraf.* diff --git a/build/linux/installer/scripts/livenessprobe.sh b/build/linux/installer/scripts/livenessprobe.sh index a82fa28eb..e3b0fc28e 100644 --- a/build/linux/installer/scripts/livenessprobe.sh +++ b/build/linux/installer/scripts/livenessprobe.sh @@ -26,6 +26,15 @@ then exit 1 fi +#test to exit non zero value if telegraf is not running +(ps -ef | grep telegraf | grep -v "grep") +if [ $? 
-ne 0 ] +then + echo "Telegraf is not running" > /dev/termination-log + echo "Telegraf is not running (controller: ${CONTROLLER_TYPE}, container type: ${CONTAINER_TYPE})" > /dev/write-to-traces # this file is tailed and sent to traces + exit 1 +fi + if [ -s "inotifyoutput.txt" ] then # inotifyoutput file has data(config map was applied) diff --git a/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb b/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb index 345c51633..5ce5d79d2 100644 --- a/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb +++ b/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb @@ -13,6 +13,7 @@ @percentageMemoryRssThreshold = Constants::DEFAULT_MDM_MEMORY_RSS_THRESHOLD @percentageMemoryWorkingSetThreshold = Constants::DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD @percentagePVUsageThreshold = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD +@jobCompletionThresholdMinutes = Constants::DEFAULT_MDM_JOB_COMPLETED_TIME_THRESHOLD_MINUTES # Use parser to parse the configmap toml file to a ruby structure def parseConfigMap @@ -101,6 +102,25 @@ def populateSettingValuesFromConfigMap(parsedConfig) ConfigParseErrorLogger.logError("Exception while reading config map settings for MDM metric configuration settings for PV utilization - #{errorStr}, using defaults, please check config map for errors") @percentagePVUsageThreshold = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD end + + # Get mdm metrics config settings for job completion + begin + jobCompletion = parsedConfig[:alertable_metrics_configuration_settings][:job_completion_threshold] + if !jobCompletion.nil? + jobCompletionThreshold = jobCompletion[:job_completion_threshold_time_minutes] + jobCompletionThresholdInt = jobCompletionThreshold.to_i + if jobCompletionThresholdInt.kind_of? 
Integer + @jobCompletionThresholdMinutes = jobCompletionThresholdInt + else + puts "config::Non interger value or value not convertible to integer specified for job completion threshold, using default " + @jobCompletionThresholdMinutes = Constants::DEFAULT_MDM_JOB_COMPLETED_TIME_THRESHOLD_MINUTES + end + puts "config::Using config map settings for MDM metric configuration settings for job completion" + end + rescue => errorStr + ConfigParseErrorLogger.logError("Exception while reading config map settings for MDM metric configuration settings for job completion - #{errorStr}, using defaults, please check config map for errors") + @jobCompletionThresholdMinutes = Constants::DEFAULT_MDM_JOB_COMPLETED_TIME_THRESHOLD_MINUTES + end end end @@ -125,6 +145,7 @@ def populateSettingValuesFromConfigMap(parsedConfig) file.write("export AZMON_ALERT_CONTAINER_MEMORY_RSS_THRESHOLD=#{@percentageMemoryRssThreshold}\n") file.write("export AZMON_ALERT_CONTAINER_MEMORY_WORKING_SET_THRESHOLD=\"#{@percentageMemoryWorkingSetThreshold}\"\n") file.write("export AZMON_ALERT_PV_USAGE_THRESHOLD=#{@percentagePVUsageThreshold}\n") + file.write("export AZMON_ALERT_JOB_COMPLETION_TIME_THRESHOLD=#{@jobCompletionThresholdMinutes}\n") # Close file after writing all MDM setting environment variables file.close puts "****************End MDM Metrics Config Processing********************" diff --git a/build/version b/build/version index 83a0a174b..16a43604a 100644 --- a/build/version +++ b/build/version @@ -2,11 +2,11 @@ # Build Version Information -CONTAINER_BUILDVERSION_MAJOR=14 +CONTAINER_BUILDVERSION_MAJOR=15 CONTAINER_BUILDVERSION_MINOR=0 CONTAINER_BUILDVERSION_PATCH=0 CONTAINER_BUILDVERSION_BUILDNR=0 -CONTAINER_BUILDVERSION_DATE=20210326 +CONTAINER_BUILDVERSION_DATE=20210422 CONTAINER_BUILDVERSION_STATUS=Developer_Build #-------------------------------- End of File ----------------------------------- diff --git a/charts/azuremonitor-containers/Chart.yaml b/charts/azuremonitor-containers/Chart.yaml 
index 9c8014ed0..00f3f49ed 100644 --- a/charts/azuremonitor-containers/Chart.yaml +++ b/charts/azuremonitor-containers/Chart.yaml @@ -2,7 +2,7 @@ apiVersion: v1 appVersion: 7.0.0-1 description: Helm chart for deploying Azure Monitor container monitoring agent in Kubernetes name: azuremonitor-containers -version: 2.8.2 +version: 2.8.3 kubeVersion: "^1.10.0-0" keywords: - monitoring diff --git a/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml b/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml index 8868b86bb..580ef9d15 100644 --- a/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml @@ -118,6 +118,7 @@ spec: - name: docker-windows-containers hostPath: path: C:\ProgramData\docker\containers + type: DirectoryOrCreate - name: settings-vol-config configMap: name: container-azm-ms-agentconfig diff --git a/charts/azuremonitor-containers/values.yaml b/charts/azuremonitor-containers/values.yaml index fc2a3afaf..219198ebb 100644 --- a/charts/azuremonitor-containers/values.yaml +++ b/charts/azuremonitor-containers/values.yaml @@ -21,10 +21,10 @@ Azure: omsagent: image: repo: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod" - tag: "ciprod03262021" - tagWindows: "win-ciprod03262021" + tag: "ciprod04222021" + tagWindows: "win-ciprod04222021" pullPolicy: IfNotPresent - dockerProviderVersion: "14.0.0-0" + dockerProviderVersion: "15.0.0-0" agentVersion: "1.10.0.1" # The priority used by the omsagent priority class for the daemonset pods @@ -124,6 +124,21 @@ omsagent: operator: In values: - amd64 + nodeSelectorTerms: + - labelSelector: + matchExpressions: + - key: kubernetes.io/os + operator: In + values: + - linux + - key: type + operator: NotIn + values: + - virtual-kubelet + - key: kubernetes.io/arch + operator: In + values: + - amd64 nodeSelectorTerms: - labelSelector: matchExpressions: diff --git 
a/deployment/arc-k8s-extension/ServiceGroupRoot/Parameters/ContainerInsightsExtension.Parameters.json b/deployment/arc-k8s-extension/ServiceGroupRoot/Parameters/ContainerInsightsExtension.Parameters.json new file mode 100644 index 000000000..a8a99e9f6 --- /dev/null +++ b/deployment/arc-k8s-extension/ServiceGroupRoot/Parameters/ContainerInsightsExtension.Parameters.json @@ -0,0 +1,66 @@ +{ + "$schema": "http://schema.express.azure.com/schemas/2015-01-01-alpha/RolloutParameters.json", + "contentVersion": "1.0.0.0", + "wait": [ + { + "name": "waitSdpBakeTime", + "properties": { + "duration": "PT24H" + } + } + ], + "shellExtensions": [ + { + "name": "PushChartToACR", + "type": "ShellExtensionType", + "properties": { + "maxexecutiontime": "PT1H" + }, + "package": { + "reference": { + "path": "artifacts.tar.gz" + } + }, + "launch": { + "command": [ + "/bin/bash", + "pushChartToAcr.sh" + ], + "environmentVariables": [ + { + "name": "RELEASE_STAGE", + "value": "__RELEASE_STAGE__" + }, + { + "name": "ACR_APP_ID", + "reference": { + "provider": "AzureKeyVault", + "parameters": { + "secretId": "https://cibuildandreleasekv.vault.azure.net/secrets/ciprodacrappid/e8f47bf7505741ebaf65a4db16ff9fa7" + } + }, + "asSecureValue": "true" + }, + { + "name": "ACR_APP_SECRET", + "reference": { + "provider": "AzureKeyVault", + "parameters": { + "secretId": "https://cibuildandreleasekv.vault.azure.net/secrets/ciprodacrappsecret/8718afcdac114accb8b26f613cef1e1e" + } + }, + "asSecureValue": "true" + }, + { + "name": "ACR_NAME", + "value": "__ACR_NAME__" + }, + { + "name": "CHART_VERSION", + "value": "__CHART_VERSION__" + } + ] + } + } + ] +} diff --git a/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.Canary.RolloutSpec.json b/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.Canary.RolloutSpec.json new file mode 100644 index 000000000..cde103633 --- /dev/null +++ b/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.Canary.RolloutSpec.json @@ 
-0,0 +1,29 @@ +{ + "$schema": "http://schema.express.azure.com/schemas/2015-01-01-alpha/RolloutSpec.json", + "ContentVersion": "1.0.0.0", + "RolloutMetadata": { + "ServiceModelPath": "ServiceModels//Public.ServiceModel.json", + "ScopeBindingsPath": "ScopeBindings//Public.ScopeBindings.json", + "Name": "ContainerInsightsExtension-Canary", + "RolloutType": "Major", + "BuildSource": { + "Parameters": { + "VersionFile": "buildver.txt" + } + }, + "notification": { + "email": { + "to": "omscontainers@microsoft.com" + } + } + }, + "orchestratedSteps": [ + { + "name": "PushChartToACR", + "targetType": "ServiceResource", + "targetName": "PushChartToACR-Canary", + "actions": [ "Shell/PushChartToACR" ], + "dependsOn": [ ] + } + ] +} diff --git a/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.FF.RolloutSpec.json b/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.FF.RolloutSpec.json new file mode 100644 index 000000000..1749296c8 --- /dev/null +++ b/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.FF.RolloutSpec.json @@ -0,0 +1,29 @@ +{ + "$schema": "http://schema.express.azure.com/schemas/2015-01-01-alpha/RolloutSpec.json", + "ContentVersion": "1.0.0.0", + "RolloutMetadata": { + "ServiceModelPath": "ServiceModels//Public.ServiceModel.json", + "ScopeBindingsPath": "ScopeBindings//Public.ScopeBindings.json", + "Name": "ContainerInsightsExtension-FF", + "RolloutType": "Major", + "BuildSource": { + "Parameters": { + "VersionFile": "buildver.txt" + } + }, + "notification": { + "email": { + "to": "omscontainers@microsoft.com" + } + } + }, + "orchestratedSteps": [ + { + "name": "PushChartToACR", + "targetType": "ServiceResource", + "targetName": "PushChartToACR-FF", + "actions": [ "wait/waitSdpBakeTime", "Shell/PushChartToACR" ], + "dependsOn": [ ] + } + ] +} diff --git a/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.HighLoad.RolloutSpec.json 
b/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.HighLoad.RolloutSpec.json new file mode 100644 index 000000000..50729b1ae --- /dev/null +++ b/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.HighLoad.RolloutSpec.json @@ -0,0 +1,29 @@ +{ + "$schema": "http://schema.express.azure.com/schemas/2015-01-01-alpha/RolloutSpec.json", + "ContentVersion": "1.0.0.0", + "RolloutMetadata": { + "ServiceModelPath": "ServiceModels//Public.ServiceModel.json", + "ScopeBindingsPath": "ScopeBindings//Public.ScopeBindings.json", + "Name": "ContainerInsightsExtension-Prod3", + "RolloutType": "Major", + "BuildSource": { + "Parameters": { + "VersionFile": "buildver.txt" + } + }, + "notification": { + "email": { + "to": "omscontainers@microsoft.com" + } + } + }, + "orchestratedSteps": [ + { + "name": "PushChartToACR", + "targetType": "ServiceResource", + "targetName": "PushChartToACR-HighLoad", + "actions": [ "wait/waitSdpBakeTime", "Shell/PushChartToACR" ], + "dependsOn": [ ] + } + ] +} diff --git a/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.LightLoad.RolloutSpec.json b/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.LightLoad.RolloutSpec.json new file mode 100644 index 000000000..edd61f852 --- /dev/null +++ b/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.LightLoad.RolloutSpec.json @@ -0,0 +1,29 @@ +{ + "$schema": "http://schema.express.azure.com/schemas/2015-01-01-alpha/RolloutSpec.json", + "ContentVersion": "1.0.0.0", + "RolloutMetadata": { + "ServiceModelPath": "ServiceModels//Public.ServiceModel.json", + "ScopeBindingsPath": "ScopeBindings//Public.ScopeBindings.json", + "Name": "ContainerInsightsExtension-Prod2", + "RolloutType": "Major", + "BuildSource": { + "Parameters": { + "VersionFile": "buildver.txt" + } + }, + "notification": { + "email": { + "to": "omscontainers@microsoft.com" + } + } + }, + "orchestratedSteps": [ + { + "name": "PushChartToACR", + "targetType": "ServiceResource", + 
"targetName": "PushChartToACR-LightLoad", + "actions": [ "wait/waitSdpBakeTime", "Shell/PushChartToACR" ], + "dependsOn": [ ] + } + ] +} diff --git a/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.MC.RolloutSpec.json b/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.MC.RolloutSpec.json new file mode 100644 index 000000000..014f4b092 --- /dev/null +++ b/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.MC.RolloutSpec.json @@ -0,0 +1,29 @@ +{ + "$schema": "http://schema.express.azure.com/schemas/2015-01-01-alpha/RolloutSpec.json", + "ContentVersion": "1.0.0.0", + "RolloutMetadata": { + "ServiceModelPath": "ServiceModels//Public.ServiceModel.json", + "ScopeBindingsPath": "ScopeBindings//Public.ScopeBindings.json", + "Name": "ContainerInsightsExtension-MC", + "RolloutType": "Major", + "BuildSource": { + "Parameters": { + "VersionFile": "buildver.txt" + } + }, + "notification": { + "email": { + "to": "omscontainers@microsoft.com" + } + } + }, + "orchestratedSteps": [ + { + "name": "PushChartToACR", + "targetType": "ServiceResource", + "targetName": "PushChartToACR-MC", + "actions": [ "wait/waitSdpBakeTime", "Shell/PushChartToACR" ], + "dependsOn": [ ] + } + ] +} diff --git a/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.MediumLoad.RolloutSpec.json b/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.MediumLoad.RolloutSpec.json new file mode 100644 index 000000000..cd1befbc3 --- /dev/null +++ b/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.MediumLoad.RolloutSpec.json @@ -0,0 +1,29 @@ +{ + "$schema": "http://schema.express.azure.com/schemas/2015-01-01-alpha/RolloutSpec.json", + "ContentVersion": "1.0.0.0", + "RolloutMetadata": { + "ServiceModelPath": "ServiceModels//Public.ServiceModel.json", + "ScopeBindingsPath": "ScopeBindings//Public.ScopeBindings.json", + "Name": "ContainerInsightsExtension-Prod3", + "RolloutType": "Major", + "BuildSource": { + "Parameters": { 
+ "VersionFile": "buildver.txt" + } + }, + "notification": { + "email": { + "to": "omscontainers@microsoft.com" + } + } + }, + "orchestratedSteps": [ + { + "name": "PushChartToACR", + "targetType": "ServiceResource", + "targetName": "PushChartToACR-MediumLoad", + "actions": ["wait/waitSdpBakeTime", "Shell/PushChartToACR" ], + "dependsOn": [ ] + } + ] +} diff --git a/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.Pilot.RolloutSpec.json b/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.Pilot.RolloutSpec.json new file mode 100644 index 000000000..48c99fce1 --- /dev/null +++ b/deployment/arc-k8s-extension/ServiceGroupRoot/RolloutSpecs/Public.Pilot.RolloutSpec.json @@ -0,0 +1,29 @@ +{ + "$schema": "http://schema.express.azure.com/schemas/2015-01-01-alpha/RolloutSpec.json", + "ContentVersion": "1.0.0.0", + "RolloutMetadata": { + "ServiceModelPath": "ServiceModels//Public.ServiceModel.json", + "ScopeBindingsPath": "ScopeBindings//Public.ScopeBindings.json", + "Name": "ContainerInsightsExtension-Pilot", + "RolloutType": "Major", + "BuildSource": { + "Parameters": { + "VersionFile": "buildver.txt" + } + }, + "notification": { + "email": { + "to": "omscontainers@microsoft.com" + } + } + }, + "orchestratedSteps": [ + { + "name": "PushChartToACR", + "targetType": "ServiceResource", + "targetName": "PushChartToACR-Pilot", + "actions": [ "wait/waitSdpBakeTime", "Shell/PushChartToACR"], + "dependsOn": [ ] + } + ] +} diff --git a/deployment/arc-k8s-extension/ServiceGroupRoot/ScopeBindings/Public.ScopeBindings.json b/deployment/arc-k8s-extension/ServiceGroupRoot/ScopeBindings/Public.ScopeBindings.json new file mode 100644 index 000000000..516eba3e2 --- /dev/null +++ b/deployment/arc-k8s-extension/ServiceGroupRoot/ScopeBindings/Public.ScopeBindings.json @@ -0,0 +1,125 @@ +{ + "$schema": "https://ev2schema.azure.net/schemas/2020-01-01/scopeBindings.json", + "contentVersion": "0.0.0.1", + "scopeBindings": [ + { + "scopeTagName": "Canary", + 
"bindings": [ + { + "find": "__RELEASE_STAGE__", + "replaceWith": "Canary" + }, + { + "find": "__ACR_NAME__", + "replaceWith": "$(ACRName)" + }, + { + "find": "__CHART_VERSION__", + "replaceWith": "$(ChartVersion)" + } + ] + }, + { + "scopeTagName": "Pilot", + "bindings": [ + { + "find": "__RELEASE_STAGE__", + "replaceWith": "Pilot" + }, + { + "find": "__ACR_NAME__", + "replaceWith": "$(ACRName)" + }, + { + "find": "__CHART_VERSION__", + "replaceWith": "$(ChartVersion)" + } + ] + }, + { + "scopeTagName": "LightLoad", + "bindings": [ + { + "find": "__RELEASE_STAGE__", + "replaceWith": "MediumLow" + }, + { + "find": "__ACR_NAME__", + "replaceWith": "$(ACRName)" + }, + { + "find": "__CHART_VERSION__", + "replaceWith": "$(ChartVersion)" + } + ] + }, + { + "scopeTagName": "MediumLoad", + "bindings": [ + { + "find": "__RELEASE_STAGE__", + "replaceWith": "MediumHigh" + }, + { + "find": "__ACR_NAME__", + "replaceWith": "$(ACRName)" + }, + { + "find": "__CHART_VERSION__", + "replaceWith": "$(ChartVersion)" + } + ] + }, + { + "scopeTagName": "HighLoad", + "bindings": [ + { + "find": "__RELEASE_STAGE__", + "replaceWith": "HighLoad" + }, + { + "find": "__ACR_NAME__", + "replaceWith": "$(ACRName)" + }, + { + "find": "__CHART_VERSION__", + "replaceWith": "$(ChartVersion)" + } + ] + }, + { + "scopeTagName": "FF", + "bindings": [ + { + "find": "__RELEASE_STAGE__", + "replaceWith": "FF" + }, + { + "find": "__ACR_NAME__", + "replaceWith": "$(ACRName)" + }, + { + "find": "__CHART_VERSION__", + "replaceWith": "$(ChartVersion)" + } + ] + }, + { + "scopeTagName": "MC", + "bindings": [ + { + "find": "__RELEASE_STAGE__", + "replaceWith": "MC" + }, + { + "find": "__ACR_NAME__", + "replaceWith": "$(ACRName)" + }, + { + "find": "__CHART_VERSION__", + "replaceWith": "$(ChartVersion)" + } + ] + } + ] +} diff --git a/deployment/arc-k8s-extension/ServiceGroupRoot/Scripts/pushChartToAcr.sh b/deployment/arc-k8s-extension/ServiceGroupRoot/Scripts/pushChartToAcr.sh new file mode 100644 index 
000000000..520557592 --- /dev/null +++ b/deployment/arc-k8s-extension/ServiceGroupRoot/Scripts/pushChartToAcr.sh @@ -0,0 +1,181 @@ +#!/bin/bash + +export HELM_EXPERIMENTAL_OCI=1 +export MCR_NAME="mcr.microsoft.com" +# for prod-> stable and for test -> preview +export REPO_TYPE="stable" + +# repo paths for arc k8s extension roll-out +# canary region +export CANARY_REGION_REPO_PATH="azuremonitor/containerinsights/canary/${REPO_TYPE}/azuremonitor-containers" +# pilot region +export PILOT_REGION_REPO_PATH="azuremonitor/containerinsights/prod1/${REPO_TYPE}/azuremonitor-containers" +# light load regions +export LIGHT_LOAD_REGION_REPO_PATH="azuremonitor/containerinsights/prod2/${REPO_TYPE}/azuremonitor-containers" +# medium load regions +export MEDIUM_LOAD_REGION_REPO_PATH="azuremonitor/containerinsights/prod3/${REPO_TYPE}/azuremonitor-containers" +# high load regions +export HIGH_LOAD_REGION_REPO_PATH="azuremonitor/containerinsights/prod4/${REPO_TYPE}/azuremonitor-containers" +# FairFax regions +export FF_REGION_REPO_PATH="azuremonitor/containerinsights/prod5/${REPO_TYPE}/azuremonitor-containers" +# Mooncake regions +export MC_REGION_REPO_PATH="azuremonitor/containerinsights/prod6/${REPO_TYPE}/azuremonitor-containers" + +# pull chart from previous stage mcr and push chart to next stage acr +pull_chart_from_source_mcr_to_push_to_dest_acr() { + srcMcrFullPath=${1} + destAcrFullPath=${2} + + if [ -z $srcMcrFullPath ]; then + echo "-e error source mcr path must be provided " + exit 1 + fi + + if [ -z $destAcrFullPath ]; then + echo "-e error dest acr path must be provided " + exit 1 + fi + + echo "Pulling chart from MCR:${srcMcrFullPath} ..." + helm chart pull ${srcMcrFullPath} + if [ $? -eq 0 ]; then + echo "Pulling chart from MCR:${srcMcrFullPath} completed successfully." + else + echo "-e error Pulling chart from MCR:${srcMcrFullPath} failed. Please review Ev2 pipeline logs for more details on the error." + exit 1 + fi + + echo "Exporting chart to current directory ..." 
+ helm chart export ${srcMcrFullPath} + if [ $? -eq 0 ]; then + echo "Exporting chart to current directory completed successfully." + else + echo "-e error Exporting chart to current directory failed. Please review Ev2 pipeline logs for more details on the error." + exit 1 + fi + + echo "save the chart locally with dest acr full path : ${destAcrFullPath} ..." + helm chart save azuremonitor-containers/ ${destAcrFullPath} + if [ $? -eq 0 ]; then + echo "save the chart locally with dest acr full path : ${destAcrFullPath} completed successfully." + else + echo "-e error save the chart locally with dest acr full path : ${destAcrFullPath} failed. Please review Ev2 pipeline logs for more details on the error." + exit 1 + fi + + echo "pushing the chart to acr path: ${destAcrFullPath} ..." + helm chart push ${destAcrFullPath} + if [ $? -eq 0 ]; then + echo "pushing the chart to acr path: ${destAcrFullPath} completed successfully." + else + echo "-e error pushing the chart to acr path: ${destAcrFullPath} failed. Please review Ev2 pipeline logs for more details on the error." + exit 1 + fi +} + +# push to local release candidate chart to canary region +push_local_chart_to_canary_region() { + destAcrFullPath=${1} + if [ -z $destAcrFullPath ]; then + echo "-e error dest acr path must be provided " + exit 1 + fi + + echo "save the chart locally with dest acr full path : ${destAcrFullPath} ..." + helm chart save charts/azuremonitor-containers/ $destAcrFullPath + if [ $? -eq 0 ]; then + echo "save the chart locally with dest acr full path : ${destAcrFullPath} completed." + else + echo "-e error save the chart locally with dest acr full path : ${destAcrFullPath} failed. Please review Ev2 pipeline logs for more details on the error." + exit 1 + fi + + echo "pushing the chart to acr path: ${destAcrFullPath} ..." + helm chart push $destAcrFullPath + if [ $? -eq 0 ]; then + echo "pushing the chart to acr path: ${destAcrFullPath} completed successfully." 
+ else + echo "-e error pushing the chart to acr path: ${destAcrFullPath} failed.Please review Ev2 pipeline logs for more details on the error." + exit 1 + fi +} + +echo "START - Release stage : ${RELEASE_STAGE}" + +# login to acr +echo "Using acr : ${ACR_NAME}" +echo "Using acr repo type: ${REPO_TYPE}" + +echo "login to acr:${ACR_NAME} using helm ..." +echo $ACR_APP_SECRET | helm registry login $ACR_NAME --username $ACR_APP_ID --password-stdin +if [ $? -eq 0 ]; then + echo "login to acr:${ACR_NAME} using helm completed successfully." +else + echo "-e error login to acr:${ACR_NAME} using helm failed. Please review Ev2 pipeline logs for more details on the error." + exit 1 +fi + +case $RELEASE_STAGE in + + Canary) + echo "START: Release stage - Canary" + destAcrFullPath=${ACR_NAME}/public/${CANARY_REGION_REPO_PATH}:${CHART_VERSION} + push_local_chart_to_canary_region $destAcrFullPath + echo "END: Release stage - Canary" + ;; + + Pilot | Prod1) + echo "START: Release stage - Pilot" + srcMcrFullPath=${MCR_NAME}/${CANARY_REGION_REPO_PATH}:${CHART_VERSION} + destAcrFullPath=${ACR_NAME}/public/${PILOT_REGION_REPO_PATH}:${CHART_VERSION} + pull_chart_from_source_mcr_to_push_to_dest_acr $srcMcrFullPath $destAcrFullPath + echo "END: Release stage - Pilot" + ;; + + LightLoad | Prod2) + echo "START: Release stage - Light Load Regions" + srcMcrFullPath=${MCR_NAME}/${PILOT_REGION_REPO_PATH}:${CHART_VERSION} + destAcrFullPath=${ACR_NAME}/public/${LIGHT_LOAD_REGION_REPO_PATH}:${CHART_VERSION} + pull_chart_from_source_mcr_to_push_to_dest_acr $srcMcrFullPath $destAcrFullPath + echo "END: Release stage - Light Load Regions" + ;; + + MediumLoad | Prod3) + echo "START: Release stage - Medium Load Regions" + srcMcrFullPath=${MCR_NAME}/${LIGHT_LOAD_REGION_REPO_PATH}:${CHART_VERSION} + destAcrFullPath=${ACR_NAME}/public/${MEDIUM_LOAD_REGION_REPO_PATH}:${CHART_VERSION} + pull_chart_from_source_mcr_to_push_to_dest_acr $srcMcrFullPath $destAcrFullPath + echo "END: Release stage - Medium Load 
Regions" + ;; + + HighLoad | Prod4) + echo "START: Release stage - High Load Regions" + srcMcrFullPath=${MCR_NAME}/${MEDIUM_LOAD_REGION_REPO_PATH}:${CHART_VERSION} + destAcrFullPath=${ACR_NAME}/public/${HIGH_LOAD_REGION_REPO_PATH}:${CHART_VERSION} + pull_chart_from_source_mcr_to_push_to_dest_acr $srcMcrFullPath $destAcrFullPath + echo "END: Release stage - High Load Regions" + ;; + + FF | Prod5) + echo "START: Release stage - FF" + srcMcrFullPath=${MCR_NAME}/${HIGH_LOAD_REGION_REPO_PATH}:${CHART_VERSION} + destAcrFullPath=${ACR_NAME}/public/${FF_REGION_REPO_PATH}:${CHART_VERSION} + pull_chart_from_source_mcr_to_push_to_dest_acr $srcMcrFullPath $destAcrFullPath + echo "END: Release stage - FF" + ;; + + MC | Prod6) + echo "START: Release stage - MC" + srcMcrFullPath=${MCR_NAME}/${FF_REGION_REPO_PATH}:${CHART_VERSION} + destAcrFullPath=${ACR_NAME}/public/${MC_REGION_REPO_PATH}:${CHART_VERSION} + pull_chart_from_source_mcr_to_push_to_dest_acr $srcMcrFullPath $destAcrFullPath + echo "END: Release stage - MC" + ;; + + *) + echo -n "unknown release stage" + exit 1 + ;; +esac + +echo "END - Release stage : ${RELEASE_STAGE}" diff --git a/deployment/arc-k8s-extension/ServiceGroupRoot/ServiceModels/Public.ServiceModel.json b/deployment/arc-k8s-extension/ServiceGroupRoot/ServiceModels/Public.ServiceModel.json new file mode 100644 index 000000000..71081661a --- /dev/null +++ b/deployment/arc-k8s-extension/ServiceGroupRoot/ServiceModels/Public.ServiceModel.json @@ -0,0 +1,159 @@ +{ + "$schema": "http://schema.express.azure.com/schemas/2015-01-01-alpha/ServiceModel.json", + "ContentVersion": "0.0.0.1", + "ServiceMetadata": { + "ServiceGroup": "ContainerInsightsExtension", + "Environment": "Prod" + }, + "ServiceResourceGroupDefinitions": [ + { + "Name": "ARC-Extension-ServiceResourceGroupDefinition", + "ServiceResourceDefinitions": [ + { + "Name": "ShellExtension", + "ComposedOf": { + "Extension": { + "Shell": [ + { + "type": "ShellExtensionType", + "properties": { + "imageName": 
"adm-ubuntu-1804-l", + "imageVersion": "v18" + } + } + ] + } + } + } + ] + } + ], + "ServiceResourceGroups": [ + { + "AzureResourceGroupName": "ContainerInsightsExtension-Canary-Release", + "Location": "eastus2", + "InstanceOf": "ARC-Extension-ServiceResourceGroupDefinition", + "AzureSubscriptionId": "5fab7b6f-6150-42fe-89e1-0f07a0a9a46f", + "ScopeTags": [ + { + "Name": "Canary" + } + ], + "ServiceResources": [ + { + "Name": "PushChartToACR-Canary", + "InstanceOf": "ShellExtension", + "RolloutParametersPath": "Parameters\\ContainerInsightsExtension.Parameters.json" + } + ] + }, + { + "AzureResourceGroupName": "ContainerInsightsExtension-Pilot-Release", + "Location": "eastus2", + "InstanceOf": "ARC-Extension-ServiceResourceGroupDefinition", + "AzureSubscriptionId": "5fab7b6f-6150-42fe-89e1-0f07a0a9a46f", + "ScopeTags": [ + { + "Name": "Pilot" + } + ], + "ServiceResources": [ + { + "Name": "PushChartToACR-Pilot", + "InstanceOf": "ShellExtension", + "RolloutParametersPath": "Parameters\\ContainerInsightsExtension.Parameters.json" + } + ] + }, + { + "AzureResourceGroupName": "ContainerInsightsExtension-LightLoad-Release", + "Location": "eastus2", + "InstanceOf": "ARC-Extension-ServiceResourceGroupDefinition", + "AzureSubscriptionId": "5fab7b6f-6150-42fe-89e1-0f07a0a9a46f", + "ScopeTags": [ + { + "Name": "LightLoad" + } + ], + "ServiceResources": [ + { + "Name": "PushChartToACR-LightLoad", + "InstanceOf": "ShellExtension", + "RolloutParametersPath": "Parameters\\ContainerInsightsExtension.Parameters.json" + } + ] + }, + { + "AzureResourceGroupName": "ContainerInsightsExtension-MediumLoad-Release", + "Location": "eastus2", + "InstanceOf": "ARC-Extension-ServiceResourceGroupDefinition", + "AzureSubscriptionId": "5fab7b6f-6150-42fe-89e1-0f07a0a9a46f", + "ScopeTags": [ + { + "Name": "MediumLoad" + } + ], + "ServiceResources": [ + { + "Name": "PushChartToACR-MediumLoad", + "InstanceOf": "ShellExtension", + "RolloutParametersPath": 
"Parameters\\ContainerInsightsExtension.Parameters.json" + } + ] + }, + { + "AzureResourceGroupName": "ContainerInsightsExtension-HighLoad-Release", + "Location": "eastus2", + "InstanceOf": "ARC-Extension-ServiceResourceGroupDefinition", + "AzureSubscriptionId": "5fab7b6f-6150-42fe-89e1-0f07a0a9a46f", + "ScopeTags": [ + { + "Name": "HighLoad" + } + ], + "ServiceResources": [ + { + "Name": "PushChartToACR-HighLoad", + "InstanceOf": "ShellExtension", + "RolloutParametersPath": "Parameters\\ContainerInsightsExtension.Parameters.json" + } + ] + }, + { + "AzureResourceGroupName": "ContainerInsightsExtension-FF-Release", + "Location": "eastus2", + "InstanceOf": "ARC-Extension-ServiceResourceGroupDefinition", + "AzureSubscriptionId": "5fab7b6f-6150-42fe-89e1-0f07a0a9a46f", + "ScopeTags": [ + { + "Name": "FF" + } + ], + "ServiceResources": [ + { + "Name": "PushChartToACR-FF", + "InstanceOf": "ShellExtension", + "RolloutParametersPath": "Parameters\\ContainerInsightsExtension.Parameters.json" + } + ] + }, + { + "AzureResourceGroupName": "ContainerInsightsExtension-MC-Release", + "Location": "eastus2", + "InstanceOf": "ARC-Extension-ServiceResourceGroupDefinition", + "AzureSubscriptionId": "5fab7b6f-6150-42fe-89e1-0f07a0a9a46f", + "ScopeTags": [ + { + "Name": "MC" + } + ], + "ServiceResources": [ + { + "Name": "PushChartToACR-MC", + "InstanceOf": "ShellExtension", + "RolloutParametersPath": "Parameters\\ContainerInsightsExtension.Parameters.json" + } + ] + } + ] + } diff --git a/deployment/arc-k8s-extension/ServiceGroupRoot/buildver.txt b/deployment/arc-k8s-extension/ServiceGroupRoot/buildver.txt new file mode 100644 index 000000000..1921233b3 --- /dev/null +++ b/deployment/arc-k8s-extension/ServiceGroupRoot/buildver.txt @@ -0,0 +1 @@ +1.0.0.0 diff --git a/kubernetes/container-azm-ms-agentconfig.yaml b/kubernetes/container-azm-ms-agentconfig.yaml index e38d9b4ab..543f270c1 100644 --- a/kubernetes/container-azm-ms-agentconfig.yaml +++ 
b/kubernetes/container-azm-ms-agentconfig.yaml @@ -126,6 +126,11 @@ data: [alertable_metrics_configuration_settings.pv_utilization_thresholds] # Threshold for persistent volume usage bytes, metric will be sent only when persistent volume utilization exceeds or becomes equal to the following percentage pv_usage_threshold_percentage = 60.0 + + # Alertable metrics configuration settings for completed jobs count + [alertable_metrics_configuration_settings.job_completion_threshold] + # Threshold for completed job count , metric will be sent only for those jobs which were completed earlier than the following threshold + job_completion_threshold_time_minutes = 360 integrations: |- [integrations.azure_network_policy_manager] collect_basic_metrics = false diff --git a/kubernetes/linux/Dockerfile b/kubernetes/linux/Dockerfile index 76b8622b4..d5ece4509 100644 --- a/kubernetes/linux/Dockerfile +++ b/kubernetes/linux/Dockerfile @@ -2,7 +2,7 @@ FROM ubuntu:18.04 MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="Azure Monitor for containers" -ARG IMAGE_TAG=ciprod03262021 +ARG IMAGE_TAG=ciprod04222021 ENV AGENT_VERSION ${IMAGE_TAG} ENV tmpdir /opt ENV APPLICATIONINSIGHTS_AUTH NzAwZGM5OGYtYTdhZC00NThkLWI5NWMtMjA3ZjM3NmM3YmRi diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index 71e46875b..81db6f3a4 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -699,6 +699,10 @@ dpkg -l | grep td-agent-bit | awk '{print $2 " " $3}' +# Write messages from the liveness probe to stdout (so telemetry picks it up) +touch /dev/write-to-traces + + echo "stopping rsyslog..." 
service rsyslog stop diff --git a/kubernetes/linux/setup.sh b/kubernetes/linux/setup.sh index 218e3c717..ee3756964 100644 --- a/kubernetes/linux/setup.sh +++ b/kubernetes/linux/setup.sh @@ -85,3 +85,7 @@ rm -f $TMPDIR/docker-cimprov*.sh rm -f $TMPDIR/azure-mdsd*.deb rm -f $TMPDIR/mdsd.xml rm -f $TMPDIR/envmdsd + +# Remove settings for cron.daily that conflict with the node's cron.daily. Since both are trying to rotate the same files +# in /var/log at the same time, the rotation doesn't happen correctly and then the *.1 file is forever logged to. +rm /etc/logrotate.d/alternatives /etc/logrotate.d/apt /etc/logrotate.d/azure-mdsd /etc/logrotate.d/rsyslog diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 206d9a8f0..feea3f29a 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -358,7 +358,7 @@ spec: tier: node annotations: agentVersion: "1.10.0.1" - dockerProviderVersion: "14.0.0-0" + dockerProviderVersion: "15.0.0-0" schema-versions: "v1" spec: serviceAccountName: omsagent @@ -368,7 +368,7 @@ spec: value: "3" containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod03262021" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod04222021" imagePullPolicy: IfNotPresent resources: limits: @@ -446,12 +446,12 @@ spec: timeoutSeconds: 15 #Only in sidecar scraping mode - name: omsagent-prometheus - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod03262021" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod04222021" imagePullPolicy: IfNotPresent resources: limits: cpu: 500m - memory: 400Mi + memory: 1Gi requests: cpu: 75m memory: 225Mi @@ -583,13 +583,13 @@ spec: rsName: "omsagent-rs" annotations: agentVersion: "1.10.0.1" - dockerProviderVersion: "14.0.0-0" + dockerProviderVersion: "15.0.0-0" schema-versions: "v1" spec: serviceAccountName: omsagent containers: - name: omsagent - image: 
"mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod03262021" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod04222021" imagePullPolicy: IfNotPresent resources: limits: @@ -750,7 +750,7 @@ spec: tier: node-win annotations: agentVersion: "1.10.0.1" - dockerProviderVersion: "14.0.0-0" + dockerProviderVersion: "15.0.0-0" schema-versions: "v1" spec: serviceAccountName: omsagent @@ -760,7 +760,7 @@ spec: value: "3" containers: - name: omsagent-win - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod03262021" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod04222021" imagePullPolicy: IfNotPresent resources: limits: @@ -839,6 +839,7 @@ spec: - name: docker-windows-containers hostPath: path: C:\ProgramData\docker\containers + type: DirectoryOrCreate - name: settings-vol-config configMap: name: container-azm-ms-agentconfig diff --git a/kubernetes/windows/Dockerfile b/kubernetes/windows/Dockerfile index e4ace417a..fefd089a8 100644 --- a/kubernetes/windows/Dockerfile +++ b/kubernetes/windows/Dockerfile @@ -3,7 +3,7 @@ MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="Azure Monitor for containers" -ARG IMAGE_TAG=win-ciprod03262021 +ARG IMAGE_TAG=win-ciprod04222021 # Do not split this into multiple RUN! 
# Docker creates a layer for every RUN-Statement diff --git a/scripts/onboarding/managed/enable-monitoring.ps1 b/scripts/onboarding/managed/enable-monitoring.ps1 index ec57bf981..02ea73267 100644 --- a/scripts/onboarding/managed/enable-monitoring.ps1 +++ b/scripts/onboarding/managed/enable-monitoring.ps1 @@ -46,10 +46,10 @@ param( [Parameter(mandatory = $false)] [string]$kubeContext, [Parameter(mandatory = $false)] - [string]$servicePrincipalClientSecret, - [Parameter(mandatory = $false)] [string]$tenantId, [Parameter(mandatory = $false)] + [string]$servicePrincipalClientSecret, + [Parameter(mandatory = $false)] [string]$azureCloudName ) @@ -64,7 +64,7 @@ $isUsingServicePrincipal = $false # released chart version in mcr $mcr = "mcr.microsoft.com" -$mcrChartVersion = "2.8.2" +$mcrChartVersion = "2.8.3" $mcrChartRepoPath = "azuremonitor/containerinsights/preview/azuremonitor-containers" $helmLocalRepoName = "." $omsAgentDomainName="opinsights.azure.com" diff --git a/scripts/onboarding/managed/enable-monitoring.sh b/scripts/onboarding/managed/enable-monitoring.sh index 9747d932d..f27f944fd 100644 --- a/scripts/onboarding/managed/enable-monitoring.sh +++ b/scripts/onboarding/managed/enable-monitoring.sh @@ -44,7 +44,7 @@ defaultAzureCloud="AzureCloud" omsAgentDomainName="opinsights.azure.com" # released chart version in mcr -mcrChartVersion="2.8.2" +mcrChartVersion="2.8.3" mcr="mcr.microsoft.com" mcrChartRepoPath="azuremonitor/containerinsights/preview/azuremonitor-containers" helmLocalRepoName="." 
diff --git a/scripts/onboarding/managed/upgrade-monitoring.sh b/scripts/onboarding/managed/upgrade-monitoring.sh index 1cf7b5c97..5456a7072 100644 --- a/scripts/onboarding/managed/upgrade-monitoring.sh +++ b/scripts/onboarding/managed/upgrade-monitoring.sh @@ -20,7 +20,7 @@ set -e set -o pipefail # released chart version for Azure Arc enabled Kubernetes public preview -mcrChartVersion="2.8.2" +mcrChartVersion="2.8.3" mcr="mcr.microsoft.com" mcrChartRepoPath="azuremonitor/containerinsights/preview/azuremonitor-containers" diff --git a/scripts/onboarding/templates/arc-k8s-extension/existingClusterOnboarding.json b/scripts/onboarding/templates/arc-k8s-extension/existingClusterOnboarding.json index 0069307b7..95e7ba5d0 100644 --- a/scripts/onboarding/templates/arc-k8s-extension/existingClusterOnboarding.json +++ b/scripts/onboarding/templates/arc-k8s-extension/existingClusterOnboarding.json @@ -14,13 +14,6 @@ "description": "Location of the Azure Arc Connected Cluster Resource e.g. \"eastus\"" } }, - "proxyEndpointUrl": { - "type": "string", - "defaultValue": "", - "metadata": { - "description": "If the cluster behind forward proxy, then specify Proxy Endpoint URL in this format: http(s)://:@:" - } - }, "workspaceResourceId": { "type": "string", "metadata": { diff --git a/scripts/onboarding/templates/arc-k8s-extension/existingClusterParam.json b/scripts/onboarding/templates/arc-k8s-extension/existingClusterParam.json index b74b5ac95..6829d3d05 100644 --- a/scripts/onboarding/templates/arc-k8s-extension/existingClusterParam.json +++ b/scripts/onboarding/templates/arc-k8s-extension/existingClusterParam.json @@ -8,9 +8,6 @@ "clusterRegion": { "value": "" }, - "proxyEndpointUrl": { - "value": "" - }, "workspaceResourceId": { "value": "/subscriptions//resourcegroups//providers/microsoft.operationalinsights/workspaces/" }, diff --git a/source/plugins/go/src/telemetry.go b/source/plugins/go/src/telemetry.go index 48f82a9ab..461fdea96 100644 --- 
a/source/plugins/go/src/telemetry.go +++ b/source/plugins/go/src/telemetry.go @@ -153,8 +153,6 @@ func SendContainerLogPluginMetrics(telemetryPushIntervalProperty string) { SendEvent(eventNameCustomPrometheusSidecarHeartbeat, telemetryDimensions) - } else if strings.Compare(strings.ToLower(os.Getenv("OS_TYPE")), "windows") == 0 { - SendEvent(eventNameWindowsFluentBitHeartbeat, make(map[string]string)) } else { SendEvent(eventNameDaemonSetHeartbeat, make(map[string]string)) flushRateMetric := appinsights.NewMetricTelemetry(metricNameAvgFlushRate, flushRate) diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index c5a363741..98347d272 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -31,6 +31,8 @@ class KubernetesApiClient @@TokenStr = nil @@NodeMetrics = Hash.new @@WinNodeArray = [] + @@telemetryTimeTracker = DateTime.now.to_time.to_i + @@resourceLimitsTelemetryHash = {} def initialize end @@ -403,9 +405,12 @@ def getPodUid(podNameSpace, podMetadata) def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToCollect, metricNametoReturn, metricTime = Time.now.utc.iso8601) metricItems = [] + timeDifference = (DateTime.now.to_time.to_i - @@telemetryTimeTracker).abs + timeDifferenceInMinutes = timeDifference / 60 begin clusterId = getClusterId podNameSpace = pod["metadata"]["namespace"] + podName = pod["metadata"]["name"] podUid = getPodUid(podNameSpace, pod["metadata"]) if podUid.nil? 
return metricItems @@ -456,6 +461,33 @@ def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToColle metricProps["Collections"].push(metricCollections) metricItem["DataItems"].push(metricProps) metricItems.push(metricItem) + #Telemetry about omsagent requests and limits + begin + if (podName.downcase.start_with?("omsagent-") && podNameSpace.eql?("kube-system") && containerName.downcase.start_with?("omsagent")) + nodePodContainerKey = [nodeName, podName, containerName, metricNametoReturn].join("~~") + @@resourceLimitsTelemetryHash[nodePodContainerKey] = metricValue + end + if (timeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) + @@resourceLimitsTelemetryHash.each { |key, value| + keyElements = key.split("~~") + if keyElements.length != 4 + next + end + + # get dimension values by key + telemetryProps = {} + telemetryProps["Computer"] = keyElements[0] + telemetryProps["PodName"] = keyElements[1] + telemetryProps["ContainerName"] = keyElements[2] + metricNameFromKey = keyElements[3] + ApplicationInsightsUtility.sendMetricTelemetry(metricNameFromKey, value, telemetryProps) + } + @@telemetryTimeTracker = DateTime.now.to_time.to_i + @@resourceLimitsTelemetryHash = {} + end + rescue => errorStr + $log.warn("Exception while generating Telemetry from getContainerResourceRequestsAndLimits failed: #{errorStr} for metric #{metricNameToCollect}") + end #No container level limit for the given metric, so default to node level limit else nodeMetricsHashKey = clusterId + "/" + nodeName + "_" + "allocatable" + "_" + metricNameToCollect @@ -791,7 +823,7 @@ def getKubeAPIServerUrl def getKubeServicesInventoryRecords(serviceList, batchTime = Time.utc.iso8601) kubeServiceRecords = [] begin - if (!serviceList.nil? && !serviceList.empty? && serviceList.key?("items") && !serviceList["items"].nil? && !serviceList["items"].empty? ) + if (!serviceList.nil? && !serviceList.empty? && serviceList.key?("items") && !serviceList["items"].nil? 
&& !serviceList["items"].empty?) servicesCount = serviceList["items"].length @Log.info("KubernetesApiClient::getKubeServicesInventoryRecords : number of services in serviceList #{servicesCount} @ #{Time.now.utc.iso8601}") serviceList["items"].each do |item| diff --git a/source/plugins/ruby/MdmAlertTemplates.rb b/source/plugins/ruby/MdmAlertTemplates.rb index ef63cf219..e889c3f09 100644 --- a/source/plugins/ruby/MdmAlertTemplates.rb +++ b/source/plugins/ruby/MdmAlertTemplates.rb @@ -28,7 +28,7 @@ class MdmAlertTemplates } }' - Stable_job_metrics_template = ' + Stable_job_metrics_template = ' { "time": "%{timestamp}", "data": { @@ -45,7 +45,7 @@ class MdmAlertTemplates "dimValues": [ "%{controllerNameDimValue}", "%{namespaceDimValue}", - "6" + "%{jobCompletionThreshold}" ], "min": %{containerCountMetricValue}, "max": %{containerCountMetricValue}, @@ -90,6 +90,39 @@ class MdmAlertTemplates } }' + Container_resource_threshold_violation_template = ' + { + "time": "%{timestamp}", + "data": { + "baseData": { + "metric": "%{metricName}", + "namespace": "insights.container/containers", + "dimNames": [ + "containerName", + "podName", + "controllerName", + "Kubernetes namespace", + "thresholdPercentage" + ], + "series": [ + { + "dimValues": [ + "%{containerNameDimValue}", + "%{podNameDimValue}", + "%{controllerNameDimValue}", + "%{namespaceDimValue}", + "%{thresholdPercentageDimValue}" + ], + "min": %{containerResourceThresholdViolated}, + "max": %{containerResourceThresholdViolated}, + "sum": %{containerResourceThresholdViolated}, + "count": 1 + } + ] + } + } + }' + PV_resource_utilization_template = ' { "time": "%{timestamp}", @@ -123,6 +156,38 @@ class MdmAlertTemplates } }' + PV_resource_threshold_violation_template = ' + { + "time": "%{timestamp}", + "data": { + "baseData": { + "metric": "%{metricName}", + "namespace": "insights.container/persistentvolumes", + "dimNames": [ + "podName", + "node", + "kubernetesNamespace", + "volumeName", + "thresholdPercentage" + ], + 
"series": [ + { + "dimValues": [ + "%{podNameDimValue}", + "%{computerNameDimValue}", + "%{namespaceDimValue}", + "%{volumeNameDimValue}", + "%{thresholdPercentageDimValue}" + ], + "min": %{pvResourceThresholdViolated}, + "max": %{pvResourceThresholdViolated}, + "sum": %{pvResourceThresholdViolated}, + "count": 1 + } + ] + } + } + }' Node_resource_metrics_template = ' { diff --git a/source/plugins/ruby/MdmMetricsGenerator.rb b/source/plugins/ruby/MdmMetricsGenerator.rb index 12d462e44..199cacd37 100644 --- a/source/plugins/ruby/MdmMetricsGenerator.rb +++ b/source/plugins/ruby/MdmMetricsGenerator.rb @@ -39,10 +39,21 @@ class MdmMetricsGenerator Constants::MEMORY_WORKING_SET_BYTES => Constants::MDM_CONTAINER_MEMORY_WORKING_SET_UTILIZATION_METRIC, } + @@container_metric_name_metric_threshold_violated_hash = { + Constants::CPU_USAGE_MILLI_CORES => Constants::MDM_CONTAINER_CPU_THRESHOLD_VIOLATED_METRIC, + Constants::CPU_USAGE_NANO_CORES => Constants::MDM_CONTAINER_CPU_THRESHOLD_VIOLATED_METRIC, + Constants::MEMORY_RSS_BYTES => Constants::MDM_CONTAINER_MEMORY_RSS_THRESHOLD_VIOLATED_METRIC, + Constants::MEMORY_WORKING_SET_BYTES => Constants::MDM_CONTAINER_MEMORY_WORKING_SET_THRESHOLD_VIOLATED_METRIC, + } + @@pod_metric_name_metric_percentage_name_hash = { Constants::PV_USED_BYTES => Constants::MDM_PV_UTILIZATION_METRIC, } + @@pod_metric_name_metric_threshold_violated_hash = { + Constants::PV_USED_BYTES => Constants::MDM_PV_THRESHOLD_VIOLATED_METRIC, + } + # Setting this to true since we need to send zero filled metrics at startup. 
If metrics are absent alert creation fails @sendZeroFilledMetrics = true @zeroFilledMetricsTimeTracker = DateTime.now.to_time.to_i @@ -96,13 +107,28 @@ def appendPodMetrics(records, metricName, metricHash, batch_time, metricsTemplat podControllerNameDimValue = key_elements[0] podNamespaceDimValue = key_elements[1] - record = metricsTemplate % { - timestamp: batch_time, - metricName: metricName, - controllerNameDimValue: podControllerNameDimValue, - namespaceDimValue: podNamespaceDimValue, - containerCountMetricValue: value, - } + # Special handling for jobs since we need to send the threshold as a dimension as it is configurable + if metricName == Constants::MDM_STALE_COMPLETED_JOB_COUNT + metric_threshold_hash = getContainerResourceUtilizationThresholds + #Converting this to hours since we already have olderThanHours dimension. + jobCompletionThresholdHours = (metric_threshold_hash[Constants::JOB_COMPLETION_TIME] / 60.0).round(2) + record = metricsTemplate % { + timestamp: batch_time, + metricName: metricName, + controllerNameDimValue: podControllerNameDimValue, + namespaceDimValue: podNamespaceDimValue, + containerCountMetricValue: value, + jobCompletionThreshold: jobCompletionThresholdHours, + } + else + record = metricsTemplate % { + timestamp: batch_time, + metricName: metricName, + controllerNameDimValue: podControllerNameDimValue, + namespaceDimValue: podNamespaceDimValue, + containerCountMetricValue: value, + } + end records.push(Yajl::Parser.parse(StringIO.new(record))) } else @@ -129,9 +155,11 @@ def flushPodMdmMetricTelemetry staleJobHashValues = @stale_job_count_hash.values staleJobMetricCount = staleJobHashValues.inject(0) { |sum, x| sum + x } + metric_threshold_hash = getContainerResourceUtilizationThresholds properties["ContainerRestarts"] = containerRestartMetricCount properties["OomKilledContainers"] = oomKilledContainerMetricCount properties["OldCompletedJobs"] = staleJobMetricCount + properties["JobCompletionThresholdTimeInMinutes"] = 
metric_threshold_hash[Constants::JOB_COMPLETION_TIME] ApplicationInsightsUtility.sendCustomEvent(Constants::CONTAINER_METRICS_HEART_BEAT_EVENT, properties) ApplicationInsightsUtility.sendCustomEvent(Constants::POD_READY_PERCENTAGE_HEART_BEAT_EVENT, {}) rescue => errorStr @@ -158,29 +186,63 @@ def zeroFillMetricRecords(records, batch_time) metric_threshold_hash = getContainerResourceUtilizationThresholds container_zero_fill_dims = [Constants::OMSAGENT_ZERO_FILL, Constants::OMSAGENT_ZERO_FILL, Constants::OMSAGENT_ZERO_FILL, Constants::KUBESYSTEM_NAMESPACE_ZERO_FILL].join("~~") - containerCpuRecord = getContainerResourceUtilMetricRecords(batch_time, - Constants::CPU_USAGE_NANO_CORES, - 0, - container_zero_fill_dims, - metric_threshold_hash[Constants::CPU_USAGE_NANO_CORES]) - if !containerCpuRecord.nil? && !containerCpuRecord.empty? && !containerCpuRecord[0].nil? && !containerCpuRecord[0].empty? - records.push(containerCpuRecord[0]) + containerCpuRecords = getContainerResourceUtilMetricRecords(batch_time, + Constants::CPU_USAGE_NANO_CORES, + 0, + container_zero_fill_dims, + metric_threshold_hash[Constants::CPU_USAGE_NANO_CORES], + true) + if !containerCpuRecords.nil? && !containerCpuRecords.empty? + containerCpuRecords.each { |cpuRecord| + if !cpuRecord.nil? && !cpuRecord.empty? + records.push(cpuRecord) + end + } end - containerMemoryRssRecord = getContainerResourceUtilMetricRecords(batch_time, - Constants::MEMORY_RSS_BYTES, - 0, - container_zero_fill_dims, - metric_threshold_hash[Constants::MEMORY_RSS_BYTES]) - if !containerMemoryRssRecord.nil? && !containerMemoryRssRecord.empty? && !containerMemoryRssRecord[0].nil? && !containerMemoryRssRecord[0].empty? - records.push(containerMemoryRssRecord[0]) + containerMemoryRssRecords = getContainerResourceUtilMetricRecords(batch_time, + Constants::MEMORY_RSS_BYTES, + 0, + container_zero_fill_dims, + metric_threshold_hash[Constants::MEMORY_RSS_BYTES], + true) + if !containerMemoryRssRecords.nil? 
&& !containerMemoryRssRecords.empty? + containerMemoryRssRecords.each { |memoryRssRecord| + if !memoryRssRecord.nil? && !memoryRssRecord.empty? + records.push(memoryRssRecord) + end + } end - containerMemoryWorkingSetRecord = getContainerResourceUtilMetricRecords(batch_time, - Constants::MEMORY_WORKING_SET_BYTES, - 0, - container_zero_fill_dims, - metric_threshold_hash[Constants::MEMORY_WORKING_SET_BYTES]) - if !containerMemoryWorkingSetRecord.nil? && !containerMemoryWorkingSetRecord.empty? && !containerMemoryWorkingSetRecord[0].nil? && !containerMemoryWorkingSetRecord[0].empty? - records.push(containerMemoryWorkingSetRecord[0]) + containerMemoryWorkingSetRecords = getContainerResourceUtilMetricRecords(batch_time, + Constants::MEMORY_WORKING_SET_BYTES, + 0, + container_zero_fill_dims, + metric_threshold_hash[Constants::MEMORY_WORKING_SET_BYTES], + true) + if !containerMemoryWorkingSetRecords.nil? && !containerMemoryWorkingSetRecords.empty? + containerMemoryWorkingSetRecords.each { |workingSetRecord| + if !workingSetRecord.nil? && !workingSetRecord.empty? + records.push(workingSetRecord) + end + } + end + + pvZeroFillDims = {} + pvZeroFillDims[Constants::INSIGHTSMETRICS_TAGS_PVC_NAMESPACE] = Constants::KUBESYSTEM_NAMESPACE_ZERO_FILL + pvZeroFillDims[Constants::INSIGHTSMETRICS_TAGS_POD_NAME] = Constants::OMSAGENT_ZERO_FILL + pvZeroFillDims[Constants::INSIGHTSMETRICS_TAGS_VOLUME_NAME] = Constants::VOLUME_NAME_ZERO_FILL + pvResourceUtilMetricRecords = getPVResourceUtilMetricRecords(batch_time, + Constants::PV_USED_BYTES, + @@hostName, + 0, + pvZeroFillDims, + metric_threshold_hash[Constants::PV_USED_BYTES], + true) + if !pvResourceUtilMetricRecords.nil? && !pvResourceUtilMetricRecords.empty? + pvResourceUtilMetricRecords.each { |pvRecord| + if !pvRecord.nil? && !pvRecord.empty? 
+ records.push(pvRecord) + end + } end pvZeroFillDims = {} @@ -247,7 +309,7 @@ def appendAllPodMetrics(records, batch_time) return records end - def getContainerResourceUtilMetricRecords(recordTimeStamp, metricName, percentageMetricValue, dims, thresholdPercentage) + def getContainerResourceUtilMetricRecords(recordTimeStamp, metricName, percentageMetricValue, dims, thresholdPercentage, isZeroFill = false) records = [] begin if dims.nil? @@ -276,6 +338,19 @@ def getContainerResourceUtilMetricRecords(recordTimeStamp, metricName, percentag thresholdPercentageDimValue: thresholdPercentage, } records.push(Yajl::Parser.parse(StringIO.new(resourceUtilRecord))) + + # Adding another metric for threshold violation + resourceThresholdViolatedRecord = MdmAlertTemplates::Container_resource_threshold_violation_template % { + timestamp: recordTimeStamp, + metricName: @@container_metric_name_metric_threshold_violated_hash[metricName], + containerNameDimValue: containerName, + podNameDimValue: podName, + controllerNameDimValue: controllerName, + namespaceDimValue: podNamespace, + containerResourceThresholdViolated: isZeroFill ? 
0 : 1, + thresholdPercentageDimValue: thresholdPercentage, + } + records.push(Yajl::Parser.parse(StringIO.new(resourceThresholdViolatedRecord))) rescue => errorStr @log.info "Error in getContainerResourceUtilMetricRecords: #{errorStr}" ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) @@ -283,7 +358,7 @@ def getContainerResourceUtilMetricRecords(recordTimeStamp, metricName, percentag return records end - def getPVResourceUtilMetricRecords(recordTimeStamp, metricName, computer, percentageMetricValue, dims, thresholdPercentage) + def getPVResourceUtilMetricRecords(recordTimeStamp, metricName, computer, percentageMetricValue, dims, thresholdPercentage, isZeroFill = false) records = [] begin containerName = dims[Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] @@ -303,6 +378,19 @@ def getPVResourceUtilMetricRecords(recordTimeStamp, metricName, computer, percen thresholdPercentageDimValue: thresholdPercentage, } records.push(Yajl::Parser.parse(StringIO.new(resourceUtilRecord))) + + # Adding another metric for threshold violation + resourceThresholdViolatedRecord = MdmAlertTemplates::PV_resource_threshold_violation_template % { + timestamp: recordTimeStamp, + metricName: @@pod_metric_name_metric_threshold_violated_hash[metricName], + podNameDimValue: podName, + computerNameDimValue: computer, + namespaceDimValue: pvcNamespace, + volumeNameDimValue: volumeName, + pvResourceThresholdViolated: isZeroFill ? 
0 : 1, + thresholdPercentageDimValue: thresholdPercentage, + } + records.push(Yajl::Parser.parse(StringIO.new(resourceThresholdViolatedRecord))) rescue => errorStr @log.info "Error in getPVResourceUtilMetricRecords: #{errorStr}" ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) @@ -408,6 +496,7 @@ def getContainerResourceUtilizationThresholds metric_threshold_hash[Constants::MEMORY_RSS_BYTES] = Constants::DEFAULT_MDM_MEMORY_RSS_THRESHOLD metric_threshold_hash[Constants::MEMORY_WORKING_SET_BYTES] = Constants::DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD metric_threshold_hash[Constants::PV_USED_BYTES] = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD + metric_threshold_hash[Constants::JOB_COMPLETION_TIME] = Constants::DEFAULT_MDM_JOB_COMPLETED_TIME_THRESHOLD_MINUTES cpuThreshold = ENV["AZMON_ALERT_CONTAINER_CPU_THRESHOLD"] if !cpuThreshold.nil? && !cpuThreshold.empty? @@ -433,6 +522,12 @@ def getContainerResourceUtilizationThresholds pvUsagePercentageThresholdFloat = (pvUsagePercentageThreshold.to_f).round(2) metric_threshold_hash[Constants::PV_USED_BYTES] = pvUsagePercentageThresholdFloat end + + jobCompletionTimeThreshold = ENV["AZMON_ALERT_JOB_COMPLETION_TIME_THRESHOLD"] + if !jobCompletionTimeThreshold.nil? && !jobCompletionTimeThreshold.empty? 
+ jobCompletionTimeThresholdInt = jobCompletionTimeThreshold.to_i + metric_threshold_hash[Constants::JOB_COMPLETION_TIME] = jobCompletionTimeThresholdInt + end rescue => errorStr @log.info "Error in getContainerResourceUtilizationThresholds: #{errorStr}" ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index cf41900dc..906019b95 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -53,6 +53,10 @@ class Constants MDM_CONTAINER_MEMORY_RSS_UTILIZATION_METRIC = "memoryRssExceededPercentage" MDM_CONTAINER_MEMORY_WORKING_SET_UTILIZATION_METRIC = "memoryWorkingSetExceededPercentage" MDM_PV_UTILIZATION_METRIC = "pvUsageExceededPercentage" + MDM_CONTAINER_CPU_THRESHOLD_VIOLATED_METRIC = "cpuThresholdViolated" + MDM_CONTAINER_MEMORY_RSS_THRESHOLD_VIOLATED_METRIC = "memoryRssThresholdViolated" + MDM_CONTAINER_MEMORY_WORKING_SET_THRESHOLD_VIOLATED_METRIC = "memoryWorkingSetThresholdViolated" + MDM_PV_THRESHOLD_VIOLATED_METRIC = "pvUsageThresholdViolated" MDM_NODE_CPU_USAGE_PERCENTAGE = "cpuUsagePercentage" MDM_NODE_MEMORY_RSS_PERCENTAGE = "memoryRssPercentage" MDM_NODE_MEMORY_WORKING_SET_PERCENTAGE = "memoryWorkingSetPercentage" @@ -65,21 +69,22 @@ class Constants MEMORY_WORKING_SET_BYTES = "memoryWorkingSetBytes" MEMORY_RSS_BYTES = "memoryRssBytes" PV_USED_BYTES = "pvUsedBytes" + JOB_COMPLETION_TIME = "completedJobTimeMinutes" DEFAULT_MDM_CPU_UTILIZATION_THRESHOLD = 95.0 DEFAULT_MDM_MEMORY_RSS_THRESHOLD = 95.0 DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD = 95.0 DEFAULT_MDM_PV_UTILIZATION_THRESHOLD = 60.0 + DEFAULT_MDM_JOB_COMPLETED_TIME_THRESHOLD_MINUTES = 360 CONTROLLER_KIND_JOB = "job" CONTAINER_TERMINATION_REASON_COMPLETED = "completed" CONTAINER_STATE_TERMINATED = "terminated" - STALE_JOB_TIME_IN_MINUTES = 360 TELEGRAF_DISK_METRICS = "container.azm.ms/disk" OMSAGENT_ZERO_FILL = "omsagent" KUBESYSTEM_NAMESPACE_ZERO_FILL = "kube-system" 
VOLUME_NAME_ZERO_FILL = "-" - PV_TYPES =["awsElasticBlockStore", "azureDisk", "azureFile", "cephfs", "cinder", "csi", "fc", "flexVolume", - "flocker", "gcePersistentDisk", "glusterfs", "hostPath", "iscsi", "local", "nfs", - "photonPersistentDisk", "portworxVolume", "quobyte", "rbd", "scaleIO", "storageos", "vsphereVolume"] + PV_TYPES = ["awsElasticBlockStore", "azureDisk", "azureFile", "cephfs", "cinder", "csi", "fc", "flexVolume", + "flocker", "gcePersistentDisk", "glusterfs", "hostPath", "iscsi", "local", "nfs", + "photonPersistentDisk", "portworxVolume", "quobyte", "rbd", "scaleIO", "storageos", "vsphereVolume"] #Telemetry constants CONTAINER_METRICS_HEART_BEAT_EVENT = "ContainerMetricsMdmHeartBeatEvent" diff --git a/source/plugins/ruby/filter_cadvisor2mdm.rb b/source/plugins/ruby/filter_cadvisor2mdm.rb index 8d7e729c8..659e3000c 100644 --- a/source/plugins/ruby/filter_cadvisor2mdm.rb +++ b/source/plugins/ruby/filter_cadvisor2mdm.rb @@ -9,6 +9,7 @@ module Fluent require_relative "CustomMetricsUtils" require_relative "kubelet_utils" require_relative "MdmMetricsGenerator" + require_relative "in_kube_nodes" class CAdvisor2MdmFilter < Filter Fluent::Plugin.register_filter("filter_cadvisor2mdm", self) @@ -23,6 +24,7 @@ class CAdvisor2MdmFilter < Filter @metrics_to_collect_hash = {} @@metric_threshold_hash = {} + @@controller_type = "" def initialize super @@ -63,6 +65,7 @@ def start @containerResourceDimensionHash = {} @pvUsageHash = {} @@metric_threshold_hash = MdmMetricsGenerator.getContainerResourceUtilizationThresholds + @NodeCache = Fluent::NodeStatsCache.new() end rescue => e @log.info "Error initializing plugin #{e}" @@ -161,19 +164,40 @@ def filter(tag, time, record) if counter_name == Constants::CPU_USAGE_NANO_CORES metric_name = Constants::CPU_USAGE_MILLI_CORES metric_value /= 1000000 #cadvisor record is in nanocores. 
Convert to mc - @log.info "Metric_value: #{metric_value} CPU Capacity #{@cpu_capacity}" - if @cpu_capacity != 0.0 - percentage_metric_value = (metric_value) * 100 / @cpu_capacity + if @@controller_type.downcase == "replicaset" + target_node_cpu_capacity_mc = @NodeCache.cpu.get_capacity(record["DataItems"][0]["Host"]) / 1000000 + else + target_node_cpu_capacity_mc = @cpu_capacity + end + @log.info "Metric_value: #{metric_value} CPU Capacity #{target_node_cpu_capacity_mc}" + if target_node_cpu_capacity_mc != 0.0 + percentage_metric_value = (metric_value) * 100 / target_node_cpu_capacity_mc end end if counter_name.start_with?("memory") metric_name = counter_name - if @memory_capacity != 0.0 - percentage_metric_value = metric_value * 100 / @memory_capacity + if @@controller_type.downcase == "replicaset" + target_node_mem_capacity = @NodeCache.mem.get_capacity(record["DataItems"][0]["Host"]) + else + target_node_mem_capacity = @memory_capacity + end + @log.info "Metric_value: #{metric_value} Memory Capacity #{target_node_mem_capacity}" + if target_node_mem_capacity != 0.0 + percentage_metric_value = metric_value * 100 / target_node_mem_capacity end + end + @log.info "percentage_metric_value for metric: #{metric_name} for instance: #{record["DataItems"][0]["Host"]} percentage: #{percentage_metric_value}" + + # do some sanity checking. Do we want this? 
+ if percentage_metric_value > 100.0 or percentage_metric_value < 0.0 + telemetryProperties = {} + telemetryProperties["Computer"] = record["DataItems"][0]["Host"] + telemetryProperties["MetricName"] = metric_name + telemetryProperties["MetricPercentageValue"] = percentage_metric_value + ApplicationInsightsUtility.sendCustomEvent("ErrorPercentageOutOfBounds", telemetryProperties) end - # return get_metric_records(record, metric_name, metric_value, percentage_metric_value) + return MdmMetricsGenerator.getNodeResourceMetricRecords(record, metric_name, metric_value, percentage_metric_value) elsif object_name == Constants::OBJECT_NAME_K8S_CONTAINER && @metrics_to_collect_hash.key?(counter_name.downcase) instanceName = record["DataItems"][0]["InstanceName"] @@ -279,8 +303,8 @@ def ensure_cpu_memory_capacity_set return end - controller_type = ENV["CONTROLLER_TYPE"] - if controller_type.downcase == "replicaset" + @@controller_type = ENV["CONTROLLER_TYPE"] + if @@controller_type.downcase == "replicaset" @log.info "ensure_cpu_memory_capacity_set @cpu_capacity #{@cpu_capacity} @memory_capacity #{@memory_capacity}" begin @@ -306,7 +330,7 @@ def ensure_cpu_memory_capacity_set @log.info "Error getting memory_capacity" end end - elsif controller_type.downcase == "daemonset" + elsif @@controller_type.downcase == "daemonset" capacity_from_kubelet = KubeletUtils.get_node_capacity # Error handling in case /metrics/cadvsior endpoint fails diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index c057f7c2c..99e804302 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -9,6 +9,7 @@ class Kube_nodeInventory_Input < Input @@MDMKubeNodeInventoryTag = "mdm.kubenodeinventory" @@configMapMountPath = "/etc/config/settings/log-data-collection-settings" @@promConfigMountPath = "/etc/config/settings/prometheus-data-collection-settings" + @@osmConfigMountPath = 
"/etc/config/osm-settings/osm-metric-collection-configuration" @@AzStackCloudFileName = "/etc/kubernetes/host/azurestackcloud.json" @@kubeperfTag = "oms.api.KubePerf" @@ -42,6 +43,8 @@ def initialize @nodeInventoryE2EProcessingLatencyMs = 0 @nodesAPIE2ELatencyMs = 0 require_relative "constants" + + @NodeCache = NodeStatsCache.new() end config_param :run_interval, :time, :default => 60 @@ -196,6 +199,15 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) end end + # Only CPU and Memory capacity for windows nodes get added to the cache (at end of file) + is_windows_node = false + if !item["status"].nil? && !item["status"]["nodeInfo"].nil? && !item["status"]["nodeInfo"]["operatingSystem"].nil? + operatingSystem = item["status"]["nodeInfo"]["operatingSystem"] + if (operatingSystem.is_a?(String) && operatingSystem.casecmp("windows") == 0) + is_windows_node = true + end + end + # node metrics records nodeMetricRecords = [] nodeMetricRecord = KubernetesApiClient.parseNodeLimitsFromNodeItem(item, "allocatable", "cpu", "cpuAllocatableNanoCores", batchTime) @@ -209,10 +221,18 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) nodeMetricRecord = KubernetesApiClient.parseNodeLimitsFromNodeItem(item, "capacity", "cpu", "cpuCapacityNanoCores", batchTime) if !nodeMetricRecord.nil? && !nodeMetricRecord.empty? nodeMetricRecords.push(nodeMetricRecord) + # add data to the cache so filter_cadvisor2mdm.rb can use it + if is_windows_node + @NodeCache.cpu.set_capacity(nodeMetricRecord["DataItems"][0]["Host"], nodeMetricRecord["DataItems"][0]["Collections"][0]["Value"]) + end end nodeMetricRecord = KubernetesApiClient.parseNodeLimitsFromNodeItem(item, "capacity", "memory", "memoryCapacityBytes", batchTime) if !nodeMetricRecord.nil? && !nodeMetricRecord.empty? 
nodeMetricRecords.push(nodeMetricRecord) + # add data to the cache so filter_cadvisor2mdm.rb can use it + if is_windows_node + @NodeCache.mem.set_capacity(nodeMetricRecord["DataItems"][0]["Host"], nodeMetricRecord["DataItems"][0]["Collections"][0]["Value"]) + end end nodeMetricRecords.each do |metricRecord| metricRecord["DataType"] = "LINUX_PERF_BLOB" @@ -301,6 +321,9 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) properties["rsPromMonPodsNs"] = @@rsPromMonitorPodsNamespaceLength properties["rsPromMonPodsLabelSelectorLength"] = @@rsPromMonitorPodsLabelSelectorLength properties["rsPromMonPodsFieldSelectorLength"] = @@rsPromMonitorPodsFieldSelectorLength + end + # telemetry about osm metric settings for replicaset + if (File.file?(@@osmConfigMountPath)) properties["osmNamespaceCount"] = @@osmNamespaceCount end ApplicationInsightsUtility.sendMetricTelemetry("NodeCoreCapacity", capacityInfo["cpu"], properties) @@ -492,4 +515,77 @@ def getNodeTelemetryProps(item) return properties end end # Kube_Node_Input + + + class NodeStatsCache + # inner class for caching implementation (CPU and memory caching is handled the exact same way, so logic to do so is moved to a private inner class) + # (to reduce code duplication) + class NodeCache + + @@RECORD_TIME_TO_LIVE = 60*20 # units are seconds, so clear the cache every 20 minutes. + + def initialize + @cacheHash = {} + @timeAdded = {} # records when an entry was last added + @lock = Mutex.new + @lastCacheClearTime = 0 + + @cacheHash.default = 0.0 + @lastCacheClearTime = DateTime.now.to_time.to_i + end + + def get_capacity(node_name) + @lock.synchronize do + retval = @cacheHash[node_name] + return retval + end + end + + def set_capacity(host, val) + # check here if the cache has not been cleaned in a while. 
This way calling code doesn't have to remember to clean the cache + current_time = DateTime.now.to_time.to_i + if current_time - @lastCacheClearTime > @@RECORD_TIME_TO_LIVE + clean_cache + @lastCacheClearTime = current_time + end + + @lock.synchronize do + @cacheHash[host] = val + @timeAdded[host] = current_time + end + end + + def clean_cache() + $log.info "in_kube_nodes::clean_cache: cleaning node cpu/mem cache" + cacheClearTime = DateTime.now.to_time.to_i + @lock.synchronize do + nodes_to_remove = [] # first make a list of nodes to remove, then remove them. This intermediate + # list is used so that we aren't modifying a hash while iterating through it. + @cacheHash.each do |key, val| + if cacheClearTime - @timeAdded[key] > @@RECORD_TIME_TO_LIVE + nodes_to_remove.append(key) + end + end + + nodes_to_remove.each do |node_name| + @cacheHash.delete(node_name) + @timeAdded.delete(node_name) + end + end + end + end # NodeCache + + + @@cpuCache = NodeCache.new + @@memCache = NodeCache.new + + def cpu() + return @@cpuCache + end + + def mem() + return @@memCache + end + end + end # module diff --git a/source/plugins/ruby/podinventory_to_mdm.rb b/source/plugins/ruby/podinventory_to_mdm.rb index 77370e284..d9cb71bd4 100644 --- a/source/plugins/ruby/podinventory_to_mdm.rb +++ b/source/plugins/ruby/podinventory_to_mdm.rb @@ -88,6 +88,7 @@ def initialize() @pod_count_by_phase = {} @pod_uids = {} @process_incoming_stream = CustomMetricsUtils.check_custom_metrics_availability + @metric_threshold_hash = MdmMetricsGenerator.getContainerResourceUtilizationThresholds @log.debug "After check_custom_metrics_availability process_incoming_stream #{@process_incoming_stream}" @log.debug { "Starting podinventory_to_mdm plugin" } end @@ -259,7 +260,7 @@ def process_record_for_terminated_job_metric(podControllerNameDimValue, podNames if !containerFinishedTime.nil? && !containerFinishedTime.empty? 
finishedTimeParsed = Time.parse(containerFinishedTime) # Check to see if job was completed 6 hours ago/STALE_JOB_TIME_IN_MINUTES - if ((Time.now - finishedTimeParsed) / 60) > Constants::STALE_JOB_TIME_IN_MINUTES + if ((Time.now - finishedTimeParsed) / 60) > @metric_threshold_hash[Constants::JOB_COMPLETION_TIME] MdmMetricsGenerator.generateStaleJobCountMetrics(podControllerNameDimValue, podNamespaceDimValue) end